php解析html dom节点树
程序员文章站
2022-04-13 16:44:24
...
不得不感叹用DOM直接解析HTML DOM树的灵活和强大:常见的基本HTML元素就那么几种,再配合ID属性或者CLASS属性,就足以定位到需要的节点。
在解析HTML文件时,完全可以从正则表达式中脱离出来,毕竟HTML文件中存在大量相似的模式,而且用DOM写出来的代码功能看上去比较显而易见。当然,正则是非常强大的,应用的领域也更广。
代码如下:
<?php
/**
 * Extracts the sections of a dictionary entry page (basic meanings, example
 * sentences, synonyms, ...) by walking the HTML DOM instead of using regular
 * expressions.
 *
 * Each section of the page is a <div> whose class attribute identifies it
 * (for example "layout en"); the text of its <li> items is collected into
 * the matching property.
 *
 * NOTE(review): the published listing lost everything between "<" and ">"
 * (class header, property declarations, loop conditions, the "<pre>" tag).
 * Those parts are reconstructed here from the surviving code and the sample
 * output - confirm against the original source if it is available.
 */
class DomTree
{
    /** @var DOMDocument Parsed document. */
    private $doc;

    // One list of strings per page section; filled by the constructor.
    private $basic_meaning = array();
    private $en_or_ch = array();
    private $en_to_en = array();
    private $example = array();
    private $sentences = array();
    private $glossary = array();
    private $auth = array();
    private $use_in_wrong = array();
    private $approximate_words = array();
    private $baike_trans = array();

    /**
     * Maps the exact (trimmed) class attribute of a section <div> to the
     * private method that extracts it.
     */
    private static $section_handlers = array(
        "basic clearfix" => "getBasicMeans",
        "layout dual"    => "getEnOrCh",
        "layout en"      => "getEnToEn",
        "layout sort"    => "getExample",
        "layout patt"    => "normalSentence",
        "layout coll"    => "getGlossary",
        "layout auth"    => "getAuth",
        "layout comn"    => "useInWrong",
        "layout nfw"     => "getApproximateWords",
        "layout baike"   => "getBaike",
    );

    /**
     * Loads the document and extracts every recognised section.
     *
     * @param string $source An http(s)/ftp URL, a path to an existing file,
     *                       or a string of HTML markup.
     */
    public function __construct($source)
    {
        $this->doc = new DOMDocument();
        $this->loadSource($source);

        // DOMNodeList is traversable, so no index bookkeeping is needed.
        foreach ($this->doc->getElementsByTagName("div") as $div) {
            if (!$div->hasAttribute("class")) {
                continue;
            }
            $class = trim($div->getAttribute("class"));
            if (isset(self::$section_handlers[$class])) {
                $handler = self::$section_handlers[$class];
                $this->$handler($div);
            }
        }
    }

    /**
     * Fills $this->doc from a URL, a file or raw markup.
     *
     * The URL test comes first: a URL is also a plain string, so testing
     * is_string() before it (as the published listing did) made the remote
     * branch unreachable. A path that does not name an existing file cannot
     * be told apart from markup and is therefore parsed as markup.
     *
     * Terminates the script with die() on unsupported, empty or unreadable
     * input.
     */
    private function loadSource($source)
    {
        if (!is_string($source)) {
            die("不支持的资源类型");
        }
        if (trim($source) === '') {
            die("传入的字符串不能为空");
        }

        // Real-world HTML is rarely well formed; collect libxml's parse
        // errors internally instead of emitting one warning per problem.
        $previous = libxml_use_internal_errors(true);

        if (preg_match('#^(https?|ftp)://#i', $source)) {
            $html = file_get_contents($source);
            if ($html === false || trim($html) === '') {
                die("无法读取远程资源");
            }
            $this->doc->loadHTML($html);
        } elseif (is_file($source)) {
            $this->doc->loadHTMLFile($source);
        } else {
            $this->doc->loadHTML($source);
        }

        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    /**
     * Collects the text of <li> elements found below $container.
     *
     * @param DOMElement  $container Section element to search.
     * @param string|null $list_tag  "ol"/"ul" to visit each such list and
     *                               read its items, or null to read every
     *                               <li> below $container directly.
     * @param bool        $normalize Whether to pass each text through
     *                               strip_Empty().
     * @return array List of strings in document order.
     */
    private function collectItemText($container, $list_tag, $normalize)
    {
        $lists = ($list_tag === null)
            ? array($container)
            : $container->getElementsByTagName($list_tag);

        $texts = array();
        foreach ($lists as $list) {
            foreach ($list->getElementsByTagName("li") as $li) {
                $texts[] = $normalize
                    ? $this->strip_Empty($li->nodeValue)
                    : $li->nodeValue;
            }
        }
        return $texts;
    }

    // Basic meanings: the <strong> texts of every <li> that carries no
    // style attribute.
    private function getBasicMeans($basic_div)
    {
        foreach ($basic_div->getElementsByTagName("li") as $item) {
            if ($item->hasAttribute("style")) {
                continue;
            }
            foreach ($item->getElementsByTagName("strong") as $strong) {
                $this->basic_meaning[] = $strong->nodeValue;
            }
        }
    }

    // English-Chinese definitions: raw text of every <li>.
    private function getEnOrCh($div_elem)
    {
        $this->en_or_ch = array_merge(
            $this->en_or_ch,
            $this->collectItemText($div_elem, null, false)
        );
    }

    // English-English definitions: whitespace-normalised text of every <li>.
    private function getEnToEn($div_elem)
    {
        $this->en_to_en = array_merge(
            $this->en_to_en,
            $this->collectItemText($div_elem, null, true)
        );
    }

    // Collapses every run of two or more whitespace characters into a
    // single space; returns null for non-string input.
    private function strip_Empty($string)
    {
        if (is_string($string)) {
            return preg_replace('#\s{2,}#', ' ', $string);
        }
        return null;
    }

    // Example sentences: <li> items of each <ol>.
    private function getExample($div_elem)
    {
        $this->example = array_merge(
            $this->example,
            $this->collectItemText($div_elem, "ol", true)
        );
    }

    // Common sentence patterns: <li> items of each <ol>.
    private function normalSentence($div_elem)
    {
        $this->sentences = array_merge(
            $this->sentences,
            $this->collectItemText($div_elem, "ol", true)
        );
    }

    // Common collocations: <li> items of each <ul>.
    private function getGlossary($div_elem)
    {
        $this->glossary = array_merge(
            $this->glossary,
            $this->collectItemText($div_elem, "ul", true)
        );
    }

    // Quotations: <li> items of each <ul>.
    private function getAuth($div_elem)
    {
        $this->auth = array_merge(
            $this->auth,
            $this->collectItemText($div_elem, "ul", true)
        );
    }

    // Common usage mistakes: <li> items of each <ol>.
    private function useInWrong($div_elem)
    {
        $this->use_in_wrong = array_merge(
            $this->use_in_wrong,
            $this->collectItemText($div_elem, "ol", true)
        );
    }

    // Synonyms: text of every <a> inside the <li> items of each <ul>.
    private function getApproximateWords($div_elem)
    {
        foreach ($div_elem->getElementsByTagName("ul") as $ul) {
            foreach ($ul->getElementsByTagName("li") as $li) {
                foreach ($li->getElementsByTagName("a") as $a) {
                    $this->approximate_words[] = $a->nodeValue;
                }
            }
        }
    }

    // Encyclopedia summary: raw text of the <li> items of each <ul>.
    private function getBaike($div_elem)
    {
        $this->baike_trans = array_merge(
            $this->baike_trans,
            $this->collectItemText($div_elem, "ul", false)
        );
    }

    // Public accessors. Each returns the collected list of strings, or
    // null when the page had no such section.

    // Basic meanings.
    public function getBasicMeaning()
    {
        return empty($this->basic_meaning) ? null : $this->basic_meaning;
    }

    // English-Chinese definitions.
    public function getEnOrChMeaning()
    {
        return empty($this->en_or_ch) ? null : $this->en_or_ch;
    }

    // English-English definitions.
    public function getEnToEnMeaning()
    {
        return empty($this->en_to_en) ? null : $this->en_to_en;
    }

    // Example sentences.
    public function getExampleMeaning()
    {
        return empty($this->example) ? null : $this->example;
    }

    // Common sentence patterns.
    public function getNormalSentenceMeaning()
    {
        return empty($this->sentences) ? null : $this->sentences;
    }

    // Common collocations.
    public function getGlossaryMeaning()
    {
        return empty($this->glossary) ? null : $this->glossary;
    }

    // Quotations.
    public function getAuthMeaning()
    {
        return empty($this->auth) ? null : $this->auth;
    }

    // Common usage mistakes.
    public function getUseInWrongMeaning()
    {
        return empty($this->use_in_wrong) ? null : $this->use_in_wrong;
    }

    // Synonyms.
    public function getApproximateWordsMeaning()
    {
        return empty($this->approximate_words) ? null : $this->approximate_words;
    }

    // Encyclopedia summary.
    public function getBaikeMeaning()
    {
        return empty($this->baike_trans) ? null : $this->baike_trans;
    }

    /**
     * Returns every section in one associative array.
     *
     * @return array Section key => list of strings, or null for a section
     *               that was not found.
     */
    public function getAllMeaning()
    {
        $all_meaning = array();
        $all_meaning['basic_meaning'] = $this->getBasicMeaning();
        $all_meaning['en_or_ch'] = $this->getEnOrChMeaning();
        $all_meaning['en_to_en'] = $this->getEnToEnMeaning();
        $all_meaning['example'] = $this->getExampleMeaning();
        $all_meaning['normal_sentence'] = $this->getNormalSentenceMeaning();
        $all_meaning['glossary_sentence'] = $this->getGlossaryMeaning();
        $all_meaning['auth_sentence'] = $this->getAuthMeaning();
        $all_meaning['wrong_use'] = $this->getUseInWrongMeaning();
        $all_meaning['approximate_words'] = $this->getApproximateWordsMeaning();
        $all_meaning['baike_meaning'] = $this->getBaikeMeaning();
        return $all_meaning;
    }
}

// Demo: parse a saved dictionary page and dump every extracted section.
$dom = new DomTree("./com.html");
$trans = $dom->getAllMeaning();
echo "<pre>";
print_r($trans);
?>
结果如下:
Array ( [basic_meaning] => Array ( [0] => 单词;消息;话语;诺言 [1] => 用词语表达 ) [en_or_ch] => Array ( [0] => [C] 字,词 the smallest unit of spoken language which has meaning and can stand alone [1] => [C] (说的)话,话语,言语 anything said; remark or statement [2] => [S] 消息,信息; 谣言 piece of news; message; rumour [3] => [S] 口令,号令; 命令 spoken command or signal [4] => [S] 诺言,保证 a promise [5] => vt. 用词语表达; 选用 express (sth) in particular words; phrase sth ) [en_to_en] => Array ( [0] => a unit of language that native speakers can identify; "words are the blocks from which sentences are made" "he hardly said ten words all morning" [1] => a brief statement; "he didn't say a word about it" [2] => information about recent and important events; "they awaited news of the outcome" [3] => a verbal command for action; "when I give the word, charge!" [4] => an exchange of views on some topic; "we had a good discussion" "we had a word or two about it" [5] => a promise; "he gave his word" [6] => a word is a string of bits stored in computer memory; "large computers use words up to 64 bits long" [7] => the divine word of God; the second person in the Trinity (incarnate in Jesus) [8] => a secret word or phrase known only to a restricted group; "he forgot the password" [9] => the sacred writings of the Christian religions; "he went to carry the Word to the heathen" [10] => put into words or an expression; "He formulated his concerns to the board of trustees" ) [example] => Array ( [0] => Could we have a word before you go to the meeting? 你去开会之前,咱们能私下说句话吗? [1] => My friend sent word that he was well. 我朋友捎来口信说他很好。 ) [normal_sentence] => Array ( [0] => What does this word mean? 这个词是什么意思? [1] => I couldn't look up the spelling of the word, as I hadn't a dictionary at hand. 我没法查这个词的拼写,因为我手边没有词典。 [2] => Many English words are derived from Latin. 许多英文单词源于拉丁文。 [3] => All the words beside the central idea should be crossed out. 凡偏离中心思想的词语都应通通删掉。 [4] => The editor eliminated slang words from the essay. 
编辑将俚语从这篇文章中剔除。 [5] => These words can't be staled by repetition. 这些词语不会因为经常使用而变成陈词滥调。 [6] => He gave me his visiting card, with a few words in pencil. 他把他的名片给我,上面有几个铅笔字。 [7] => I don't believe a word of his story. 他说的这件事我一句话都不相信。 [8] => At the press conference, the reporters copied down every word spoken by the prime minister. 在新闻发布会上,记者们逐字记下了首相的讲话。 [9] => Tell me what happened in your words. 用你自己的话把发生的事告诉我。 [10] => Deeds are better than words when people are in need of help. 当别人需要帮助时,行动胜于语言。 [11] => I would like a word with you. 我想和你谈谈。 [12] => After a word with the colonel he went away . 他和上校简单谈过之后就走了。 [13] => There's been no word from her for weeks. 已经有好几个星期没有她的音信了。 [14] => Word came that I was needed at home. 有信儿来说家里需要我。 [15] => Word has come that meeting will be held on Tuesday. 通知已到,星期二开会。 [16] => Word is that the election will be held in June. 有消息说选举将在六月份举行。 [17] => Word is that he's left the country. 据说他已经离开这个国家了。 [18] => Word got round that he had resigned. 谣传他已辞职。 [19] => Stay hidden until I give the word. 我不下令就藏着别动。 [20] => Their word is law. 他们的命令必须服从。 [21] => He gave the word and they let him in. 他说出了口令,他们让他进去了。 [22] => The word now is “freedom”. 现在的口号是“*”。 [23] => I give you my word I'll go. 我向你保证,我会去的。 [24] => Stand by your word. 要守信用。 [25] => Hear The Word of God . 听宣讲《圣经》。 [26] => Be careful how you word your answer. 回答时要斟酌字句。 [27] => She worded the explanation well. 她的解释措辞得体。 [28] => The advice wasn't very tactfully worded. 这份通知措辞不太得体。 [29] => The suggestion might be worded more politely. 那项建议的措辞可以更婉转些。 [30] => This is a carefully worded contract. 
这是一份措辞严谨的合同。 ) [glossary_sentence] => Array ( [0] => address a few words 讲几句话 [1] => await word from sb 等待某人的消息 [2] => break one's words 食言 [3] => breathe a word 走漏消息 [4] => bring word 带来消息 [5] => choose a word 选择词 [6] => coin a word 杜撰一个词 [7] => cook up words 造新词 [8] => cross out a word 划掉一个词 [9] => cut out many words 删掉许多词 [10] => digest a word 消化一个词 [11] => doubt sb's words 怀疑某人的话 [12] => drink in all the words 吸收所有的词语 [13] => eat one's words 收回前言,认错,道歉 [14] => exchange angry words 发生口角 [15] => find words 找出言语(来表达) [16] => gain the good word of 博得…的赞扬 [17] => get word 得到消息 [18] => get a word 插嘴 [19] => give one's word 保证,允许 [20] => give the word 发出命令 [21] => have words together 争吵 [22] => have words with sb 与某人吵嘴 [23] => have a word with sb 同某人谈一谈 [24] => hunt up a word 查一个词 [25] => keep one's word 信守诺言 [26] => leave word 留言 [27] => leave out a word 省略一个词,丢掉一个词 [28] => look up a word (在词典里)查一个词 [29] => memorize words 记单词 [30] => play on words 玩弄字眼 [31] => pronounce a word 读一个词 [32] => put in words for 为…说几句话 [33] => put the words into sb's mouth 教某人怎么讲 [34] => quote a word 引用一个词 [35] => receive word of 收到…消息 [36] => regret one's words 为说过的话而后悔 [37] => respect one's word 遵守自己许下的诺言 [38] => say a word 说句话,进一步,走漏消息 [39] => say a few words 说几句话 [40] => say a good word for sb 为某人说好话 [41] => send sb a word 给某人捎个信儿 [42] => spell a word 拼写一个词 [43] => stress the word 重读那个词 [44] => take back one's word 收回自己的话 [45] => take sb's word for it 相信了某人的话 [46] => understand a word 理解某个词的意思 [47] => use words 用词 [48] => waste one's words 白费口舌 [49] => weigh words 斟酌词句 [50] => write a word 写一个词 [51] => advance word 事先传出的消息 [52] => angry words 气话 [53] => beautiful words 优美的言辞 [54] => big words 大话 [55] => borrowed word 外来词 [56] => broken words 断断续续的话 [57] => burning words 热情洋溢的话 [58] => choice words 精选的词句 [59] => colorful words 丰富的言辞 [60] => cross words 气话 [61] => empty words 空洞的话,无意义的话 [62] => everyday word 日常用语 [63] => farewell words 送别词 [64] => fighting words 容易引起争论的话,挑战性的话 [65] => 
foreign word 外来词 [66] => hard words 愤怒的话,激烈的话 [67] => heated word 激烈的言词,争吵时使用的话 [68] => high words 愤怒的话,激烈的话 [69] => hollow words 虚假的言语 [70] => honeyed words 甜言蜜语 [71] => hot words 激烈的言词,争吵时使用的话 [72] => household word 家喻户晓的词 [73] => irresponsible words 不负责任的话 [74] => key words 关键的字眼 [75] => last words 临终遗言 [76] => living words 现代语 [77] => meaningful words 意味深长的言语 [78] => meaningless words 无意义的话 [79] => misspelled word 拼错的词 [80] => native word 本国词,本地词 [81] => pleasant words 动听的语言 [82] => regional word 方言 [83] => scientific word 科学用语 [84] => semi-technical words 半科技词 [85] => sharp words 愤怒的话,激烈的话 [86] => simple word 简单的词 [87] => sincere words 真诚的话 [88] => small word 小词 [89] => spoken words 口头语 [90] => suggestive words 含蓄的话 [91] => sweet words 甜言蜜语 [92] => tearful parting words 伤感的离别之言 [93] => the latest word 最新消息,最后消息 [94] => uncleanly words 下流话 [95] => unfamiliar word 生词 [96] => unusual word 冷僻词 [97] => warm words 忿怒的话,激烈的话 [98] => written words 书面语 [99] => wrong words 错词 [100] => dictionary word 词典里出现的词 [101] => English words 英语单词 [102] => law word 法律用语 [103] => newspaper word 新闻用语 [104] => slang word 俚语 [105] => at a word 立即,立刻 [106] => in a word 简言之,总之 [107] => in one's own words 用自己的话说 [108] => in other words 换言之 [109] => upon my word 的确,真的 [110] => without a word 一声没吭 [111] => word in heavy type 黑体字 [112] => words in season 时宜的话 [113] => words of comfort 安慰的话 [114] => words of command 命令 [115] => words of complaint 怨言 [116] => the W- of God 圣经 [117] => words of praise 表扬的话 [118] => word of six letters 六个字母的词 [119] => words of thanks 感谢的话 [120] => word the explanation 解释 [121] => word accurately 准确地用言语表达 [122] => word crudely 简单地用词语〔语言〕表达 [123] => word felicitously 恰当地用言语表达 [124] => word intelligibly 清楚地用语言表达 [125] => word positively 明确地用词语表达 [126] => word vaguely 含糊地表达 [127] => word well 措辞得体 ) [auth_sentence] => Array ( [0] => Rome shall perishswrite that word In the blood that she has spilt. 出自:W. 
Cowper [1] => We have striven..to draw some word from her; but she..answers nothing. 出自:G. P. R. James [2] => To use his own words, he was in a cleft stick. 出自:H. Conway [3] => Actions speak louder than words. 出自:Proverb [4] => He words me, girls, he words me, that I should not Be noble to myself. 出自:Anthony Cleopatra,Shakespeare ) [wrong_use] => Array ( [0] => 我要跟他说句话。 误 I should like to have word with him. 正 I should like to have a word with him. [1] => 他们听到消息说足球比赛将在今晚电视实况转播。 误 They had a word that the football match would be televised live this evening. 正 They had word that the football match would be televised live this evening. 析 have word是“听到消息〔新闻〕”的意思,“说句话”是have a word。 [2] => 对逐词背课文,我感到厌倦。 误 I was tired of reciting the texts word after word. 正 I was tired of reciting the texts word for word. 析 “一字不变地,逐字(背诵或翻译)”是word for word,不是word after word。 [3] => 我说了什么错话吗? 误 Have I said any wrong words? 正 Have I said anything wrong? 析 误句语法上没有错,但不符合英语习惯。 [4] => 他不遵守诺言。 误 He broke his words. 正 He broke his word. 析 break one's word意为“不遵守诺言”, word在此短语中不用复数形式。 [5] => 我刚得知他到达的消息。 误 I have just received the word of his arrival. 正 I have just received word of his arrival. [6] => 有消息传来说我们的篮球队赢了这场比赛。 误 The word came that our basketball team had won the match. 正 Word came that our basketball team had won the match. 析 作“消息”“信息”解时, word前不加冠词。 [7] => 他大约是30年前开始当教师的,换句话说,他当教师已经有30年了。 误 He began to work as a teacher some thirty years ago, in another word, he has been a teacher for thirty years. 正 He began to work as a teacher some thirty years ago, in other words, he has been a teacher for thirty years. 析 in other words是固定短语,意为“换句话说”。 [8] => 他带信给我说怀特先生不久将动身去美国。 误 He carried me words that Mr.White would soon leave for America. 正 He carried me word that Mr. White would soon leave for America. 析 word作“消息”“信”解时,是不可数名词,其后不可加s。 [9] => 今晨我们争吵了。 误 We had a word this morning. 正 We had words this morning. [10] => 他们曾为鸡毛蒜皮的小事同邻居吵过嘴。 误 They had word with their neighbour over some trifles. 
正 They had words with their neighbours over some trifles. 析 表示“同某人发生口角”时,用have words with sb, words用复数形式。 [11] => 他说的大话使我们都感到惊讶。 误 His big word surprised us all. 正 His big words surprised us all. [12] => 我们绝不收回前言。 误 We should on no account eat our word. 正 We should on no account eat our words. 析 习语big words, eat one's words中, words词尾的s不可省。 ) [approximate_words] => Array ( [0] => account [1] => advice [2] => chat [3] => communication [4] => declaration [5] => edict [6] => expression [7] => message [8] => notice [9] => order [10] => password [11] => promise [12] => remark [13] => term [14] => couch [15] => explain [16] => express [17] => phrase [18] => put [19] => say [20] => write ) [baike_meaning] => Array ( [0] => word:Microsoft Word,属于办公软件,人们日常生活都有可能接触到他,对他并不陌生。 简介 wordMicrosoft Word是微软公司的一个文字处理器应用程序。它最初是由Richard Bro… ) )以上就介绍了用PHP解析HTML DOM节点树的方法,希望对PHP教程有兴趣的朋友有所帮助。