欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页  >  后端开发

解析HTML标签,并实现快速查找节点,获取节点信息

程序员文章站 2023-12-30 12:20:46
...
详细介绍和使用请点击源码出处。
  1. /**
  2. * html标签解析包
  3. *
  4. * @category TagParse
  5. * @package TagParse
  6. * @author kun
  7. * @copyright 2014 kun
  8. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  9. * @version 1.0
  10. * @link http://www.blogkun.com
  11. * @since 1.0
  12. */
  13. namespace TagParse;
  14. /**
  15. * TagDomRoot
  16. *
  17. * @category TagParse
  18. * @package TagParse
  19. * @author kun
  20. * @copyright 2014 kun
  21. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  22. * @version 1.0
  23. * @link http://www.blogkun.com
  24. * @since 1.0
  25. */
  26. class TagDomRoot
  27. {
  28. public $tag = 'root';
  29. public $plaintext;
  30. public $child = array();
  31. public $level = 0;
  32. public static $TagParseError = false;
  33. protected static $TagSet = array();
  34. protected static $FoundNode = array();
  35. public static $ErrorTag = array();
  /**
   * Reset the parser state that is shared through static properties.
   *
   * Clears the parse-error flag, the tag index, the pending search results
   * and the list of erroneous tags so that a new document can be parsed
   * without seeing leftovers from a previous one.
   *
   * @access public
   *
   * @return null
   */
  public function initProperty()
  {
      // These are static properties, so they have to be addressed through
      // self::. A bare `$TagSet = array();` would only create a local
      // variable that is discarded when the method returns.
      self::$TagParseError = false;
      self::$TagSet = array();
      self::$FoundNode = array();
      self::$ErrorTag = array();
  }
  51. /**
  52. * __construct: removes noise from $str, collects its plain text into $this->plaintext and builds one TagDomNode child per tag reported by parseTag().
  53. *
  54. * @param string $str The tag string to be parsed; it is handed to _removeNoise() by reference before anything else happens.
  55. * NOTE(review): the strpos() and preg_match_all() statements below lost part of their text when this listing was extracted (apparently everything from an opening angle bracket up to the next statement), so they are not valid PHP as shown - restore them from the original source.
  56. * @access public
  57. * Sets self::$TagParseError to true when $str is null after noise removal or when parseTag() returns false.
  58. * @return TagDomRoot
  59. */
  60. public function __construct($str)
  61. {
  62. $this->_removeNoise($str);
  63. if ($str === null) {
  64. self::$TagParseError = true;
  65. } else {
  66. $l = strpos($str, ' if ($l !== false) {
  67. $this->plaintext = substr($str, 0, $l);
  68. }
  69. $res = preg_match_all('~>(.*?) if ($res !== false && $res > 0) {
  70. $this->plaintext .= implode($matches[1]);
  71. }
  72. $r = strrpos($str, '>');
  73. if ($r !== false) {
  74. $this->plaintext .= substr($str, $r+1);
  75. }
  76. $tagCollect = array();
  77. $attrCollect = array();
  78. $innerContentCollect = array();
  79. if ($this->parseTag($str, $tagCollect, $attrCollect, $innerContentCollect) === false) {
  80. self::$TagParseError = true;
  81. }
  82. foreach ($tagCollect as $index => $tag) {
  83. $this->child[] = new TagDomNode($tag, $this, $attrCollect[$index], $innerContentCollect[$index], $this->level+1);
  84. }
  85. }
  86. }
  87. /**
  88. * parseTag: walks $str and, for every tag it accepts, appends the tag name, its raw attribute text and its inner content to the three collector arrays.
  89. * NOTE(review): several statements in this method (the strpos() calls that search for an opening tag, one if-condition and two while-conditions) lost text when this listing was extracted and are not valid PHP as shown - restore them from the original source before relying on the offset arithmetic.
  90. * @param string $str Markup to scan.
  91. * @param array &$tagCollect Receives one tag name per accepted tag.
  92. * @param array &$attrCollect Receives the text after the first space inside the tag, or an empty string when there is none; same indexes as $tagCollect.
  93. * @param array &$innerContentCollect Receives the inner content, or an empty string for tags listed in $selfClosingTags; same indexes as $tagCollect.
  94. * Stray closing tags are appended to self::$ErrorTag and cause the method to return false.
  95. * @access protected
  96. *
  97. * @return boolean False when an unmatched tag was encountered, true otherwise.
  98. */
  99. protected function parseTag($str, array &$tagCollect, array &$attrCollect, array &$innerContentCollect)
  100. {
  101. $selfClosingTags = array('img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1, 'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1);
  102. $end = -2;
  103. $close = 0;
  104. $error = false;
  105. $tag = '';
  106. while (true) {
  107. $l = strpos($str, ' if ($l === false) {//parse end
  108. break;
  109. }
  110. if (strpos(substr($str, $l, 2), '/') !== false) {//surplus closing tag,discard
  111. $error = true;
  112. $end = $l+strlen($tag);
  113. self::$ErrorTag[] = substr($str, $l, strpos($str, '>', $l)-$l+1);
  114. continue;
  115. }
  116. $r = strpos($str, '>', $l);
  117. $tag = substr($str, $l+1, $r-$l-1);
  118. if (!ctype_alpha($tag[0]) || strpos($tag, ' $end = $r + 1;
  119. continue;
  120. }
  121. $tag = preg_replace("~\n+~", ' ', $tag);
  122. $space = strpos($tag, ' ');
  123. if ($space !== false) {
  124. $attrCollect[] = substr($tag, $space+1);
  125. $tag = substr($tag, 0, $space);
  126. } else {
  127. $attrCollect[] = '';
  128. }
  129. $tagCollect[] = $tag;
  130. if (isset($selfClosingTags[$tag])) {
  131. $innerContentCollect[] = '';
  132. $end = $r-strlen($tag)-2;
  133. $close = $r+1;
  134. continue;
  135. }
  136. $countOpen = -1;
  137. $open = strpos($str, ' $close = strpos($str, ''.$tag.'>', $open);
  138. if ($close === false) {//surplus opening tag
  139. $innerContentCollect[] = substr($str, $r+1);
  140. $error = true;
  141. self::$ErrorTag[] = '';
  142. break;
  143. }
  144. $start = $open;
  145. while ($open $countOpen++;
  146. $open = strpos($str, ' }
  147. while ($countOpen > 0 && $close !== false) {
  148. $open = strpos($str, ' $close = strpos($str, ''.$tag.'>', $close+strlen($tag)+3);
  149. if ($close === false) {
  150. break;
  151. }
  152. $countOpen--;
  153. while ($open $open = strpos($str, ' $countOpen++;
  154. }
  155. }
  156. if ($close === false) {//opening and closing tags do not pair up
  157. $innerContentCollect[] = substr($str, $r+1);
  158. $error = true;
  159. break;
  160. }
  161. $end = $close;
  162. $r = strpos($str, '>', $start);
  163. $innerContentCollect[] = substr($str, $r+1, $end - $r - 1);
  164. }
  165. return !$error;
  166. }
  167. /**
  168. * _removeNoise: runs three preg_replace() calls over $str, each replacing its matches with an empty string and writing the result back into $str.
  169. * NOTE(review): the three patterns were damaged when this listing was extracted (two are empty, one begins with a quantifier); what they originally removed cannot be determined from here - restore them from the original source.
  170. * @param string &$str The tag string to be parsed; modified in place.
  171. * NOTE(review): preg_replace() returns null on a pattern error, which is presumably why the constructor checks for a null $str afterwards - confirm.
  172. * @access private
  173. *
  174. * @return null Nothing is returned; the result is delivered through $str.
  175. */
  176. private function _removeNoise(&$str)
  177. {
  178. $str = preg_replace('~~is', '', $str);
  179. $str = preg_replace('~~is', '', $str);
  180. $str = preg_replace('~*?>~is', '', $str);
  181. }
  /**
   * Split a selector string into parallel lists of tags and attribute filters.
   *
   * Every `tag` or `tag[name="value" ...]` token found in $selectors yields
   * one entry in $selectorsTag and one entry (possibly an empty array) at the
   * same index in $selectorsAttr.
   *
   * @param string $selectors      user's select condition.
   * @param array  &$selectorsTag  receives the tag names, in selector order.
   * @param array  &$selectorsAttr receives, per tag, a map of attribute name
   *                               to required value.
   *
   * @access protected
   *
   * @return null
   */
  protected function parseSelectors($selectors, array &$selectorsTag, array &$selectorsAttr)
  {
      preg_match_all('~([\w\d]+)(\[[\w\d -="._/]+\])?~', $selectors, $matches);
      $selectorsTag = $matches[1];
      foreach ($matches[2] as $key => $value) {
          $selectorsAttr[$key] = array();
          if ($value === '') {
              continue;
          }
          // The hyphen sits at the end of the value class on purpose: PCRE2
          // (PHP 7.3 and later) rejects `[\w\d-. _/]` with "invalid range in
          // character class" because the hyphen follows a class escape.
          // A separate result variable keeps the outer $matches intact.
          preg_match_all('~([\w\d-]+)="([\w\d. _/-]+)"~', $value, $pairs);
          foreach ($pairs[1] as $index => $attr) {
              $selectorsAttr[$key][$attr] = $pairs[2][$index];
          }
      }
  }
  /**
   * Find nodes matching a selector chain anywhere below this node.
   *
   * When called by users, $selectors is a string and the other two arguments
   * are left at their defaults; the parsed selector is then matched against
   * this node's children via seek() and the search is repeated on every
   * child. The recursive calls pass null as $selectors together with the
   * already parsed lists, and return nothing: only the outermost call
   * collects and clears self::$FoundNode.
   *
   * @param mixed $selectors     user's select condition, or null on recursion.
   * @param array $selectorsTag  tags.
   * @param array $selectorsAttr attributes.
   *
   * @access public
   *
   * @return array|null matched nodes for a user call, null for recursive calls.
   */
  public function find($selectors, $selectorsTag = array(), $selectorsAttr = array())
  {
      $isUserCall = ($selectors !== null);
      if ($isUserCall) {
          $this->parseSelectors($selectors, $selectorsTag, $selectorsAttr);
      }
      // A debugging `var_dump(...); exit();` used to sit here and ended the
      // script before any lookup could run.
      if (!empty($selectorsTag)) {
          $this->seek($selectorsTag, $selectorsAttr);
          foreach ($this->child as $node) {
              $node->find(null, $selectorsTag, $selectorsAttr);
          }
      }
      if ($isUserCall) {
          $res = self::$FoundNode;
          self::$FoundNode = array();
          return $res;
      }
      return null;
  }
  /**
   * Look up a selector through the global tag index.
   *
   * The selector is split at the first space found at or after the first `]`.
   * Without such a space the whole selector is delegated to
   * findOneGlobal(). Otherwise the leading part is resolved through
   * findOneGlobal() and the remainder is matched, via seek(), against the
   * nodes that lookup produced.
   *
   * @param string $selectors user's select condition.
   *
   * @access public
   *
   * @return array
   */
  public function findGlobal($selectors)
  {
      $bracketPos = strpos($selectors, ']');
      $splitAt = strpos($selectors, ' ', $bracketPos);
      if ($splitAt === false) {
          return $this->findOneGlobal($selectors);
      }

      $restTags = array();
      $restAttrs = array();
      // Passing false keeps the first-stage hits in self::$FoundNode.
      $this->findOneGlobal(substr($selectors, 0, $splitAt), false);
      $this->parseSelectors(substr($selectors, $splitAt + 1), $restTags, $restAttrs);

      if (!empty(self::$FoundNode) && !empty($restTags)) {
          $candidates = self::$FoundNode;
          self::$FoundNode = array();
          foreach ($candidates as $candidate) {
              $candidate->seek($restTags, $restAttrs);
          }
      }

      $found = self::$FoundNode;
      self::$FoundNode = array();
      return $found;
  }
  /**
   * Match the first selector of a chain against this node's direct children.
   *
   * A child matches when its tag equals $selectorsTag[0] and, for every
   * attribute required by $selectorsAttr[0], the child has that attribute and
   * the required value appears in it as a whole space-separated word. When
   * the chain has a single selector the child is appended to
   * self::$FoundNode; otherwise the rest of the chain is matched against
   * that child's own children.
   *
   * @param array $selectorsTag  tags.
   * @param array $selectorsAttr attributes.
   *
   * @access protected
   *
   * @return null
   */
  protected function seek($selectorsTag, $selectorsAttr)
  {
      if (empty($selectorsTag)) {
          return;
      }
      $wantedTag = $selectorsTag[0];
      // Tolerate a missing attribute entry instead of iterating over null.
      $wantedAttr = isset($selectorsAttr[0]) ? $selectorsAttr[0] : array();
      $isLast = (count($selectorsTag) === 1);

      foreach ($this->child as $node) {
          if ($node->tag !== $wantedTag) {
              continue;
          }
          $isFind = true;
          foreach ($wantedAttr as $attrName => $value) {
              // One pattern covers "equals", "first word", "last word" and
              // "middle word". preg_quote keeps characters such as "." in
              // the requested value from acting as regex wildcards.
              $pattern = '~(?:^| )' . preg_quote($value, '~') . '(?: |$)~';
              if (!isset($node->attr[$attrName])
                  || preg_match($pattern, $node->attr[$attrName]) !== 1
              ) {
                  $isFind = false;
                  break;
              }
          }
          if (!$isFind) {
              continue;
          }
          if ($isLast) {
              self::$FoundNode[] = $node;
          } else {
              $node->seek(
                  array_slice($selectorsTag, 1),
                  array_slice($selectorsAttr, 1)
              );
          }
      }
  }
  /**
   * Resolve a single `tag` or `tag[name="value" ...]` selector through the
   * global tag index self::$TagSet.
   *
   * self::$TagSet maps a tag name to a map of raw attribute strings, each
   * holding the nodes that carry exactly that attribute string. Every node
   * whose attribute string satisfies all requested name/value pairs is
   * appended to self::$FoundNode.
   *
   * @param string $selector user's select condition.
   * @param bool   $isReturn whether to return (and clear) the collected
   *                         nodes; when false they stay in self::$FoundNode
   *                         and nothing is returned.
   *
   * @access public
   *
   * @return array|null
   */
  public function findOneGlobal($selector, $isReturn = true)
  {
      $tag = null;
      $attr = array();
      if (preg_match('~([\w\d]+)(\[[\w\d -="._/]+\])?~', $selector, $matches) === 1) {
          $tag = $matches[1];
          if (isset($matches[2])) {
              // Hyphen last: PCRE2 (PHP 7.3 and later) rejects
              // `[\w\d-. _/]` as an invalid range.
              preg_match_all('~([\w\d-]+)="([\w\d. _/-]+)"~', $matches[2], $pairs);
              foreach ($pairs[1] as $key => $name) {
                  $attr[$name] = $pairs[2][$key];
              }
          }
      }

      if ($tag !== null && isset(self::$TagSet[$tag])) {
          foreach (self::$TagSet[$tag] as $attrValue => $nodeArray) {
              $isFind = true;
              foreach ($attr as $attrName => $value) {
                  // The value must be a whole space-separated word inside
                  // the quotes of this very attribute: [^"]* cannot run
                  // past the closing quote into a neighbouring attribute,
                  // the look-behind stops "class" from matching inside
                  // "data-class", and preg_quote keeps "." literal.
                  $pattern = '~(?<![\w-])' . preg_quote($attrName, '~')
                      . '="(?:[^"]* )?' . preg_quote($value, '~')
                      . '(?: [^"]*)?"~';
                  if (preg_match($pattern, (string) $attrValue) !== 1) {
                      $isFind = false;
                      break;
                  }
              }
              if ($isFind) {
                  foreach ($nodeArray as $node) {
                      self::$FoundNode[] = $node;
                  }
              }
          }
      }

      if ($isReturn) {
          $res = self::$FoundNode;
          self::$FoundNode = array();
          return $res;
      }
      return null;
  }
  359. }
  360. /**
  361. * TagDomNode
  362. *
  363. * @uses TagDomRoot
  364. *
  365. * @category TagParse
  366. * @package TagParse
  367. * @author kun
  368. * @copyright 2014 kun
  369. * @license http://www.php.net/license/3_01.txt PHP License 3.01
  370. * @version 1.0
  371. * @link http://www.blogkun.com
  372. * @since 1.0
  373. */
  374. class TagDomNode extends TagDomRoot
  375. {
  376. public $attr = array();
  377. public $parent = null;
  378. /**
  379. * __construct: stores tag, parent and level, parses the attribute string, collects plain text, builds child nodes from $innerContent and finally registers this node in self::$TagSet.
  380. * NOTE(review): the strpos() and preg_match_all() statements below lost part of their text when this listing was extracted and are not valid PHP as shown - restore them from the original source.
  381. * @param string $tag Tag name.
  382. * @param TagDomRoot $parent Parent node.
  383. * @param string $attr Raw attribute text; parsed by _parseAttr() and also used as the key under self::$TagSet[$tag].
  384. * @param string $innerContent Tag content; passed to parseTag() to discover the children.
  385. * @param int $level Node level; children are created with this level plus one.
  386. * Children are constructed before this node adds itself to self::$TagSet; a false result from parseTag() sets self::$TagParseError.
  387. * @access public
  388. *
  389. * @return TagDomNode
  390. */
  391. public function __construct($tag, $parent, $attr, $innerContent, $level)
  392. {
  393. $this->tag = $tag;
  394. $this->parent = $parent;
  395. $this->_parseAttr($attr);
  396. $this->level = $level;
  397. $l = strpos($innerContent, ' if ($l !== false) {
  398. $this->plaintext = substr($innerContent, 0, $l);
  399. }
  400. $res = preg_match_all('~>(.*?) if ($res !== false && $res > 0) {
  401. $this->plaintext .= implode($matches[1]);
  402. } else {
  403. $this->plaintext .= $innerContent;
  404. }
  405. $r = strrpos($innerContent, '>');
  406. if ($r !== false) {
  407. $this->plaintext .= substr($innerContent, $r+1);
  408. }
  409. $tagCollect = array();
  410. $attrCollect = array();
  411. $innerContentCollect = array();
  412. if ($this->parseTag($innerContent, $tagCollect, $attrCollect, $innerContentCollect) === false) {
  413. self::$TagParseError = true;
  414. }
  415. foreach ($tagCollect as $index => $tag) {
  416. $this->child[] = new TagDomNode($tag, $this, $attrCollect[$index], $innerContentCollect[$index], $this->level+1);
  417. }
  418. if (!isset(self::$TagSet[$this->tag])) {
  419. self::$TagSet[$this->tag] = array();
  420. }
  421. if (!isset(self::$TagSet[$this->tag][$attr])) {
  422. self::$TagSet[$this->tag][$attr] = array();
  423. }
  424. self::$TagSet[$this->tag][$attr][] = &$this;
  425. }
  /**
   * Parse a raw attribute string into $this->attr.
   *
   * Every `name="value"` pair found in $str is stored as
   * $this->attr[name] = value. Values may span several lines (the pattern
   * uses the "s" modifier).
   *
   * @param string $str attribute string.
   *
   * @access private
   *
   * @return null
   */
  private function _parseAttr($str)
  {
      // The named groups must match the keys read below; without the
      // <attrName> / <attrValue> names the pattern does not even compile.
      preg_match_all('~(?<attrName>[\w-]+)="(?<attrValue>.*?)"~s', $str, $matches);
      foreach ($matches['attrName'] as $key => $value) {
          $this->attr[$value] = $matches['attrValue'][$key];
      }
  }
  442. }
复制代码

上一篇:

下一篇: