欢迎您访问程序员文章站，本站旨在为大家提供和分享程序员计算机编程知识！
您现在的位置是: 首页  >  php教程

Html Parser Class

程序员文章站 2022-05-03 19:29:48
...
This is an HTML parser class, used to parse HTML and XML. One of its distinguishing features is support for the innerHTML property.

<?php

/**
* HTML/XML Parser Class
*
* This is a helper class that is used to parse HTML and XML. A unique feature of this parsing class
* is the fact that it includes support for innerHTML (which isn't easy to do).
*
* @author Dennis Pallett
* @copyright Dennis Pallett 2006
* @package HTML_Parser
* @version 1.0
*/

// Helper Class
// To parse HTML/XML
class HTML_Parser
{
    // Handle returned by xml_parser_create() for the current/last parse() call.
    var $_parser;
    // Most recently opened tag at each nesting level: level => array(name, attr, level).
    var $_tags = array();
    // Normalized markup rebuilt from the parser events of the last parse() call.
    var $_html;
    // Stack of open elements while parsing; holds the finished tree afterwards.
    var $output = array();
    // Return value of xml_parse() for the last parse() call.
    var $strXmlData;
    // Current nesting depth (1 for the root element while it is open).
    var $_level = 0;
    // Copy of the rebuilt markup in which still-open tags are prefixed with their
    // level number; used to cut out each element's innerHTML.
    var $_outline;
    // Not referenced by any method of this class.
    var $_tagcount = array();
    // Error details, filled in when parse() fails.
    var $xml_error = false;
    var $xml_error_code;
    var $xml_error_string;
    var $xml_error_line_number;

    /**
     * Returns the markup rebuilt during the last parse() call.
     *
     * @return string|null null if parse() has not been called yet
     */
    function get_html() {
        return $this->_html;
    }

    /**
     * Parses a markup string into a nested array.
     *
     * Each element is an array with the keys "name" (upper-cased), "attr"
     * (upper-cased attribute names => values) and "level", plus optionally
     * "tagData" (non-blank text), "innerhtml" and "children".
     *
     * @param string $strInputXML well-formed XML/XHTML markup
     * @return array|false the tree (root element at index 0), or false on a
     *                     parse error; details are then in the xml_error* properties
     */
    function parse($strInputXML) {
        // Reset all per-document state so the object can be reused, including
        // after a failed parse that left the level counter part-way down.
        $this->output = array();
        $this->_tags = array();
        $this->_html = '';
        $this->_outline = '';
        $this->_level = 0;
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // Named HTML entities are not defined in plain XML; turn them into
        // numeric character references first.
        $strInputXML = $this->translate_entities($strInputXML);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);
        // Array callables bind the handlers to this object without needing
        // xml_set_object().
        xml_set_element_handler($this->_parser, array($this, 'tagOpen'), array($this, 'tagClosed'));
        xml_set_character_data_handler($this->_parser, array($this, 'tagData'));

        // The whole document is passed in one piece, so mark it as final;
        // otherwise a truncated document is not reported as an error.
        $this->strXmlData = xml_parse($this->_parser, $strInputXML, true);

        if (!$this->strXmlData) {
            $this->xml_error = true;
            $this->xml_error_code = xml_get_error_code($this->_parser);
            $this->xml_error_string = xml_error_string($this->xml_error_code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
            return false;
        }

        return $this->output;
    }

    /**
     * Start-element handler: pushes the element on the stack and records its
     * opening tag in the rebuilt markup and in the outline.
     *
     * @param mixed  $parser parser handle (unused)
     * @param string $name   element name
     * @param array  $attr   attribute name => value
     */
    function tagOpen($parser, $name, $attr) {
        $this->_level++;

        $newtag = $this->create_tag($name, $attr);
        $tag = array("name" => $name, "attr" => $attr, "level" => $this->_level);

        array_push($this->output, $tag);

        // Remember the open tag for this level so tagClosed() can rebuild it.
        $this->_tags[$this->_level] = $tag;

        $this->_html .= $newtag;
        // The outline copy carries the level number as a marker in front of the tag.
        $this->_outline .= $this->_level . $newtag;
    }

    /**
     * Builds the lower-cased opening tag for an element.
     *
     * @param string $name element name
     * @param array  $attr attribute name => value; values are passed through htmlentities()
     * @return string e.g. <a href="x"> or, for br/input, <br />
     */
    function create_tag($name, $attr) {
        $tag = '<' . strtolower($name) . ' ';

        foreach ($attr as $key => $val) {
            $tag .= strtolower($key) . '="' . htmlentities($val) . '" ';
        }

        // Drops the trailing space left by the name or the last attribute.
        $tag = trim($tag);

        if ($this->_is_self_closing($name)) {
            $tag .= ' /';
        }

        return $tag . '>';
    }

    /**
     * Character-data handler: stores non-blank text on the innermost open
     * element and appends the escaped text to the rebuilt markup.
     *
     * @param mixed  $parser  parser handle (unused)
     * @param string $tagData text chunk; one text node may arrive in several chunks
     */
    function tagData($parser, $tagData) {
        // Compare against '' explicitly: a bare truthiness test would also
        // discard text that is exactly "0".
        if (trim($tagData) !== '') {
            $last = count($this->output) - 1;
            if (isset($this->output[$last]['tagData'])) {
                $this->output[$last]['tagData'] .= $tagData;
            } else {
                $this->output[$last]['tagData'] = $tagData;
            }
        }

        $escaped = htmlentities($tagData);
        $this->_html .= $escaped;
        $this->_outline .= $escaped;
    }

    /**
     * End-element handler: writes the closing tag, extracts the element's
     * innerHTML from the outline and attaches the element to its parent.
     *
     * @param mixed  $parser parser handle (unused)
     * @param string $name   element name
     */
    function tagClosed($parser, $name) {
        $closeTag = '</' . strtolower($name) . '>';

        // br/input were already emitted in self-closing form by create_tag().
        if (!$this->_is_self_closing($name)) {
            $this->_outline .= $this->_level . $closeTag;
            $this->_html .= $closeTag;
        }

        // Rebuild the opening tag that belongs to this closing tag.
        $open = $this->_tags[$this->_level];
        $openTag = $this->create_tag($open['name'], $open['attr']);

        // innerHTML is whatever sits between the level-marked open and close tags.
        $regex = '%' . preg_quote($this->_level . $openTag, '%') . '(.*?)'
               . preg_quote($this->_level . $closeTag, '%') . '%is';
        preg_match($regex, $this->_outline, $matches);

        // Strip this element's level markers so they do not leak into the
        // innerHTML of its ancestors.
        $this->_outline = str_replace($this->_level . $openTag, $openTag, $this->_outline);
        $this->_outline = str_replace($this->_level . $closeTag, $closeTag, $this->_outline);

        $last = count($this->output) - 1;
        if (isset($matches[1])) {
            $this->output[$last]['innerhtml'] = $matches[1];
        }

        // Move the finished element under its parent. The root element has no
        // parent and simply stays at index 0.
        if ($last > 0) {
            $this->output[$last - 1]['children'][] = $this->output[$last];
            array_pop($this->output);
        }

        $this->_level--;
    }

    /**
     * Converts named HTML entities to numeric character references, or back.
     *
     * The entities for & " < > are left untouched in both directions.
     *
     * @param string $xmlSource text to convert
     * @param bool   $reverse   FALSE: named => numeric; TRUE: numeric => named
     * @return string
     */
    function translate_entities($xmlSource, $reverse = FALSE) {
        static $literal2NumericEntity;

        if (empty($literal2NumericEntity)) {
            $literal2NumericEntity = array();
            // Request the UTF-8 table explicitly so the character keys have a
            // known encoding regardless of the PHP version's default.
            $transTbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');

            foreach ($transTbl as $char => $entity) {
                if (strpos('&"<>', $char) !== FALSE) continue;
                $literal2NumericEntity[$entity] = '&#' . $this->_utf8_ord($char) . ';';
            }
        }

        if ($reverse) {
            return strtr($xmlSource, array_flip($literal2NumericEntity));
        }
        return strtr($xmlSource, $literal2NumericEntity);
    }

    /**
     * Tells whether an element is written in self-closing form (br, input).
     *
     * @param string $name element name, any case
     * @return bool
     */
    function _is_self_closing($name) {
        $lower = strtolower($name);
        return $lower === 'br' || $lower === 'input';
    }

    /**
     * Returns the Unicode code point of the first character of a UTF-8 string.
     *
     * ord() alone only looks at the first byte, which is wrong for any
     * character encoded as more than one byte.
     *
     * @param string $char non-empty UTF-8 string
     * @return int
     */
    function _utf8_ord($char) {
        $lead = ord($char[0]);
        if ($lead < 0x80) {
            return $lead;
        }

        if ($lead < 0xE0) {
            $code = $lead & 0x1F;
            $extra = 1;
        } elseif ($lead < 0xF0) {
            $code = $lead & 0x0F;
            $extra = 2;
        } else {
            $code = $lead & 0x07;
            $extra = 3;
        }

        // Each continuation byte contributes its low six bits.
        for ($i = 1; $i <= $extra; $i++) {
            $code = ($code << 6) | (ord($char[$i]) & 0x3F);
        }
        return $code;
    }
}

// Usage example.
// NOTE(review): $html is not defined anywhere in the visible source; it is
// presumably meant to hold the markup string to parse — confirm with the caller.
$parser = new HTML_Parser();
$output = $parser->parse($html);

// Dump the result of parse().
print_r($output);

?>