<?php
/**
* html/xml parser class
*
* this is a helper class that is used to parse html and xml. a unique feature of this parsing class
* is the fact that it includes support for innerhtml (which isn't easy to do).
*
* @author dennis pallett
* @copyright dennis pallett 2006
* @package html_parser
* @version 1.0
*/
// helper class
// to parse html/xml
/**
 * Event-driven HTML/XML parser built on PHP's expat-style xml extension.
 *
 * parse() returns a tree of tags. Every tag is an array with the keys
 * "name" (upper-cased by case folding), "attr" and "level", plus - when
 * applicable - "tagdata" (non-blank text found directly inside the tag),
 * "innerhtml" (re-serialised markup between the opening and closing tag)
 * and "children" (closed child tags, in document order).
 *
 * tagopen(), tagclosed() and tagdata() are callbacks for the xml extension
 * and have to stay public; they are not meant to be called directly.
 */
class html_parser {
    // handle returned by xml_parser_create() for the most recent parse()
    public $_parser;
    // most recently opened tag per nesting level (kept for backward compatibility)
    public $_tags = array();
    // re-serialised markup of the document parsed so far
    public $_html;
    // stack of currently open tags while parsing; the finished tree afterwards
    public $output = array();
    // raw return value of xml_parse()
    public $strxmldata;
    // current nesting depth (1 = root element)
    public $_level = 0;
    // kept for backward compatibility; mirrors $_html once parse() has run
    public $_outline;
    // unused, kept for backward compatibility
    public $_tagcount = array();
    // error details of the most recent parse()
    public $xml_error = false;
    public $xml_error_code;
    public $xml_error_string;
    public $xml_error_line_number;
    // byte offset into $_html where the content of each open level starts
    private $_inner_start = array();

    /**
     * Returns the markup re-serialised by the most recent parse() call.
     *
     * @return string|null null when parse() has not been called yet
     */
    public function get_html () {
        return $this->_html;
    }

    /**
     * Parses a complete document and returns the resulting tag tree.
     *
     * Named HTML entities are converted to numeric references first, because
     * the XML parser only knows the five predefined XML entities.
     *
     * @param string $strinputxml complete, well-formed markup
     * @return array|false list holding the root tag, or false on a parse error
     *                     (details are then available in the xml_error* properties)
     */
    public function parse($strinputxml) {
        // start from a clean slate so repeated calls do not leak state
        $this->output = array();
        $this->_tags = array();
        $this->_inner_start = array();
        $this->_level = 0;
        $this->_html = '';
        $this->_outline = '';
        $this->xml_error = false;
        $this->xml_error_code = null;
        $this->xml_error_string = null;
        $this->xml_error_line_number = null;

        // translate entities
        $strinputxml = $this->translate_entities((string) $strinputxml);

        $this->_parser = xml_parser_create();
        xml_parser_set_option($this->_parser, XML_OPTION_CASE_FOLDING, true);
        // array callables bind the handlers to this object without xml_set_object()
        xml_set_element_handler($this->_parser, array($this, 'tagopen'), array($this, 'tagclosed'));
        xml_set_character_data_handler($this->_parser, array($this, 'tagdata'));

        // the whole document is passed in one go, so this chunk is the final one;
        // without the flag a truncated document is not reported as an error
        $this->strxmldata = xml_parse($this->_parser, $strinputxml, true);
        $this->_outline = $this->_html;

        if (!$this->strxmldata) {
            $this->xml_error = true;
            $this->xml_error_code = xml_get_error_code($this->_parser);
            $this->xml_error_string = xml_error_string($this->xml_error_code);
            $this->xml_error_line_number = xml_get_current_line_number($this->_parser);
            return false;
        }
        return $this->output;
    }

    /**
     * Start-element callback: pushes the new tag onto the stack and appends
     * its opening markup to the serialised html.
     *
     * @param mixed  $parser parser handle supplied by the xml extension
     * @param string $name   element name as reported by the parser
     * @param array  $attr   attribute name => value
     */
    public function tagopen($parser, $name, $attr) {
        // increase level
        $this->_level++;
        // build tag
        $tag = array("name"=>$name, "attr"=>$attr, "level"=>$this->_level);
        // add tag
        array_push($this->output, $tag);
        // add tag to this level
        $this->_tags[$this->_level] = $tag;
        // add to html
        $this->_html .= $this->create_tag($name, $attr);
        // everything appended from here until the matching close is the innerhtml
        $this->_inner_start[$this->_level] = strlen($this->_html);
    }

    /**
     * Serialises an opening tag with lower-cased names and entity-encoded
     * attribute values; "br" and "input" are written self-closing.
     *
     * @param string $name element name
     * @param array  $attr attribute name => value
     * @return string e.g. <a href="x"> or <br />
     */
    public function create_tag ($name, $attr) {
        $name = strtolower($name);
        # begin with name
        $parts = array($name);
        # create attribute list
        foreach ($attr as $key=>$val) {
            $parts[] = strtolower($key) . '="' . htmlentities($val) . '"';
        }
        $tag = '<' . implode(' ', $parts);
        # finish tag
        if ($name === 'br' || $name === 'input') {
            $tag .= ' /';
        }
        return $tag . '>';
    }

    /**
     * Character-data callback: stores non-blank text on the innermost open
     * tag and appends the entity-encoded text to the serialised html.
     *
     * @param mixed  $parser  parser handle supplied by the xml extension
     * @param string $tagdata text chunk; one text node may arrive in several chunks
     */
    public function tagdata($parser, $tagdata) {
        $top = count($this->output) - 1;
        // compare against '' explicitly: a plain truthiness test would drop the text "0"
        if ($top >= 0 && trim($tagdata) !== '') {
            if (isset($this->output[$top]['tagdata'])) {
                $this->output[$top]['tagdata'] .= $tagdata;
            } else {
                $this->output[$top]['tagdata'] = $tagdata;
            }
        }
        $this->_html .= htmlentities($tagdata);
    }

    /**
     * End-element callback: records the innerhtml, appends the closing
     * markup and moves the finished tag into its parent's "children" list.
     *
     * @param mixed  $parser parser handle supplied by the xml extension
     * @param string $name   element name as reported by the parser
     */
    public function tagclosed($parser, $name) {
        $name = strtolower($name);
        $top = count($this->output) - 1;

        // "br" and "input" are serialised self-closing: no innerhtml, no end tag
        if ($name !== 'br' && $name !== 'input') {
            if ($top >= 0 && isset($this->_inner_start[$this->_level])) {
                // the cast covers PHP < 8, where substr() returns false for an empty tail
                $this->output[$top]['innerhtml'] = (string) substr($this->_html, $this->_inner_start[$this->_level]);
            }
            $this->_html .= '</' . $name . '>';
        }
        unset($this->_inner_start[$this->_level]);

        // fix tree: hand the finished tag to its parent; the root has no
        // parent and simply stays in $output as the result
        if ($top >= 1) {
            $child = array_pop($this->output);
            $this->output[$top - 1]['children'][] = $child;
        }

        // decrease level
        $this->_level--;
    }

    /**
     * Converts named HTML entities (&nbsp;, &eacute;, ...) into numeric
     * character references, or back again when $reverse is set. The entities
     * for & " < > are left untouched.
     *
     * @param string $xmlsource markup to convert
     * @param bool   $reverse   true to turn numeric references back into named entities
     * @return string converted markup
     */
    public function translate_entities($xmlsource, $reverse =false) {
        // built once per request and shared by all calls
        static $literal2numericentity;
        if (empty($literal2numericentity)) {
            $literal2numericentity = array();
            // flags and encoding are spelled out so the table does not depend
            // on PHP-version-specific defaults
            $transtbl = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT | ENT_HTML401, 'UTF-8');
            foreach ($transtbl as $char => $entity) {
                $char = (string) $char;
                if (strpos('&"<>', $char) !== false) continue;
                $literal2numericentity[$entity] = '&#' . $this->utf8_codepoint($char) . ';';
            }
        }
        if ($reverse) {
            return strtr($xmlsource, array_flip($literal2numericentity));
        }
        return strtr($xmlsource, $literal2numericentity);
    }

    /**
     * Returns the Unicode code point of a single UTF-8 encoded character.
     *
     * ord() alone only sees the first byte, which is wrong for every
     * character outside ASCII.
     *
     * @param string $char one UTF-8 encoded character
     * @return int code point; the plain byte value for single-byte input
     */
    private function utf8_codepoint($char) {
        $length = strlen($char);
        if ($length === 0) {
            return 0;
        }
        $lead = ord($char[0]);
        if ($lead < 0x80 || $length === 1) {
            return $lead;
        }
        // the lead byte announces how many continuation bytes follow
        if ($lead >= 0xF0) {
            $codepoint = $lead & 0x07;
            $trail = 3;
        } elseif ($lead >= 0xE0) {
            $codepoint = $lead & 0x0F;
            $trail = 2;
        } else {
            $codepoint = $lead & 0x1F;
            $trail = 1;
        }
        // each continuation byte contributes its low six bits
        for ($i = 1; $i <= $trail && $i < $length; $i++) {
            $codepoint = ($codepoint << 6) | (ord($char[$i]) & 0x3F);
        }
        return $codepoint;
    }
}
// to be used like this
// $html is normally supplied by the including script; fall back to a small
// sample document so the example also runs on its own
if (!isset($html)) {
    $html = '<div id="demo"><p>Hello <b>world</b></p></div>';
}
$parser = new html_parser;
$output = $parser->parse($html);
if ($output === false) {
    // parse() returns false on malformed input; show why instead of printing nothing
    print 'XML error ' . $parser->xml_error_code . ' (' . $parser->xml_error_string . ') on line ' . $parser->xml_error_line_number . "\n";
} else {
    print_r ($output);
}
?>
新闻热点
疑难解答