/*******************************************************************************
Version: 1.11 ($Rev: 175 $)
Website: http://www.115.co
Author: S.C. Chen <admin@185.cm>
Acknowledge: Jose Solorzano (http://www.115.co/)
Contributions by:QQ交流群:89097023
Yousuke Kumakura (Attribute filters)
Vadim Voituk (Negative indexes supports of "find" method)
Antcs (Constructor with automatically load contents either text or file/url)
Licensed under The MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/
// Node types; simple_html_dom_node::$nodetype holds one of these
// (text() switches on TEXT, COMMENT and UNKNOWN).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles; makeup() reads one per attribute from
// $_[HDOM_INFO_QUOTE] and emits '"', '\'' or nothing around the value.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys of the per-node info array simple_html_dom_node::$_.
// BEGIN: index of the node inside the document's nodes list.
define('HDOM_INFO_BEGIN', 0);
// END: exclusive upper index bound used by seek(); outertext() emits a
// closing tag only when it is set and non-zero.
define('HDOM_INFO_END', 1);
// QUOTE: list of HDOM_QUOTE_* values, one per attribute position.
define('HDOM_INFO_QUOTE', 2);
// SPACE: per attribute position, three strings emitted by makeup():
// [0] before the name, [1] before '=', [2] after '='.
define('HDOM_INFO_SPACE', 3);
// TEXT: stored text of text/comment/unknown nodes (passed through restore_noise()).
define('HDOM_INFO_TEXT', 4);
// INNER / OUTER: when set, returned verbatim by innertext() / outertext().
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
// ENDSPACE: string emitted by makeup() right before the closing '>'.
define('HDOM_INFO_ENDSPACE',7);
// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// Every argument is forwarded unchanged to file_get_contents(); whatever it
// returns is handed to simple_html_dom::load() with the second argument fixed
// to true. Returns the populated simple_html_dom object.
function file_get_html() {
    $document = new simple_html_dom;
    $forwarded = func_get_args();
    $contents = call_user_func_array('file_get_contents', $forwarded);
    $document->load($contents, true);
    return $document;
}
// get html dom from string
// Builds a simple_html_dom object from markup that is already in memory.
// $str:       the markup to load
// $lowercase: passed through as the second argument of simple_html_dom::load()
// Returns the populated simple_html_dom object.
function str_get_html($str, $lowercase=true) {
    $document = new simple_html_dom;
    $document->load($str, $lowercase);

    return $document;
}
// dump html dom tree
// Echoes one line per node: the tag name prefixed by $deep spaces, optionally
// followed by "(" [name]=>"value", ... ")", then recurses into $node->nodes
// with the depth increased by one.
// $node:      object exposing ->tag, ->attr and ->nodes
// $show_attr: when true, nodes with a non-empty ->attr also print attributes
// $deep:      number of leading spaces for this node's line
function dump_html_tree($node, $show_attr=true, $deep=0) {
    $indent = str_repeat(' ', $deep);
    echo $indent.$node->tag;

    $print_attributes = $show_attr && count($node->attr)>0;
    if ($print_attributes) {
        echo '(';
        foreach ($node->attr as $name=>$unused) {
            // the value is read through the node property, not from the attr array
            echo '['.$name.']=>"'.$node->$name.'", ';
        }
        echo ')';
    }
    echo "\n";

    $child_depth = $deep+1;
    foreach ($node->nodes as $child) {
        dump_html_tree($child, $show_attr, $child_depth);
    }
}
// get dom from file (deprecated)
// Deprecated alias kept for backward compatibility. It used to duplicate
// file_get_html() line for line; it now forwards all of its arguments to
// file_get_html() so the two entry points cannot drift apart.
// Returns whatever file_get_html() returns for the same arguments.
function file_get_dom() {
    $args = func_get_args();
    return call_user_func_array('file_get_html', $args);
}
// get dom from string (deprecated)
// Deprecated alias kept for backward compatibility. It used to duplicate
// str_get_html(); it now delegates to it so both share one implementation.
// $str:       the markup to load
// $lowercase: forwarded unchanged to str_get_html()
// Returns whatever str_get_html() returns for the same arguments.
function str_get_dom($str, $lowercase=true) {
    return str_get_html($str, $lowercase);
}
// simple html dom node
// -----------------------------------------------------------------------------
class simple_html_dom_node {
// One of the HDOM_TYPE_* constants; a fresh node starts out as a text node.
public $nodetype = HDOM_TYPE_TEXT;
// Tag name; outertext() treats the value 'root' specially.
public $tag = 'text';
// Attribute name => value. makeup() skips null/false values and prints a
// value of true as a bare attribute name.
public $attr = array();
// Nodes addressed by children(), first_child(), last_child() and the
// sibling accessors; seek() also consults it.
public $children = array();
// Nodes rendered by innertext(), outertext() and text().
// NOTE(review): presumably a superset of $children that also holds text and
// comment nodes - confirm against the parser.
public $nodes = array();
// Parent node, or null when none is set.
public $parent = null;
// Per-node info array keyed by the HDOM_INFO_* constants.
public $_ = array();
// Owning document object; this class uses its nodes list, root, callback
// and restore_noise().
private $dom = null;
// Binds the node to its owning document and appends it to that document's
// nodes list.
// $dom: the owning document object (must expose a public ->nodes array)
function __construct($dom) {
$this->dom = $dom;
$dom->nodes[] = $this;
}
// On destruction, drop the node's references by calling clear().
function __destruct() {
$this->clear();
}
// String conversion yields the same value as outertext().
function __toString() {
return $this->outertext();
}
// clean up memory due to php5 circular references memory leak...
// Sets the document, child-list and parent references to null. Methods that
// dereference $this->dom (innertext(), outertext(), makeup(), find(), ...)
// should not be called on the node afterwards.
function clear() {
$this->dom = null;
$this->nodes = null;
$this->parent = null;
$this->children = null;
}
// dump node's tree
// Echoes this node's subtree by delegating to dump_html_tree().
// $show_attr: forwarded to dump_html_tree(); true also prints attributes
function dump($show_attr=true) {
dump_html_tree($this, $show_attr);
}
// returns the parent of node
// Accessor for the $parent property; null when no parent is set.
function parent() {
return $this->parent;
}
// returns children of node
// $idx: -1 (the default, compared strictly) returns the whole children array;
//       any other value is used as a key into it.
// Returns the array, the child stored under $idx, or null when that key is
// not set.
function children($idx=-1) {
    if ($idx!==-1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }
    return $this->children;
}
// returns the first child of node
// Returns $this->children[0], or null when the children list is empty.
function first_child() {
    return (count($this->children)>0) ? $this->children[0] : null;
}
// returns the last child of node
// Returns the entry at position count-1 of the children list, or null when
// the list is empty.
function last_child() {
    $total = count($this->children);
    if ($total<=0) {
        return null;
    }
    return $this->children[$total-1];
}
// returns the next sibling of node
// Looks this node up (by identity) in its parent's children list and returns
// the entry that follows it. Returns null when there is no parent, when this
// node is the last child, or when it is not in the list at all.
function next_sibling() {
    if ($this->parent===null) {
        return null;
    }
    $total = count($this->parent->children);
    // stop on the position holding this very object, or at $total if absent
    for ($pos=0; $pos<$total; ++$pos) {
        if ($this===$this->parent->children[$pos]) {
            break;
        }
    }
    $following = $pos+1;
    if ($following>=$total) {
        return null;
    }
    return $this->parent->children[$following];
}
// returns the previous sibling of node
// Looks this node up (by identity) in its parent's children list and returns
// the entry that precedes it. Returns null when there is no parent, when this
// node is the first child, or when it is not in the list at all.
function prev_sibling() {
    if ($this->parent===null) return null;
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
        ++$idx;
    // Not found: the scan ran off the end. Without this guard the decrement
    // below would land on the parent's last child and return it as if it were
    // the previous sibling; next_sibling() already answers null in this case.
    if ($idx>=$count) return null;
    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}
// get dom node's inner html
// Resolution order:
//   1. an explicit override stored under HDOM_INFO_INNER, returned verbatim;
//   2. stored text (HDOM_INFO_TEXT), passed through the document's
//      restore_noise();
//   3. the concatenated outertext() of every entry in $this->nodes.
function innertext() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }
    return $html;
}
// get dom node's outer text (with tag)
// A node tagged 'root' renders as its innertext(). Otherwise the document
// callback (if any) is invoked with this node first, and then, in order:
// a stored HDOM_INFO_OUTER override, stored HDOM_INFO_TEXT (through
// restore_noise()), or begin tag + inner content + optional end tag.
function outertext() {
    if ($this->tag==='root') {
        return $this->innertext();
    }

    // the callback runs before any stored value is read
    if ($this->dom->callback!==null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // begin tag: rendered by the node registered at this node's BEGIN index
    $opening = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]];
    $html = $opening->makeup();

    // inner content: override if present, children otherwise
    if (isset($this->_[HDOM_INFO_INNER])) {
        $html .= $this->_[HDOM_INFO_INNER];
    }
    else {
        foreach ($this->nodes as $child) {
            $html .= $child->outertext();
        }
    }

    // end tag only when an END marker is set and non-zero
    $has_end_tag = isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0;
    if ($has_end_tag) {
        $html .= '</'.$this->tag.'>';
    }
    return $html;
}
// get dom node's plain text
// A stored HDOM_INFO_INNER override is returned verbatim. Otherwise text
// nodes yield their stored text (through restore_noise()); comment and
// unknown nodes, as well as script and style tags (case-insensitive), yield
// ''. Every other node yields the concatenated text() of $this->nodes.
function text() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type==HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type==HDOM_TYPE_COMMENT || $type==HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script')===0 || strcasecmp($this->tag, 'style')===0) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $child->text();
    }
    return $plain;
}
// Returns innertext() with every '<![CDATA[' (matched case-insensitively)
// and every ']]>' removed.
function xmltext() {
    $without_opener = str_ireplace('<![CDATA[', '', $this->innertext());
    return str_replace(']]>', '', $without_opener);
}
// build node's text with tag
// Renders the node's opening tag. Nodes carrying HDOM_INFO_TEXT (text,
// comment, unknown) return that text through restore_noise() instead.
// Spacing and quoting come from $_[HDOM_INFO_SPACE] and $_[HDOM_INFO_QUOTE],
// indexed by the attribute's position in $this->attr.
function makeup() {
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<'.$this->tag;
    $next = 0;
    foreach ($this->attr as $name=>$value) {
        // the position advances for removed attributes too, so the spacing
        // and quote tables stay aligned with the remaining ones
        $pos = $next++;
        if ($value===null || $value===false) {
            continue;
        }

        $html .= $this->_[HDOM_INFO_SPACE][$pos][0];

        // valueless attribute (nowrap, checked, selected, ...)
        if ($value===true) {
            $html .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$pos];
        if ($style==HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        }
        elseif ($style==HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        }
        else {
            $quote = '';
        }
        $html .= $name.$this->_[HDOM_INFO_SPACE][$pos][1].'='.$this->_[HDOM_INFO_SPACE][$pos][2].$quote.$value.$quote;
    }

    $html = $this->dom->restore_noise($html);
    return $html.$this->_[HDOM_INFO_ENDSPACE].'>';
}
// find elements by CSS selector
// Finds nodes below this one that match a selector.
// $selector: selector string; parse_selector() turns it into a list of
//            groups, each group being a list of levels handed to seek()
//            one after the other.
// $idx:      null returns every match as an array ordered by node index;
//            otherwise the zero-based position of the single match wanted
//            (negative values count from the end).
// Returns an array of nodes when $idx is null, else one node or null.
function find($selector, $idx=null) {
    $selectors = $this->parse_selector($selector);
    if (($count=count($selectors))===0) return array();
    $found_keys = array();
    // find each selector
    for ($c=0; $c<$count; ++$c) {
        // Depth of THIS group. This used to read count($selectors[0]), which
        // walked every group with the first group's depth: deeper groups were
        // cut short and shallower ones indexed past their last level.
        if (($level=count($selectors[$c]))===0) return array();
        if (!isset($this->_[HDOM_INFO_BEGIN])) return array();
        // start from this node; keys are indexes into $this->dom->nodes
        $head = array($this->_[HDOM_INFO_BEGIN]=>1);
        // handle descendant selectors, no recursive!
        for ($l=0; $l<$level; ++$l) {
            $ret = array();
            foreach($head as $k=>$v) {
                // key -1 stands for the document root
                $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                $n->seek($selectors[$c][$l], $ret);
            }
            // matches of this level become the starting points of the next
            $head = $ret;
        }
        // merge this group's matches; array keys make duplicates collapse
        foreach($head as $k=>$v) {
            if (!isset($found_keys[$k]))
                $found_keys[$k] = 1;
        }
    }
    // sort keys so results come back in node-index order
    ksort($found_keys);
    $found = array();
    foreach($found_keys as $k=>$v)
        $found[] = $this->dom->nodes[$k];
    // return nth-element or array
    if (is_null($idx)) return $found;
    else if ($idx<0) $idx = count($found) + $idx;
    return (isset($found[$idx])) ? $found[$idx] : null;
}
// seek for given conditions
// Collects into $ret the indexes (positions in $this->dom->nodes) of nodes
// that satisfy one parsed selector level, relative to this node.
// $selector: array(tag, key, val, exp, no_key) as consumed by list() below
// $ret:      by-reference map of node index => 1; matches are added to it
protected function seek($selector, &$ret) {
    list($tag, $key, $val, $exp, $no_key) = $selector;
    // xpath index: a numeric key selects the key-th direct child with that tag
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }
    // upper bound (exclusive) of the node indexes to scan
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        // No end marker on this node: climb to the nearest ancestor that has
        // one, subtracting 1 for every ancestor passed on the way.
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The chain can run out without finding a marker; the old code then
        // read a property of null. $end keeps the same value either way.
        if ($parent!==null)
            $end += $parent->_[HDOM_INFO_END];
    }
    for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];
        $pass = true;
        // bare '*' (no attribute condition) matches direct children only
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true))
                $ret[$i] = 1;
            continue;
        }
        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {$pass=false;}
        // compare key: $no_key inverts the test (attribute must be absent)
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) $pass=false;
            }
            else if (!isset($node->attr[$key])) $pass=false;
        }
        // compare value
        if ($pass && $key && $val && $val!=='*') {
            $check = $this->match($exp, $val, $node->attr[$key]);
            // handle multiple class: retry against each space-separated token
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach(explode(' ',$node->attr[$key]) as $k) {
                    $check = $this->match($exp, $val, $k);
                    if ($check) break;
                }
            }
            if (!$check) $pass = false;
        }
        if ($pass) $ret[$i] = 1;
        unset($node);
    }
}
// Tests an attribute value against a pattern using a selector operator.
// $exp:     '=', '!=', '^=', '$=' or '*='; anything else yields false
// $pattern: the text to compare with; for '*=' it is used as a regular
//           expression (as-is when it starts with '/', otherwise wrapped in
//           /.../i without quoting)
// $value:   the attribute value under test
// Returns a bool for '=' and '!=', and preg_match()'s result otherwise.
protected function match($exp, $pattern, $value) {
    if ($exp=='=') {
        return ($value===$pattern);
    }
    if ($exp=='!=') {
        return ($value!==$pattern);
    }
    if ($exp=='^=') {
        // prefix match on the literal pattern
        $regex = "/^".preg_quote($pattern,'/')."/";
        return preg_match($regex, $value);
    }
    if ($exp=='$=') {
        // suffix match on the literal pattern
        $regex = "/".preg_quote($pattern,'/')."$/";
        return preg_match($regex, $value);
    }
    if ($exp=='*=') {
        $regex = ($pattern[0]=='/') ? $pattern : "/".$pattern."/i";
        return preg_match($regex, $value);
    }
    return false;
}
protected function parse_selector($selector_string) {
&n