PHP 实现的一个很好用的 HTML 解析器类，可用于采集数据

前端之家收集整理的这篇文章主要介绍了 PHP 实现的一个很好用的 HTML 解析器类，可用于采集数据。前端之家小编觉得挺不错的，现在分享给大家，也给大家做个参考。

<div class="codetitle">代码如下：</div>

<div class="codebody" id="code85400">
<?PHP
// Start from an empty libxml error buffer so stale diagnostics from earlier
// parsing do not linger.
libxml_clear_errors();

// Collect libxml parse errors internally instead of emitting PHP warnings
// (real-world HTML is rarely well-formed). The previous setting is kept in
// $oldSetting so it can be restored later.
$oldSetting = libxml_use_internal_errors(true);
/**
 * -+-----------------------------------
 * | PHP5 Framework - 2011
 * | Web Site: www.iblue.cc
 * | E-mail: mejinke@gmail.com
 * | Date: 2012-10-12
 * -+-----------------------------------
 *
 * HTML parser: loads an HTML document and exposes XPath based lookups.
 *
 * An instance is either a document root (empty node path) or a handle on one
 * node, identified by the absolute XPath that DOMNode::getNodePath() returned
 * for it. All handles created from one document share the same DOMXPath.
 *
 * @author jingke
 */
class XF_HtmlDom
{
    /** @var DOMXPath|null XPath evaluator of the loaded document, null until loaded. */
    private $_xpath = null;

    /** @var string Absolute XPath of the wrapped node; '' means "document root". */
    private $_nodePath = '';

    /**
     * @param DOMXPath|null $xpath    Evaluator of an already loaded document.
     * @param string        $nodePath Absolute XPath of the node to wrap, '' for the root.
     */
    public function __construct($xpath = null, $nodePath = '')
    {
        $this->_xpath = $xpath;
        $this->_nodePath = (string) $nodePath;
    }

    /**
     * Load an HTML document from a local path or an http(s) URL.
     *
     * @param string $url Local file path, or a URL starting with http:// or https://.
     *
     * @return bool True when the document was fetched and parsed, false otherwise.
     */
    public function loadHtml($url)
    {
        ini_set('user_agent','Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML,like Gecko) Version/4.0 Mobile Safari/530.17 –Nexus');

        // Only a real http(s) scheme goes through cURL; a local path that merely
        // contains the letters "http" must still be read from disk.
        if (preg_match('#^https?://#i', $url)) {
            $content = $this->_fetchRemote($url);
        } else {
            $content = file_get_contents($url);
        }

        if ($content === false || $content === '') {
            $this->_xpath = null;
            return false;
        }

        // Real-world markup is rarely valid: buffer libxml diagnostics while
        // parsing, then restore whatever error mode the caller had.
        $previous = libxml_use_internal_errors(true);
        $html = new DOMDocument();
        $loaded = $html->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if (!$loaded) {
            $this->_xpath = null;
            return false;
        }

        $this->_xpath = new DOMXPath($html);
        return true;
    }

    /**
     * Run an XPath query relative to this handle.
     *
     * On the document root the query matches anywhere in the document
     * ("//query"); on a node handle it matches from that node ("path/query").
     *
     * @param string          $query XPath fragment, e.g. "td[@id='baidu']/a".
     * @param int|string|null $index Numeric position of the single match to return;
     *                               any non-numeric value returns every match.
     *
     * @return XF_HtmlDom[]|XF_HtmlDom|null A list of handles when no numeric index is
     *                                      given, otherwise one handle, or null when
     *                                      nothing exists at that index.
     */
    public function find($query, $index = null)
    {
        $wantAll = !is_numeric($index);

        if ($this->_xpath === null) {
            return $wantAll ? array() : null;
        }

        // Build the prefix locally: storing it back on the instance would
        // corrupt the node path for every later find()/text() call.
        $prefix = ($this->_nodePath === '') ? '//' : $this->_nodePath . '/';
        $nodes = $this->_xpath->query($prefix . $query);

        // query() yields false for a malformed expression.
        if ($nodes === false) {
            return $wantAll ? array() : null;
        }

        if ($wantAll) {
            $tmp = array();
            foreach ($nodes as $node) {
                $tmp[] = new XF_HtmlDom($this->_xpath, $node->getNodePath());
            }
            return $tmp;
        }

        $node = $nodes->item((int) $index);
        if ($node === null) {
            return null;
        }

        return new XF_HtmlDom($this->_xpath, $node->getNodePath());
    }

    /**
     * Get the text content of the wrapped node.
     *
     * @return string|false The node's textContent, or false when this handle
     *                      does not point at an existing node.
     */
    public function text()
    {
        $node = $this->_currentNode();

        return ($node === null) ? false : $node->textContent;
    }

    /**
     * Get an attribute of the wrapped node.
     *
     * @param string $name Attribute name.
     *
     * @return string|false The attribute value, or false when this handle does
     *                      not point at an element node.
     */
    public function getAttribute($name)
    {
        $node = $this->_currentNode();

        return ($node instanceof DOMElement) ? $node->getAttribute($name) : false;
    }

    /**
     * Read a pseudo-property: "innertext" maps to text(), anything else is
     * looked up as an attribute.
     *
     * @param string $name "innertext" or an attribute name.
     *
     * @return string|false
     */
    public function get($name)
    {
        if ($name == 'innertext') {
            return $this->text();
        }

        return $this->getAttribute($name);
    }

    /**
     * Property-style access ($node->innertext, $node->href) delegating to get().
     *
     * @param string $name "innertext" or an attribute name.
     *
     * @return string|false
     */
    public function __get($name)
    {
        return $this->get($name);
    }

    /**
     * Download a remote document with cURL.
     *
     * @param string $url http(s) URL to fetch.
     *
     * @return string|false Response body, or false when the transfer failed.
     */
    private function _fetchRemote($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }

        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): the published listing fused the REFERER and USERAGENT
        // calls into one; sending the target URL as referer is a reconstruction.
        curl_setopt($ch, CURLOPT_REFERER, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0');
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

        $content = curl_exec($ch);
        curl_close($ch);

        return $content;
    }

    /**
     * Resolve the stored node path back to a live DOM node.
     *
     * @return DOMNode|null Null for the document root, an unloaded document,
     *                      or a path that no longer matches anything.
     */
    private function _currentNode()
    {
        if ($this->_nodePath === '' || $this->_xpath === null) {
            return null;
        }

        $nodes = $this->_xpath->query($this->_nodePath);
        if ($nodes === false) {
            return null;
        }

        return $nodes->item(0);
    }
}
// Demo: fetch a page and print the text of the first link inside <td id="baidu">.
$xp = new XF_HtmlDom();
$xp->loadHtml('http://www.aizhan.com/siteall/www.opendir.cn/');

// Call text() explicitly rather than reading an "innertext" property, and
// guard against the page not containing the expected cell.
$link = $xp->find("td[@id='baidu']/a", 0);
$rows = ($link instanceof XF_HtmlDom) ? $link->text() : false;
print_r($rows);

猜你在找的PHP相关文章