基于Snoopy的PHP近似完美获取网站编码的代码
前端之家收集整理的这篇文章主要介绍了基于 Snoopy 的 PHP 近似完美获取网站编码的代码,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
先要到网上下载 Snoopy.class.php。

调用方法,代码如下:
<?php
// Usage example: report the character encoding of a remote page.

// Third-party HTTP client, downloaded separately.
require 'lib/Snoopy.class.php';
// The WebCrawl class shown in the listing below.
require 'lib/WebCrawl.class.php';

$go = new WebCrawl('http://www.baidu.com');

// getCharset() returns a lower-case charset name, or false when the page
// cannot be fetched or no encoding can be determined (false echoes as "").
echo $go->getCharset();
?>
lib/WebCrawl.class.php 代码如下:
<?php

/**
 * Fetches a web page through Snoopy and reports its character encoding and
 * basic metadata (title, keywords, description, IP address).
 *
 * The page is fetched at most once per instance. After a successful fetch the
 * body is lower-cased and, when its detected charset is not UTF-8, converted
 * to UTF-8 in place on the Snoopy object.
 *
 * Requires the Snoopy class to be loaded before any method that fetches.
 */
class WebCrawl
{
    /** @var string URL given to the constructor. */
    private $url;

    /** @var Snoopy|null HTTP client holding the fetched response; null until the first fetch. */
    private $request;

    /** @var string|false|null Cached detection result; null means "not detected yet". */
    private $detectedCharset = null;

    /**
     * Charset names accepted when declared in a page's <meta> tag.
     *
     * NOTE(review): 'shift_jis', 'ks_c_5601-1987' and 'x-cp50227' were restored
     * from extraction-garbled text - confirm against the original listing.
     *
     * @var string[]
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720', 'ibm737',
        'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865',
        'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis',
        'ks_c_5601-1987', 'ibm1026', 'ibm01047', 'ibm01040', 'ibm01041', 'ibm01042',
        'ibm01043', 'ibm01044', 'ibm01045', 'ibm01046', 'ibm01048', 'ibm01049',
        'utf-16', 'unicodefffe', 'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253',
        'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258', 'johab',
        'macintosh', 'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic', 'x-mac-hebrew',
        'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp', 'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai',
        'x-mac-ce', 'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian', 'x-chinese-cns', 'x-cp20001',
        'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005', 'x-ia5', 'x-ia5-german',
        'x-ia5-swedish', 'x-ia5-norwegian', 'us-ascii', 'x-cp20261', 'x-cp20269', 'ibm273',
        'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290',
        'ibm420', 'ibm423', 'ibm424', 'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924', 'x-cp20936', 'x-cp20949',
        'cp1025', 'koi8-u', 'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4',
        'iso-8859-5', 'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-13',
        'iso-8859-15', 'x-europa', 'iso-8859-8-i', 'iso-2022-jp', 'csiso2022jp', 'iso-2022-kr',
        'x-cp50227', 'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as', 'x-iscii-or',
        'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa', 'utf-7', 'utf-32',
        'utf-32be',
    );

    /**
     * @param string $url Address of the page to inspect.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the page once and normalise its body.
     *
     * On the first call the page is fetched; on HTTP 200 the body is
     * lower-cased and converted to UTF-8 when its detected charset differs.
     * Later calls only report whether that first fetch returned HTTP 200.
     *
     * @param string $url Address to fetch (used on the first call only).
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        // Lower-casing the whole body lets later pattern checks compare against
        // lower-case literals; it also lower-cases the title and meta values.
        $this->request->results = strtolower($this->request->results);

        // getCharset() re-enters open(), which now takes the early-return path above.
        $charset = $this->getCharset();

        // Skip conversion when detection failed (false) or the page is already UTF-8.
        if (is_string($charset) && $charset != 'utf-8') {
            if ($charset == 'windows-1252') {
                $this->request->results = $this->uni_decode($this->request->results);
            } else {
                $this->request->results = $this->convertToUtf8($this->request->results, $charset);
            }
        }

        return true;
    }

    /**
     * Convert text to UTF-8, leaving it untouched when the source charset is
     * not one mbstring can handle.
     *
     * @param string $text    Text in the source encoding.
     * @param string $charset Source encoding name.
     * @return string Converted text, or the input unchanged on failure.
     */
    private function convertToUtf8($text, $charset)
    {
        try {
            $converted = mb_convert_encoding($text, 'UTF-8', $charset);
        } catch (\ValueError $e) {
            // PHP 8 throws for encoding names mbstring does not know.
            return $text;
        }

        // Older PHP versions return false (with a warning) instead of throwing.
        return is_string($converted) ? $converted : $text;
    }

    /**
     * Get the page's title, keywords, description and IP address.
     *
     * @return array Keys 'title', 'keywords', 'desc', 'ip'; each value is an
     *               empty string when it could not be determined.
     */
    public function getWebinfo()
    {
        $info = array(
            'title' => '',
            'keywords' => '',
            'desc' => '',
            'ip' => '',
        );

        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        if (preg_match('/<title>([^>]*)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        $metaTags = $this->extractMetaTags($html);
        if (isset($metaTags['keywords']['value'])) {
            $info['keywords'] = $metaTags['keywords']['value'];
        }
        if (isset($metaTags['description']['value'])) {
            $info['desc'] = $metaTags['description']['value'];
        }

        $info['ip'] = $this->resolveIp();

        return $info;
    }

    /**
     * Collect <meta name=... content=...> tags from the page.
     *
     * Tags written name-first are tried first; when none of them is
     * "keywords" or "description", tags written content-first are used instead.
     *
     * @param string $html Page body.
     * @return array Map of meta name => array('html' => escaped tag, 'value' => content).
     */
    private function extractMetaTags($html)
    {
        $nameFirst = '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';
        $contentFirst = '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si';

        preg_match_all($nameFirst, $html, $match);
        $names = isset($match[1]) ? $match[1] : array();
        $values = isset($match[2]) ? $match[2] : array();

        if (!in_array('keywords', $names, true) && !in_array('description', $names, true)) {
            preg_match_all($contentFirst, $html, $match);
            // Capture groups are swapped in the content-first pattern.
            $names = isset($match[2]) ? $match[2] : array();
            $values = isset($match[1]) ? $match[1] : array();
        }
        $originals = isset($match[0]) ? $match[0] : array();

        $metaTags = array();
        foreach ($names as $i => $name) {
            $metaTags[$name] = array(
                'html' => htmlentities($originals[$i]),
                'value' => $values[$i],
            );
        }

        return $metaTags;
    }

    /**
     * Resolve the IPv4 address of the host named in the URL.
     *
     * @return string Dotted IPv4 address, or '' when the host does not resolve.
     */
    private function resolveIp()
    {
        $host = parse_url($this->url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // URL without a scheme: fall back to stripping a literal "http://".
            $host = preg_replace('/http\:\/\//si', '', $this->url);
        }

        // gethostbyname() returns its argument unchanged on failure, so validate
        // the result instead of counting dots.
        $ip = gethostbyname($host);

        return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ? $ip : '';
    }

    /**
     * Guess from raw bytes whether text is UTF-8 or GB2312.
     *
     * Scans for the first non-ASCII lead byte: a lead byte with the top three
     * bits set followed by two high-bit bytes is reported as UTF-8; a lead byte
     * with the top two bits set followed by one high-bit byte as GB2312.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept for backward compatibility.
     * @return string 'utf-8', 'gb2312', or '' when no multi-byte sequence is found.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            if (ord($string[$i]) < 128) {
                continue;
            }

            if ((ord($string[$i]) & 224) == 224) {
                // Lead byte passed; check the second byte.
                if (++$i >= $length) {
                    break;
                }
                if ((ord($string[$i]) & 128) == 128) {
                    // Second byte passed; check the third byte.
                    if (++$i >= $length) {
                        break;
                    }
                    if ((ord($string[$i]) & 128) == 128) {
                        $encoding = 'UTF-8';
                        break;
                    }
                }
            }

            if ((ord($string[$i]) & 192) == 192) {
                // Lead byte passed; check the second byte.
                if (++$i >= $length) {
                    break;
                }
                if ((ord($string[$i]) & 128) == 128) {
                    $encoding = 'GB2312';
                    break;
                }
            }
        }

        return strtolower($encoding);
    }

    /**
     * Replace five-digit decimal character references (&#NNNNN;) with the
     * characters they denote.
     *
     * @param string $str  Text containing numeric character references.
     * @param string $code Target encoding; anything other than 'utf-8' is
     *                     produced by converting the UTF-8 result with iconv().
     * @return string|false Decoded text; false if iconv() fails.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $decoded = preg_replace_callback(
            '/&#(\d{5});/',
            function ($reference) {
                return html_entity_decode($reference[0], ENT_QUOTES, 'UTF-8');
            },
            $str
        );
        if ($decoded !== null) {
            $str = $decoded;
        }

        if ($code != 'utf-8') {
            $str = iconv('utf-8', $code, $str);
        }

        return $str;
    }

    /**
     * Get the page's character encoding.
     *
     * The encoding is detected once, on the body as fetched, and cached, so the
     * answer does not change after open() converts the body to UTF-8.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      cannot be fetched or no encoding can be determined.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        if ($this->detectedCharset === null) {
            $this->detectedCharset = $this->detectCharset();
        }

        return $this->detectedCharset;
    }

    /**
     * Detect the encoding of the fetched body.
     *
     * Tries, in order: the charset declared in a <meta> tag, the charset in the
     * response headers, then mb_detect_encoding().
     *
     * @return string|false Lower-case charset name, or false when nothing matches.
     */
    private function detectCharset()
    {
        $html = $this->request->results;

        // 1. Charset declared in the HTML itself.
        $declared = '';
        if (preg_match('/<meta.+?charset=[^\w]?([-\w]+)/i', $html, $found)) {
            $declared = strtolower($found[1]);
        }
        if ($declared != '' && in_array($declared, $this->charset_arr, true)) {
            // A declared gb2312 is only trusted when the byte scan agrees with it.
            if ($declared != 'gb2312' || $this->t($html, $declared) == $declared) {
                return $declared;
            }
        }

        // 2. Charset announced in the response headers.
        if (!empty($this->request->headers)) {
            $headerText = strtolower(implode('|||', $this->request->headers));
            if (preg_match('/charset=[^\w]?([-\w]+)/is', $headerText, $found)) {
                return $found[1];
            }
        }

        // 3. Statistical detection over a fixed candidate list.
        $encode_arr = array(
            'UTF-8', 'GB2312', 'GBK', 'BIG5', 'ASCII', 'EUC-JP',
            'Shift_JIS', 'CP936', 'ISO-8859-1', 'JIS', 'eucjp-win', 'sjis-win',
        );
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}