本文实例讲述了PHP与python实现的线程池多线程爬虫功能。分享给大家供大家参考,具体如下:
多线程爬虫可以用于抓取内容,并且能够提升抓取性能。这里我们来看PHP与python线程池多线程爬虫的例子,代码如下:
PHP例子
PHP;">
url = $url;
}
/**
 * Fetch this task's URL using the cURL handle supplied by the worker.
 *
 * The handle comes from $this->worker->getConnection(). The value returned
 * by curl_exec(), the curl_getinfo() data and the curl_error() text are
 * handed to deal_data(), and the curl_exec() value is kept in $this->result.
 */
public function run()
{
    $handle = $this->worker->getConnection();
    curl_setopt($handle, CURLOPT_URL, $this->url);

    $response = curl_exec($handle);
    $transferInfo = curl_getinfo($handle);
    $errorText = curl_error($handle);

    $this->deal_data($this->url, $response, $transferInfo, $errorText);
    $this->result = $response;
}
/**
 * Report whether a fetch succeeded, labelled by a piece of the URL.
 *
 * The parameter list matches the four arguments run() passes. The previous
 * two-parameter form bound the page body to $error and never received $info,
 * so $info['http_code'] was undefined and every fetch was reported as failed.
 *
 * @param string     $url   URL that was fetched.
 * @param mixed      $page  Value returned by curl_exec(); not inspected here.
 * @param array|null $info  Result of curl_getinfo(); only 'http_code' is read.
 * @param string     $error Result of curl_error(); printed when the fetch failed.
 */
function deal_data($url, $page = null, $info = null, $error = '')
{
    // The label is the second dot-separated piece of the URL (e.g. "xxx" in
    // "http://www.xxx.com"); fall back to the whole URL when it has no dot.
    $parts = explode(".", $url);
    $id = isset($parts[1]) ? $parts[1] : $url;

    // Anything other than an explicit HTTP 200 counts as a failure.
    $ok = is_array($info) && isset($info['http_code']) && $info['http_code'] == 200;
    if ($ok) {
        $this->show_msg($id, "OK");
    } else {
        $this->show_msg($id, $error);
    }
}
/**
 * Print one status line: the id, a tab, the message, then a newline.
 *
 * @param mixed $id  Label printed first.
 * @param mixed $msg Text printed after the tab.
 */
function show_msg($id,$msg)
{
    echo "{$id}\t{$msg}\n";
}
/**
 * Return the value run() stored in $this->result.
 *
 * @return mixed Whatever curl_exec() returned for this task.
 */
public function getResult()
{
    return $this->result;
}
// URL this task fetches; read by run().
protected $url;
// Value curl_exec() returned in run(); exposed through getResult().
protected $result;
}
/**
 * Submit one Query task per configured URL to a pool of "Connect" workers,
 * then shut the pool down.
 *
 * Side effect: (re)assigns the global $check_urls map of URL => site name.
 */
function check_urls_multi_pthreads()
{
    global $check_urls; // the links to crawl
    $check_urls = array( 'http://xxx.com' => "xx网",);

    // Create a pool of 10 threads using the "Connect" worker class.
    $pool = new Pool(10,"Connect",array());
    foreach (array_keys($check_urls) as $url)
    {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();
python 多线程
def handle(sid):
    """Process the crawl job identified by *sid*.

    Placeholder: the crawler's data processing is meant to be implemented
    inside this function. It currently does nothing and returns None.
    """
    # The original trailing ``//`` comment was a syntax error in Python;
    # its text now lives in the docstring above.
    pass
class MyThread(Thread):
    """Thread that calls ``handle`` with the ``sid`` it was created with."""

    def __init__(self, sid):
        """Remember *sid* for use when the thread runs."""
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        """Thread entry point: hand this thread's ``sid`` to ``handle``."""
        # ``self`` was missing from the signature, so start() could not
        # invoke run() and ``self.sid`` was unreachable.
        handle(self.sid)
# Start ten worker threads with ids 1..10, then wait for all of them.
# range() replaces the Python 2-only xrange(), so this also runs on Python 3.
threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()

# Block until every started thread has finished.
for t in threads:
    t.join()
python 线程池爬虫:
]+)''',self.body(response)))
links = set()
for url in urls:
normalized = urllib.parse.urljoin(fetched_url,url)
parts = urllib.parse.urlparse(normalized)
if parts.scheme not in ('','http','https'):
continue
host,port = urllib.parse.splitport(parts.netloc)
if host and host.lower() not in ('localhost'):
continue
defragmented,frag = urllib.parse.urldefrag(parts.path)
links.add(defragmented)
return links
def body(self, response):
    """Return the payload of a raw HTTP *response* decoded as UTF-8.

    The payload is everything after the first blank line (CRLF CRLF) that
    ends the header section. A response without that separator, such as an
    empty or truncated one, yields ``''`` instead of raising IndexError.

    Raises:
        UnicodeDecodeError: if the payload is not valid UTF-8.
    """
    _head, separator, payload = response.partition(b'\r\n\r\n')
    if not separator:
        return ''
    return payload.decode('utf-8')
def _is_html(self,response):
head,body = response.split(b'\r\n\r\n',1)
headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:])
return headers.get('Content-Type','').startswith('text/html')
class ThreadPool:
    """A task queue shared by a fixed number of ``Fetcher`` objects."""

    def __init__(self, num_threads):
        """Create the queue and construct *num_threads* fetchers on it."""
        self.tasks = Queue()
        # Every Fetcher receives the same queue; the instances are not kept
        # (presumably each one starts itself -- confirm in Fetcher).
        for _unused in range(num_threads):
            Fetcher(self.tasks)

    def add_task(self, url):
        """Put *url* on the shared queue."""
        self.tasks.put(url)

    def wait_completion(self):
        """Call ``join()`` on the queue, i.e. wait for queued work to finish."""
        self.tasks.join()
if __name__ == '__main__':
    # Crawl from the root path with a four-worker pool, then report how many
    # URLs ended up in ``seen_urls`` and how long the run took.
    start = time.time()

    pool = ThreadPool(4)
    pool.add_task("/")
    pool.wait_completion()

    summary = '{} URLs fetched in {:.1f} seconds'
    print(summary.format(len(seen_urls), time.time() - start))
对更多PHP相关内容感兴趣的读者可查看本站专题:《PHP常见数据库操作技巧汇总》
希望本文所述对大家PHP程序设计有所帮助。