php与python实现的线程池多线程爬虫功能示例

前端之家收集整理的这篇文章主要介绍了PHP与Python实现的线程池多线程爬虫功能示例。前端之家小编觉得挺不错的，现在分享给大家，也给大家做个参考。

本文实例讲述了PHP与Python实现的线程池多线程爬虫功能，分享给大家供大家参考，具体如下：

多线程爬虫可以并发抓取内容，从而提升抓取性能。这里我们来看PHP与Python线程池多线程爬虫的例子，代码如下：

PHP例子

// NOTE: the head of this PHP listing was lost when the page was extracted.
// Missing: the opening PHP tag, the worker class that Pool is asked to
// instantiate by name ("Connect"), and the header of the task class together
// with the start of its constructor. The first surviving statement is kept
// verbatim on the next line; it is presumably the tail of `$this->url = $url;`
// inside the constructor of Query (the class submitted to the pool below) --
// TODO confirm against the original article.
        url = $url;
    }

    /**
     * Fetch $this->url through the handle supplied by the worker, report the
     * outcome via deal_data(), and keep the raw response for getResult().
     */
    public function run()
    {
        $ch = $this->worker->getConnection();
        curl_setopt($ch, CURLOPT_URL, $this->url);
        $page = curl_exec($ch);
        $info = curl_getinfo($ch);
        $error = curl_error($ch);
        $this->deal_data($this->url, $page, $info, $error);
        $this->result = $page;
    }

    /**
     * Print one status line for a fetched URL.
     *
     * The signature lists all four arguments that run() passes. The listing
     * as extracted declared only ($url, $error), which left $info undefined
     * and bound the page body to $error.
     *
     * @param string $url   URL that was requested
     * @param mixed  $page  return value of curl_exec() (unused here)
     * @param array  $info  return value of curl_getinfo(); 'http_code' is read
     * @param string $error return value of curl_error()
     */
    function deal_data($url, $page, $info, $error)
    {
        // The second dot-separated piece of the URL is used as the row label.
        $parts = explode(".", $url);
        $id = $parts[1];
        if ($info['http_code'] != 200) {
            $this->show_msg($id, $error);
        } else {
            $this->show_msg($id, "OK");
        }
    }

    /**
     * Echo "<id><TAB><msg>" followed by a newline.
     */
    function show_msg($id, $msg)
    {
        echo $id."\t$msg\n";
    }

    /**
     * Return the value stored by run() (the curl_exec() result).
     */
    public function getResult()
    {
        return $this->result;
    }

    protected $url;
    protected $result;
}

/**
 * Submit one Query per URL in $check_urls to a pool, then shut the pool down.
 */
function check_urls_multi_pthreads()
{
    global $check_urls;
    // URLs to fetch, mapped to a display name.
    $check_urls = array(
        'http://xxx.com' => "xx网",
    );
    // A single pool of 10 workers, each an instance of the "Connect" class.
    $pool = new Pool(10, "Connect", array());
    foreach ($check_urls as $url => $name) {
        $pool->submit(new Query($url));
    }
    $pool->shutdown();
}

check_urls_multi_pthreads();

python 多线程

# Thread is used below but no import survives in the extracted listing; the
# Thread.__init__/start/join/run usage matches threading.Thread.
from threading import Thread


def handle(sid):
    """Placeholder: the crawler's data processing for task `sid` goes here."""
    pass


class MyThread(Thread):
    """Thread that calls handle() with the sid it was constructed with."""

    def __init__(self, sid):
        Thread.__init__(self)
        self.sid = sid

    def run(self):
        # `self` was missing from the signature in the extracted listing,
        # which made Thread.start() fail as soon as it invoked run().
        handle(self.sid)


# Start ten threads (sid 1..10), then wait for all of them to finish.
threads = []
for i in range(1, 11):
    t = MyThread(i)
    threads.append(t)
    t.start()

for t in threads:
    t.join()

python 线程池爬虫:

]+)''',self.body(response))) links = set() for url in urls: normalized = urllib.parse.urljoin(fetched_url,url) parts = urllib.parse.urlparse(normalized) if parts.scheme not in ('','http','https'): continue host,port = urllib.parse.splitport(parts.netloc) if host and host.lower() not in ('localhost'): continue defragmented,frag = urllib.parse.urldefrag(parts.path) links.add(defragmented) return links def body(self,response): body = response.split(b'\r\n\r\n',1)[1] return body.decode('utf-8') def _is_html(self,response): head,body = response.split(b'\r\n\r\n',1) headers = dict(h.split(': ') for h in head.decode().split('\r\n')[1:]) return headers.get('Content-Type','').startswith('text/html') class ThreadPool: def __init__(self,num_threads): self.tasks = Queue() for _ in range(num_threads): Fetcher(self.tasks) def add_task(self,url): self.tasks.put(url) def wait_completion(self): self.tasks.join() if __name__ == '__main__': start = time.time() pool = ThreadPool(4) pool.add_task("/") pool.wait_completion() print('{} URLs fetched in {:.1f} seconds'.format(len(seen_urls),time.time() - start))

对更多PHP相关内容感兴趣的读者可查看本站专题：《PHP常见数据库操作技巧汇总》等（其余专题的标题在页面提取时丢失）。

希望本文所述对大家PHP程序设计有所帮助。

猜你在找的PHP相关文章