PHP 论坛采集程序:模拟登录、抓取页面的实现代码
前端之家收集整理的这篇文章主要介绍了 PHP 论坛采集程序(模拟登录、抓取页面)的实现代码,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
<div class="codetitle"><a style="CURSOR: pointer" data="78994" class="copybut" id="copybut78994" onclick="doCopy('code78994')">代码如下:</a></div>
<?php
/**
 * Forum scraper for bbs.war3.cn.
 *
 * Logs in with cURL (keeping the session in a cookie jar), fetches the first
 * page of forum 57, extracts the thread ids, then downloads every thread and
 * prints its author, title and first post as HTML.
 *
 * Original author: Wu Yanjun (吴燕军), 2009-06-27.
 *
 * NOTE(review): this listing was recovered from a web page whose extraction
 * stripped HTML tags, underscores and asterisks out of the code. The HTML
 * fragments inside the regular expressions and echo statements below are a
 * reconstruction; confirm them against the live forum markup before relying
 * on the output.
 */

// The crawl issues one HTTP request per thread, so lift the execution time limit.
set_time_limit(0);

// File in which cURL stores and re-reads the session cookies.
$cookie_jar = '/tmp/cookie.tmp';

/* ------------------------------ functions ------------------------------ */

/**
 * Performs an HTTP request and returns the response body.
 *
 * A POST is sent only when $postfields is non-empty; otherwise a plain GET is
 * issued (the original always forced POST, even for simple page fetches).
 * The port is taken from the URL instead of being pinned to 80, so URLs with
 * another scheme or port keep working.
 *
 * @param string $url        Address to request.
 * @param string $postfields URL-encoded POST body, or '' for a GET request.
 * @param string $cookie_jar Path of the file used to persist cookies.
 * @param string $referer    Referer header value, or '' to send none.
 *
 * @return string|false Response body, or false when curl_exec() fails.
 */
function request($url, $postfields, $cookie_jar, $referer)
{
    $ch = curl_init();

    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_HEADER         => 0,
        CURLOPT_NOBODY         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        // Same file for writing and reading so the login session is reused.
        CURLOPT_COOKIEJAR      => $cookie_jar,
        CURLOPT_COOKIEFILE     => $cookie_jar,
    );

    if ($postfields !== '' && $postfields !== null) {
        $options[CURLOPT_POST] = 1;
        $options[CURLOPT_POSTFIELDS] = $postfields;
    }

    if ($referer !== '' && $referer !== null) {
        $options[CURLOPT_REFERER] = $referer;
    }

    curl_setopt_array($ch, $options);
    $code = curl_exec($ch);
    curl_close($ch);

    return $code;
}

/**
 * Extracts the thread ids from a forum listing page.
 *
 * NOTE(review): the "<!--[.|\r|\n]*?" prefix is kept exactly as published.
 * Inside a character class "." and "|" are literals, so it only allows dots,
 * pipes and line breaks between the comment opener and the link; the author
 * may have intended "any characters". Verify against the real page source.
 *
 * @param string|false $code HTML of the listing page.
 *
 * @return array List of unique thread ids (numeric strings); empty when
 *               nothing matched or the page could not be fetched.
 */
function getThreadsList($code)
{
    if (!is_string($code) || $code === '') {
        return array();
    }

    preg_match_all('/<!--[.|\r|\n]*?<a href=\"viewthread\.php\?tid=(\d+)/', $code, $threads);

    // The same thread may be linked more than once; scrape each id only once.
    return array_values(array_unique($threads[1]));
}

/**
 * Tells whether a thread page shows a real thread.
 *
 * Returns false when the page contains the forum's "thread does not exist,
 * was deleted or is under review" notice. Only the part of the notice before
 * the comma is searched, because the extraction may have altered the
 * punctuation and surrounding tags of the original pattern.
 *
 * NOTE(review): the comparison is byte-wise, so this file must be saved in
 * the same character encoding as the forum pages.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return bool True when the thread exists, false when the notice is present.
 */
function isExits($code)
{
    return strpos((string) $code, '指定的主题不存在或已被删除或正在被审核') === false;
}

/**
 * Extracts the thread title (the content of the first <h1> element).
 *
 * The published pattern used the character class [^<\/h1>], which stops at
 * the first "h", "1", "/", "<" or ">" and therefore truncated most titles.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return string Inner HTML of the heading, or '' when no heading is found.
 */
function getTitle($code)
{
    if (preg_match('/<h1>(.*?)<\/h1>/s', (string) $code, $title_tmp)) {
        return $title_tmp[1];
    }

    return '';
}

/**
 * Extracts the name of the thread author.
 *
 * Matches the first user-profile link and the rest of its line, then strips
 * the markup. The parentheses of showMenu(this.id) are escaped here; left
 * unescaped (as published) they form a regex group and the literal text can
 * never match.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return string Author text without tags, or '' when no link is found.
 */
function getAuthor($code)
{
    $pattern = '/<a href=\"space\.php\?uid=\d+\" target=\"_blank\" id=\"userinfo\d+\"'
        . ' onmouseover=\"showMenu\(this\.id\)\">.+/';

    if (preg_match($pattern, (string) $code, $author_tmp)) {
        return strip_tags($author_tmp[0]);
    }

    return '';
}

/**
 * Extracts the HTML of the first post (the thread starter's message).
 *
 * Relative "images/" paths are rewritten to absolute forum URLs so that
 * pictures still load when the content is shown elsewhere.
 *
 * NOTE(review): the published id pattern reads "postmessage\d+", but an
 * underscore may have been lost in extraction; "_?" accepts both forms.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return string HTML of the message <div>, or '' when it is not found.
 */
function getContents($code)
{
    // ".*?" with the "s" modifier replaces "(.|\r|\n)*?": same matches,
    // without the per-character alternation that can exhaust PCRE's stack.
    $pattern = '/<div id=\"postmessage_?\d+\" class=\"t_msgfont\">.*?<\/div>/s';

    if (!preg_match($pattern, (string) $code, $contents_tmp)) {
        return '';
    }

    return str_replace('images/', 'http://bbs.war3.cn/images/', $contents_tmp[0]);
}

/**
 * Prints the thread title with all markup removed.
 *
 * @param string $title Title as returned by getTitle().
 *
 * @return void
 */
function printTitle($title)
{
    echo "<h2>", strip_tags($title), "</h2>\n";
}

/**
 * Prints the thread author with all markup removed.
 *
 * @param string $author Author as returned by getAuthor().
 *
 * @return void
 */
function printAuthor($author)
{
    echo "<strong>帖子作者:</strong><br />\n", strip_tags($author), "<br />\n";
}

/**
 * Prints the first post of the thread.
 *
 * NOTE(review): $contents is remote HTML echoed verbatim; anything the forum
 * page contains (including scripts) ends up in this script's output.
 *
 * @param string $contents HTML as returned by getContents().
 *
 * @return void
 */
function printContents($contents)
{
    echo "<strong>作者发表的内容:</strong><br />\n", $contents, "<br />\n";
}

/**
 * Prints the "thread does not exist" message.
 *
 * @return void
 */
function printError()
{
    echo "<p>该帖子不存在!</p>\n";
}

/* ---------------------------- end of functions ---------------------------- */

/* ------------------------------ forum login ------------------------------- */
// NOTE(review): credentials are hard-coded; move them to configuration before
// using this anywhere but a throw-away script.
$url = 'http://bbs.war3.cn/logging.php?action=login';
$postfields = 'loginfield=username&username=1nject10n&password=xxxxxx&questionid=0'
    . '&cookietime=315360000&referer=http://bbs.war3.cn/&loginsubmit=提交';
request($url, $postfields, $cookie_jar, '');
unset($postfields, $url);

/* ------------- thread list (threads shown on the first page) -------------- */
$url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
$code = request($url, '', $cookie_jar, '');
$threadsList = getThreadsList($code);

// Running count of threads printed so far.
$rows = 0;

/* -------------------------- scrape every thread --------------------------- */
foreach ($threadsList as $list) {
    $url = "http://bbs.war3.cn/viewthread.php?tid=$list";

    // Fetch first, then test: the existence check has to look at this
    // thread's page, not at the page left over from the previous iteration.
    $code = request($url, '', $cookie_jar, '');

    if ($code !== false && isExits($code)) {
        // Alternate the background colour so neighbouring threads stand apart.
        $color = $rows % 2 == 0 ? '#00CCFF' : '#FFFF33';

        echo "<div style=\"background-color:$color\">\n";
        echo "<h3>第", ($rows + 1), "贴:</h3>\n";

        $author = getAuthor($code);
        printAuthor($author);

        $title = getTitle($code);
        printTitle($title);

        $contents = getContents($code);
        printContents($contents);

        echo "</div>\n";
        $rows++;
    } else {
        printError();
    }

    echo "-----------------------------------------------------------------------------------------<br />\n";
}
/* ------------------------------ end of scrape ----------------------------- */