PHP 论坛采集程序:模拟登录、抓取页面的实现代码

前端之家收集整理的这篇文章主要介绍了 PHP 论坛采集程序(模拟登录、抓取页面)的实现代码。前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。

代码如下:

<?php
/**
 * Forum scraper: logs in to bbs.war3.cn with cURL, fetches the first page of
 * one sub-forum (fid=57) and prints the author, title and first post of every
 * thread linked from that page.
 *
 * Original author: 吴燕军 (Wu Yanjun), 2009-06-27.
 *
 * NOTE(review): this listing was recovered from a copy in which underscores,
 * asterisks and the HTML inside string literals had been stripped. Function
 * and constant names are restored to the real PHP identifiers; the HTML tags
 * inside the regular expressions and echo strings are a best-effort
 * reconstruction and should be verified against the live page markup.
 *
 * NOTE(review): the patterns are matched byte-wise (no `u` modifier) because
 * the forum's page encoding is not visible here; this file must be saved in
 * the same encoding as the forum pages for the Chinese literals to match.
 */

// Scraping many pages can take a long time; lift PHP's execution time limit.
// Individual HTTP requests are still bounded by the cURL timeouts in request().
set_time_limit(0);

// File in which cURL stores and reloads the session cookies between requests.
$cookiejar = '/tmp/cookie.tmp';

/* ------------------------------ functions ------------------------------ */

/**
 * Perform an HTTP request with a shared cookie jar and return the body.
 *
 * A POST is sent when $postfields is non-empty; otherwise a plain GET is
 * issued. The port is derived from the URL, so https URLs work as well.
 *
 * @param string       $url        Absolute URL to request.
 * @param string|array $postfields POST body (URL-encoded string or field array); '' for GET.
 * @param string       $cookiejar  Path of the cookie file to read and update.
 * @param string       $referer    Value passed to CURLOPT_REFERER ('' in all callers here).
 *
 * @return string|false Response body, or false when the transfer failed.
 */
function request($url, $postfields, $cookiejar, $referer)
{
    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_HEADER         => 0,
        CURLOPT_NOBODY         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_COOKIEJAR      => $cookiejar,
        CURLOPT_COOKIEFILE     => $cookiejar,
        CURLOPT_REFERER        => $referer,
        // Without these a stalled server would block forever, because the
        // script-level time limit is disabled above.
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 60,
    );

    // Only switch to POST when there is actually a body to send.
    if ($postfields !== '' && $postfields !== null && $postfields !== array()) {
        $options[CURLOPT_POST] = 1;
        $options[CURLOPT_POSTFIELDS] = $postfields;
    }

    curl_setopt_array($ch, $options);
    $code = curl_exec($ch);
    curl_close($ch);

    return $code;
}

/**
 * Extract the thread ids linked from a forum listing page.
 *
 * The pattern only accepts a viewthread link that follows an HTML comment
 * opener ("<!--") separated by nothing but '.', '|', CR or LF characters
 * (the character class is kept exactly as in the original script).
 *
 * @param string|false $code HTML of the listing page (false if the fetch failed).
 *
 * @return array List of thread ids as numeric strings; empty when nothing matched.
 */
function getThreadsList($code)
{
    if (!is_string($code) || $code === '') {
        return array();
    }

    $found = preg_match_all(
        '/<!--[.|\r|\n]*?<a href="viewthread\.php\?tid=(\d+)/',
        $code,
        $threads
    );

    return $found ? $threads[1] : array();
}

/**
 * Tell whether a fetched thread page shows a real thread.
 *
 * Returns false when the page contains the forum's "thread does not exist,
 * was deleted or is under review" message, and also when the fetch itself
 * failed (no usable HTML). The misspelt name is kept for compatibility.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return bool True when the thread exists.
 */
function isExits($code)
{
    if (!is_string($code) || $code === '') {
        return false;
    }

    // Accept both the ASCII and the full-width comma; the surrounding <p> tags
    // are not required so the check survives small markup changes.
    return preg_match('/指定的主题不存在或已被删除或正在被审核(?:,|,)请返回。/', $code) !== 1;
}

/**
 * Extract the thread title (contents of the first <h1> element).
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return string Inner HTML of the <h1>, or '' when no <h1> is present.
 */
function getTitle($code)
{
    // Lazy match up to the closing tag; a negated character class such as
    // [^<\/h1>] would stop at the first 'h', '1' or '/' inside the title.
    if (preg_match('/<h1[^>]*>(.*?)<\/h1>/s', (string) $code, $titleMatch) !== 1) {
        return '';
    }

    return $titleMatch[1];
}

/**
 * Extract the name of the thread's first poster.
 *
 * Matches the first user-profile link and takes the rest of that line with
 * all tags removed.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return string Author text, or '' when the link was not found.
 */
function getAuthor($code)
{
    // The parentheses of showMenu(this.id) must be escaped to match literally.
    $pattern = '/<a href="space\.php\?uid=\d+" target="_blank" id="userinfo_?\d+"'
        . ' onmouseover="showMenu\(this\.id\)">.+/';

    if (preg_match($pattern, (string) $code, $authorMatch) !== 1) {
        return '';
    }

    return strip_tags($authorMatch[0]);
}

/**
 * Extract the HTML of the first post in the thread.
 *
 * The match ends at the first closing </div>, so a post containing nested
 * <div> elements is cut short at the first inner </div>. Relative "images/"
 * paths are rewritten to absolute URLs on the forum host.
 *
 * @param string|false $code HTML of the thread page.
 *
 * @return string Post HTML, or '' when no post container was found.
 */
function getContents($code)
{
    // `.*?` with the `s` modifier matches across newlines without the deep
    // backtracking that a group such as (.|\r|\n)*? causes on long posts.
    $pattern = '/<div id="postmessage_?\d+" class="t_msgfont">.*?<\/div>/s';

    if (preg_match($pattern, (string) $code, $contentsMatch) !== 1) {
        return '';
    }

    // NOTE(review): every occurrence of "images/" is rewritten, as in the
    // original; an already-absolute URL containing "images/" would be altered too.
    return str_replace('images/', 'http://bbs.war3.cn/images/', $contentsMatch[0]);
}

/**
 * Print the thread title with all tags removed.
 *
 * @param string $title Title as returned by getTitle().
 *
 * @return void
 */
function printTitle($title)
{
    echo '<h2>帖子标题:</h2>', strip_tags($title), '<br /><br />';
}

/**
 * Print the thread author with all tags removed.
 *
 * @param string $author Author as returned by getAuthor().
 *
 * @return void
 */
function printAuthor($author)
{
    echo '<h2>帖子作者:</h2>', strip_tags($author), '<br /><br />';
}

/**
 * Print the first post's HTML as-is.
 *
 * NOTE(review): the scraped HTML is echoed unescaped, so any script embedded
 * in the remote post is passed through to whoever views this output.
 *
 * @param string $contents Post HTML as returned by getContents().
 *
 * @return void
 */
function printContents($contents)
{
    echo '<h2>作者发表的内容:</h2>', $contents, '<br />';
}

/**
 * Print the "thread does not exist" notice.
 *
 * @return void
 */
function printError()
{
    echo '<p>该帖子不存在!</p>';
}

/* ---------------------------- end of functions ---------------------------- */

/* Log in to the forum: the session cookie ends up in $cookiejar. */
$url = 'http://bbs.war3.cn/logging.php?action=login';
// Placeholder credentials from the original listing; replace before use.
$postfields = http_build_query(array(
    'loginfield'  => 'username',
    'username'    => '1nject10n',
    'password'    => 'xxxxxx',
    'questionid'  => 0,
    'cookietime'  => 315360000,
    'referer'     => 'http://bbs.war3.cn/',
    'loginsubmit' => '提交',
));
request($url, $postfields, $cookiejar, '');
unset($postfields, $url);
/* Log in: end */

/* Fetch the thread list (threads on the first page of the sub-forum). */
$url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
$code = request($url, '', $cookiejar, '');
$threadsList = getThreadsList($code);
/* Fetch the thread list: end */

// Number of threads printed so far; also drives the alternating row colour.
$rows = 0;

/* Fetch and print every thread found on the listing page. */
foreach ($threadsList as $list) {
    $url = "http://bbs.war3.cn/viewthread.php?tid=$list";

    // Fetch first, then test THIS page; testing before the fetch would judge
    // the previously downloaded page instead.
    $code = request($url, '', $cookiejar, '');

    if (isExits($code)) {
        $color = $rows % 2 === 0 ? '#00CCFF' : '#FFFF33';
        echo "<div style=\"background-color:$color;\">";
        echo '<h3>第', ($rows + 1), '贴:</h3>';

        $author = getAuthor($code);
        printAuthor($author);

        $title = getTitle($code);
        printTitle($title);

        $contents = getContents($code);
        printContents($contents);

        echo '</div>';
        $rows++;
    } else {
        printError();
    }

    // Separator printed after every thread, whether it existed or not.
    echo '-----------------------------------------------------------------------------------------<br /><br />';
}
/* Fetch and print: end */
?>

猜你在找的PHP相关文章