如有不明白的地方欢迎加QQ群14670545探讨
代码质量一般,采集到后面会比较卡,原因是我的正则写得不好,而且没有开多线程采集。下面举例说明:
目标页面是http://bbs.csdn.net/recommend_tech_topics,有图有真相:
看看采集后的效果:
下面是采集到的每个帖子楼主的内容以及一些简单的显示。图片之所以没有显示出来,因为csdn是用的一个专门的图片服务器,设置了防盗链,这个自己捣鼓下应该也可以绕过去。
下面我来分析下:
抓取的原理:先获取对应url页面的html内容,然后找出你要抓取的目标数据所在的html结构,看看这个结构是否有某种规律,再用正则去匹配这个规律,匹配到了以后就可以把数据抠出来。
先看看http://bbs.csdn.net/recommend_tech_topics这个页面的源代码:
再通过谷歌浏览器我们再次确认了,目标数据是位于<div class="list_1">...</div>中间的
可是怎么匹配呢?我百度了下,思路就是找某两个字符串中间的内容。考虑到结尾的</div>很多,我这里就偷懒,把<div class="list_1">...</div>这一对div和紧邻其后的一个div【从源代码上我们可以看到是:<div class="page_nav">】放到一起来匹配,所以我的正则就出来了:
Regex regex = new Regex("<div class=\"list_1\">([\\s\\S]*)</div>([\\s\\S]*)<div class=\"page_nav\">",RegexOptions.Compiled);
为了美化一下,我加了点css,具体代码如下:
页面:
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="testcollection.aspx.cs" Inherits="testcollection" %>
<%-- Shows one scraped page of topics inside #pageUrlInfo, with paging links and an
     "import to database" link that posts the listed rows to ajax/news_gather.aspx. --%>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title>测试获取网页信息</title>
    <script src="js/jquery-1.6.min.js" type="text/javascript"></script>
    <style type="text/css">
        a:link,a:visited {color: #335AA4;text-decoration: none;}
        a:hover,a:active {color: #CA0000;text-decoration: underline;}
        a.a_insertdb{ color:#f00; font-weight:bold;}
        a.a_insertdb:hover{ text-decoration:none;}
        #pageUrlInfo{ font-family:Arial,宋体; width:960px; margin:0 auto; font-size:14px;}
        #pageUrlInfo ul,li{ list-style:none;}
        #pageUrlInfo ul li{ line-height:23px;}
        .list_1 .time {float: right;color: #999;font-size: 12px;}
        .pageBar{width:960px; margin:0 auto; font-size:14px;}
        .pageBar ul li{ float:right; width:100px;}
    </style>
</head>
<body name="top">
    <%--<a name="top"></a>--%>
    <%-- Top navigation bar. The "previous page" link never goes below page 1. --%>
    <div class="pageBar">
        <ul>
            <li><a href="#bottom">回到底部</a></li>
            <li><a href="testcollection.aspx?page=<%=(int.Parse(pageindex)+1).ToString() %>">下一页</a></li>
            <li><a href="testcollection.aspx?page=<%=(int.Parse(pageindex)-(pageindex!="1" ? 1 : 0 )).ToString() %>">上一页</a></li>
            <li><a href="javascript:void(null);" class="a_insertdb">导入数据库</a></li>
        </ul>
    </div><br />
    <form id="form1" runat="server">
        <%-- The code-behind writes the matched HTML fragment into this container. --%>
        <div runat="server" id="pageUrlInfo">
        </div>
        <a name="bottom"></a>
        <div class="pageBar" style=" width:960px; margin:0 auto;">
            <ul>
                <li><a href="#top">回到顶部</a></li>
                <li><a href="testcollection.aspx?page=<%=(int.Parse(pageindex)+1).ToString() %>">下一页</a></li>
                <li><a href="testcollection.aspx?page=<%=(int.Parse(pageindex)-(pageindex!="1" ? 1 : 0 )).ToString() %>">上一页</a></li>
            </ul>
        </div>
    </form>
    <script type="text/javascript">
        $(document).ready(function() {
            // Prefix every listed link with the source domain and open it in a new window.
            $(".list_1 ul li a").each(function() {
                var _urlSuffix = $(this).attr("href");
                $(this).attr({ "href": "<%=HttpUrlDomain %>" + _urlSuffix,"target": "_blank" });
            });

            // "Import to database": post all listed rows to the gather endpoint.
            $(".a_insertdb").click(function() {
                var _json = GetSelectedCustomer();
                //alert(_json); return;
                $.ajax({
                    type: "POST",
                    url: "ajax/news_gather.aspx",
                    // The payload must be URL-encoded: a title containing '&', '+', '%' or '='
                    // (e.g. "C++") would otherwise truncate or alter the pjson form field.
                    data: "pjson=" + encodeURIComponent(_json) + "&pt=" + (new Date().getTime()),
                    success: function(msg) { alert("Data Saved: " + msg); }
                });
            });
        });

        // Serialises every list row as "{title$$href$$number$$time}" and joins the rows
        // with commas inside square brackets. "number" is the first run of digits left in
        // the row text after the title and the time have been removed.
        function GetSelectedCustomer() {
            var item = $(".list_1 ul li").map(function() {
                var _title = $(this).find("a");
                var _time = $(this).find("span").text();
                return '{' + _title.text() + '$$' + _title.attr("href") + '$$' + $(this).text().replace(_title.text(),"").replace(_time,"").match(/[0-9]+/) + '$$' + _time + '}';
            }).get().join(",");
            return '[' + item + ']';
        }
    </script>
</body>
</html>
后台文件:
using System;
using System.Globalization;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Code-behind for testcollection.aspx. Downloads one page of the remote topic list and
/// shows the matched HTML fragment inside the <c>pageUrlInfo</c> container.
/// </summary>
public partial class testcollection : System.Web.UI.Page
{
    // Built once and shared: constructing a RegexOptions.Compiled regex on every
    // request would pay the compilation cost each time the page loads.
    private static readonly Regex ListRegex = new Regex(
        "<div class=\"list_1\">([\\s\\S]*)</div>([\\s\\S]*)<div class=\"page_nav\">",
        RegexOptions.Compiled);

    /// <summary>
    /// Fetches the remote list page for the current page number and, when the list
    /// pattern matches, copies the whole match into <c>pageUrlInfo.InnerHtml</c>.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        // Request URL shape: /recommend_tech_topics?page=2
        WebRequest request = WebRequest.Create(HttpUrlDomain + "/recommend_tech_topics?page=" + pageindex);
        StringBuilder html = new StringBuilder();

        // using blocks release the response, stream and reader even when reading throws.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                html.AppendLine(line);
            }
        }

        Match match = ListRegex.Match(html.ToString());
        if (match.Success)
        {
            this.pageUrlInfo.InnerHtml = match.Groups[0].Value;
        }
    }

    /// <summary>
    /// Gets the current page number from the "page" query-string value.
    /// Returns "1" when the value is missing, is not an integer, or is not positive.
    /// </summary>
    public string pageindex
    {
        get
        {
            int page;
            // TryParse instead of Parse: the query string is user input, and a value
            // such as "?page=abc" must fall back to page 1 instead of throwing.
            bool parsed = int.TryParse(
                Request.QueryString["page"],
                NumberStyles.Integer,
                CultureInfo.InvariantCulture,
                out page);

            // Return the normalised number rather than the raw text, so that inputs
            // like "01" or " 2" do not leak into the URL or the "!= \"1\"" paging check.
            return parsed && page > 0 ? page.ToString(CultureInfo.InvariantCulture) : "1";
        }
    }

    /// <summary>
    /// Gets the scheme and host that are prefixed to the request path and to the listed links.
    /// </summary>
    public string HttpUrlDomain
    {
        get { return "http://bbs.csdn.net"; }
    }
}
最终效果:
news_gather.aspx:
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="news_gather.aspx.cs" Inherits="ajax_news_gather" %>
news_gather.aspx.cs:
using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
//using System.Threading;

/// <summary>
/// AJAX endpoint that receives the serialised topic rows in the "pjson" form field,
/// downloads each topic page, and stores one record per row through <c>csdnDal</c>.
/// </summary>
public partial class ajax_news_gather : System.Web.UI.Page
{
    public Test_Model.CSDN_BBS csdnModel = new Test_Model.CSDN_BBS();
    public Test_BUL.CSDN_BBS csdnDal = new Test_BUL.CSDN_BBS();

    // Regexes are built once and shared instead of being reconstructed
    // (and, for the compiled one, recompiled) for every row that is imported.
    private static readonly Regex ItemRegex = new Regex(@"{.+?}");
    private static readonly Regex PostRegex = new Regex(
        "<div class=\"data\">([\\s\\S]*)<strong class=\"fr\">([\\s\\S]*)楼主([\\s\\S]*)<div class=\"post_body\">([\\s\\S]*)<iframe id=\"tad2\"",
        RegexOptions.Compiled);

    private static readonly string[] FieldSeparator = new string[] { "$$" };

    /// <summary>
    /// Ends the response early when the request does not pass <see cref="CheckFormUrl"/>.
    /// </summary>
    protected override void OnInit(EventArgs e)
    {
        base.OnInit(e);
        if (!CheckFormUrl())
        {
            Response.End();
        }
    }

    /// <summary>
    /// Returns true only when the request has a referrer whose host equals the host of
    /// the requested URL.
    /// </summary>
    public bool CheckFormUrl()
    {
        if (HttpContext.Current.Request.UrlReferrer == null)
        {
            return false;
        }
        if ((HttpContext.Current.Request.UrlReferrer.Host) != (HttpContext.Current.Request.Url.Host))
        {
            return false;
        }
        return true;
    }

    /// <summary>
    /// Parses every "{title$$href$$number$$time}" item in the posted "pjson" value,
    /// fetches the topic content for its link, stores it, and finally writes "OK".
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        string jsondata = Request.Form["pjson"] ?? "";

        foreach (Match item in ItemRegex.Matches(jsondata))
        {
            // StringSplitOptions.None keeps the four fields aligned when one of them is
            // empty; dropping empty entries would shift the later fields to the left.
            string[] fields = item.Value.Replace("{", "").Replace("}", "")
                .Split(FieldSeparator, StringSplitOptions.None);
            if (fields.Length < 4)
            {
                // Malformed item: skip it rather than abort the whole batch with an
                // IndexOutOfRangeException after some rows were already stored.
                continue;
            }

            // The page script sends the text "null" when it finds no number in a row,
            // so a failed parse is expected input; the count then stays 0.
            int clickNum;
            int.TryParse(fields[2].Trim(), out clickNum);

            // Date is nullable on the model, so an unparsable time is stored as null.
            DateTime parsedDate;
            bool hasDate = DateTime.TryParse(fields[3], out parsedDate);

            csdnModel.Title = fields[0];
            csdnModel.Content = GetContent(fields[1]);
            csdnModel.Href = fields[1];
            csdnModel.Date = hasDate ? (DateTime?)parsedDate : null;
            csdnModel.ClickNum = clickNum;
            csdnDal.Add(csdnModel);
        }

        Response.Write("OK");
    }

    //private void GetUrlInfo()
    //{
    //    Thread[] downloadThread = new Thread[5]; // allocate the threads; fixes the thread count
    //    for (int i = 0; i < downloadThread.Length; i++)
    //    {
    //        downloadThread[i] = new Thread(new ThreadStart(DownLoad)); // create one thread
    //        downloadThread[i].Start(); // start the threads one by one
    //    }
    //}
    //private void DownLoad()
    //{
    //    // do some work
    //}

    /// <summary>
    /// Downloads <paramref name="url"/> and returns the part of the page matched by the
    /// post pattern, with the closing "&lt;iframe id="tad2"" marker and the "楼主" label removed.
    /// </summary>
    /// <param name="url">Absolute http or https address of the topic page.</param>
    /// <returns>
    /// The matched HTML, or null when the URL is not an absolute http/https address or
    /// the pattern does not match the downloaded page.
    /// </returns>
    private string GetContent(string url)
    {
        // The URL comes from the posted form data. WebRequest.Create also accepts other
        // schemes such as file://, so anything that is not http/https is rejected here.
        // NOTE(review): consider also allow-listing the expected host.
        Uri target;
        if (!Uri.TryCreate(url, UriKind.Absolute, out target)
            || (target.Scheme != Uri.UriSchemeHttp && target.Scheme != Uri.UriSchemeHttps))
        {
            return null;
        }

        WebRequest request = WebRequest.Create(target);
        StringBuilder html = new StringBuilder();

        // using blocks close the response even when reading fails; errors propagate
        // with their original type and stack trace instead of being re-wrapped.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                html.AppendLine(line);
            }
        }

        Match match = PostRegex.Match(html.ToString());
        if (!match.Success)
        {
            return null;
        }
        return match.Groups[0].Value.Replace("<iframe id=\"tad2\"", "").Replace("楼主", "");
    }
}
model:
using System;

namespace Test_Model
{
    /// <summary>
    /// CSDN_BBS: entity class holding one stored topic record.
    /// </summary>
    [Serializable]
    public partial class CSDN_BBS : IDisposable
    {
        #region IDisposable implementation

        /// <summary>
        /// Finalizer; forwards to the virtual Dispose method with <c>false</c>.
        /// </summary>
        ~CSDN_BBS()
        {
            Dispose(false);
        }

        /// <summary>
        /// Calls the virtual Dispose method with <c>true</c> and suppresses finalization.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Virtual Dispose method. This base implementation releases nothing in either mode.
        /// </summary>
        /// <param name="disposing">True when called from <see cref="Dispose()"/>, false from the finalizer.</param>
        protected virtual void Dispose(bool disposing)
        {
            if (disposing)
            {
                // Nothing to release here.
            }
        }

        #endregion

        public CSDN_BBS()
        {
        }

        #region Model

        private int _id;
        private string _title;
        private string _href;
        private string _content;
        private DateTime? _date;
        private int _clicknum;

        /// <summary>
        /// Record identifier (presumably the storage key; confirm against the data layer).
        /// </summary>
        public int ID
        {
            get { return _id; }
            set { _id = value; }
        }

        /// <summary>
        /// Topic title.
        /// </summary>
        public string Title
        {
            get { return _title; }
            set { _title = value; }
        }

        /// <summary>
        /// Link to the topic.
        /// </summary>
        public string Href
        {
            get { return _href; }
            set { _href = value; }
        }

        /// <summary>
        /// Content text or HTML stored for the topic.
        /// </summary>
        public string Content
        {
            get { return _content; }
            set { _content = value; }
        }

        /// <summary>
        /// Date of the topic; null when not set.
        /// </summary>
        public DateTime? Date
        {
            get { return _date; }
            set { _date = value; }
        }

        /// <summary>
        /// Number stored as the topic's click count.
        /// </summary>
        public int ClickNum
        {
            get { return _clicknum; }
            set { _clicknum = value; }
        }

        #endregion Model
    }
}
其它的我就不写了,获取每个标题的楼主发的内容和获取数据的思路一样,真心话我正则不会,都是依葫芦画瓢取目标字符串中间的内容。所以获取楼主的内容我也只能套了好几层,那个地方的div是循环出来的,结构包括css都是一个样子,我只能根据微末的变化来这么处理了。各位看官凑合着看吧