分页党统计专用工具
下载地址:http://download.csdn.net/source/2984669
Option Explicit Dim strData$,i% Dim reg As Object Dim matchs As Object Dim dic As Object,tmp Dim XmlHttp As Object Dim strUrlPre As String 'url前缀 Private Sub Form_Load() Set XmlHttp = CreateObject("Microsoft.XMLHTTP") Set dic = CreateObject("scripting.dictionary") Set reg = CreateObject("vbscript.regexp") reg.Global = True reg.IgnoreCase = True End Sub Private Sub Command1_Click() Dim intPageStart As Integer Dim intPageEnd As Integer dic.removeall reg.Pattern = "(.+?)(?:/.html|_/d+)" Set matchs = reg.Execute(Text1.Text) strUrlPre = matchs(0).SubMatches(0) reg.Pattern = ">共(/d+)页</a>" strData = getHtmlStr(strUrlPre & ".html") Set matchs = reg.Execute(strData) intPageEnd = matchs(0).SubMatches(0) Debug.Print strUrlPre & ".html" Text2.Text = strUrlPre & ".html" & vbCrLf intPageStart = Val(txtStartPage.Text) reg.Pattern = "<li class=""center""><a href=""http://hi.csdn.net/(.+?)""" For i = intPageStart To intPageEnd '页码2~最大页 If i < 2 Then i = 1 strData = getHtmlStr(strUrlPre & ".html") Else strData = getHtmlStr(strUrlPre & "_" & i & ".html") End If Set matchs = reg.Execute(strData) If matchs.Count > 0 Then dic(matchs(0).SubMatches(0)) = dic(matchs(0).SubMatches(0)) + 1 Debug.Print Format(i,"0000 ") & matchs(0).SubMatches(0) Text2.SelStart = Len(Text2.Text) Text2.SelText = Format(i,"0000 ") & matchs(0).SubMatches(0) & vbCrLf Else dic("MLGB404") = dic("MLGB404") + 1 End If Next Debug.Print "========结果汇总========" Text2.SelStart = Len(Text2.Text) Text2.SelText = "========结果汇总========" & vbCrLf For Each tmp In dic.keys Debug.Print tmp & vbTab & dic(tmp) Text2.SelStart = Len(Text2.Text) Text2.SelText = tmp & vbTab & dic(tmp) & vbCrLf Next End Sub '得到utf8格式的网页的代码 Public Function getHtmlStr(strUrl As String) As String Dim XmlHttp As Object Set XmlHttp = CreateObject("Microsoft.XMLHTTP") XmlHttp.Open "GET",strUrl,False On Error GoTo Err_net XmlHttp.send getHtmlStr = BytesToBstr(XmlHttp.ResponseBody,"UTF-8") Set XmlHttp = Nothing Err_net: End Function '转utf8格式 Private Function BytesToBstr(strBody,codeBase) As String Dim objStream As Object Set objStream = CreateObject("Adodb.Stream") objStream.Type = 1 objStream.Mode = 3 objStream.Open objStream.Write strBody objStream.position = 0 objStream.Type = 2 objStream.Charset = codeBase BytesToBstr = objStream.ReadText objStream.Close Set objStream = Nothing End Function
注意不要大量采集,否则可能会被csdn服务器拒绝,报403 Forbidden错误,这时你所有的帖子都访问不了了,要过一段时间才好。建议采集处加个延时。