正则表达式相关:C# 抓取网页类(获取网页中所有信息)

前端之家收集整理的这篇文章主要介绍了"正则表达式相关:C# 抓取网页类(获取网页中所有信息)"。前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
  using System;
  using System.Data;
  using System.Configuration;
  using System.Net;
  using System.IO;
  using System.Text;
  using System.Collections.Generic;
  using System.Text.RegularExpressions;
  using System.Threading;
  using System.Web;
  using System.Web.UI.MobileControls;

  /// <summary>
  /// Downloads one web page and exposes its HTML, title, plain text and links.
  /// The page is fetched synchronously by the constructor; check <see cref="IsGood"/>
  /// before relying on the other members.
  /// </summary>
  public class WebPage
  {
      #region Private members

      // Bodies larger than 4 MiB are rejected (same limit as the original "1 << 22" check).
      private const long MaxContentLength = 1 << 22;

      // Request timeout in milliseconds.
      private const int RequestTimeoutMs = 10000;

      // Number of undecodable characters after which a charset sniffed from the
      // document is distrusted and the system default encoding is used instead.
      private const int MisdecodeThreshold = 100;

      private const RegexOptions DefaultOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

      // The patterns are compiled once per process instead of on every call.
      private static readonly Regex TitleRegex =
          new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", DefaultOptions);
      private static readonly Regex AnchorRegex =
          new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
      // The address group is named "URL" (not "url") so that both link patterns can be
      // read with the same, case-sensitive, group name.
      private static readonly Regex FrameRegex =
          new Regex("<[i]*frame[^><]+src=(\"|')?(?<URL>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);
      private static readonly Regex LinkTextNoiseRegex =
          new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", DefaultOptions);
      private static readonly Regex ScriptRegex =
          new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", DefaultOptions);
      private static readonly Regex StyleRegex =
          new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", DefaultOptions);
      private static readonly Regex SelectRegex =
          new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", DefaultOptions);
      private static readonly Regex AnchorBlockRegex =
          new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", DefaultOptions);
      private static readonly Regex TagRegex =
          new Regex("(<[^>]+?>)|&nbsp;", DefaultOptions);
      private static readonly Regex WhitespaceRegex =
          new Regex("(\\s)+", DefaultOptions);
      // Accepts charset=utf-8, charset="utf-8" and <meta charset='utf-8'>.
      private static readonly Regex CharsetRegex =
          new Regex(@"charset\s*=\s*[""']?(?<cding>[\w\-]+)", RegexOptions.IgnoreCase);

      private Uri m_uri;                                   // address of the page (after redirects)
      private List<Link> m_links = new List<Link>();       // links found on the page, filled lazily
      private string m_title = "";                         // <title> text, filled lazily
      private string m_html = "";                          // decoded HTML source
      private string m_outstr = "";                        // cached plain text, filled lazily
      private bool m_good;                                 // true when the page was downloaded
      private int m_pagesize;                              // length of m_html in characters

      // One cookie container per host, shared by every WebPage instance.
      private static readonly Dictionary<string, CookieContainer> webcookies =
          new Dictionary<string, CookieContainer>();

      #endregion

      #region Properties

      /// <summary>
      /// Absolute address of the page, or an empty string when the address given to
      /// the constructor could not be parsed. Read-only.
      /// </summary>
      public string URL
      {
          get { return m_uri == null ? "" : m_uri.AbsoluteUri; }
      }

      /// <summary>
      /// Text of the page's &lt;title&gt; element, trimmed; empty when there is none. Read-only.
      /// </summary>
      public string Title
      {
          get
          {
              if (m_title == "")
              {
                  Match mc = TitleRegex.Match(M_html);
                  if (mc.Success)
                  {
                      m_title = mc.Groups["title"].Value.Trim();
                  }
              }
              return m_title;
          }
      }

      /// <summary>
      /// Decoded HTML source of the page; never null.
      /// </summary>
      public string M_html
      {
          get
          {
              if (m_html == null)
              {
                  m_html = "";
              }
              return m_html;
          }
      }

      /// <summary>
      /// Every link (anchors and frames) found on the page. Read-only.
      /// </summary>
      public List<Link> Links
      {
          get
          {
              if (m_links.Count == 0)
              {
                  getLinks();
              }
              return m_links;
          }
      }

      /// <summary>
      /// Plain text of the page with markup removed. Read-only.
      /// </summary>
      public string Context
      {
          get
          {
              if (m_outstr == "")
              {
                  getContext(Int16.MaxValue);
              }
              return m_outstr;
          }
      }

      /// <summary>
      /// Size of the page, measured as the number of characters in <see cref="M_html"/>.
      /// </summary>
      public int PageSize
      {
          get { return m_pagesize; }
      }

      /// <summary>
      /// Links of this page that point to the page's own host (http or https).
      /// </summary>
      public List<Link> InsiteLinks
      {
          get
          {
              if (m_uri == null)
              {
                  return new List<Link>();
              }
              // The host is escaped so that its dots are literal, and the look-ahead
              // stops "example.com" from matching "example.com.other.org".
              string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
              return getSpecialLinksByUrl(pattern, Int16.MaxValue);
          }
      }

      /// <summary>
      /// True when the page was downloaded and decoded successfully.
      /// </summary>
      public bool IsGood
      {
          get { return m_good; }
      }

      /// <summary>
      /// Host name of the site the page belongs to, or an empty string when the
      /// address could not be parsed.
      /// </summary>
      public string Host
      {
          get { return m_uri == null ? "" : m_uri.Host; }
      }

      #endregion

      /// <summary>
      /// Extracts anchor and frame links from the HTML source into the link list.
      /// Does nothing when links were already collected.
      /// </summary>
      /// <returns>The list of links collected for this page.</returns>
      private List<Link> getLinks()
      {
          if (m_links.Count != 0 || m_uri == null)
          {
              return m_links;
          }

          Regex[] regex = new Regex[] { AnchorRegex, FrameRegex };
          for (int i = 0; i < regex.Length; i++)
          {
              for (Match match = regex[i].Match(M_html); match.Success; match = match.NextMatch())
              {
                  try
                  {
                      // Relative addresses are resolved against the page address.
                      string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);

                      // Only anchors carry text; it lives in the "title" group of AnchorRegex.
                      string text = "";
                      if (i == 0)
                      {
                          text = LinkTextNoiseRegex.Replace(match.Groups["title"].Value, "");
                      }

                      Link link = new Link();
                      link.Text = text;
                      link.NavigateUrl = url;
                      m_links.Add(link);
                  }
                  catch (UriFormatException ex)
                  {
                      // A malformed href must not abort the scan of the remaining links.
                      Console.WriteLine(ex.Message);
                  }
              }
          }
          return m_links;
      }

      /// <summary>
      /// Strips markup from a piece of HTML and returns the leading characters of the result.
      /// The stripped text is cached in the instance, so <paramref name="instr"/> and
      /// <paramref name="withLink"/> only take effect while the cache is still empty.
      /// </summary>
      /// <param name="instr">HTML source.</param>
      /// <param name="firstN">Maximum number of characters to return.</param>
      /// <param name="withLink">Whether the text inside anchors is kept.</param>
      /// <returns>Plain text, at most <paramref name="firstN"/> characters long.</returns>
      private string getFirstNchar(string instr, int firstN, bool withLink)
      {
          if (m_outstr == "")
          {
              string text = instr ?? "";
              text = ScriptRegex.Replace(text, "");
              text = StyleRegex.Replace(text, "");
              text = SelectRegex.Replace(text, "");
              if (!withLink)
              {
                  text = AnchorBlockRegex.Replace(text, "");
              }
              text = TagRegex.Replace(text, "");
              // NOTE(review): the replacement string was lost in the extracted listing;
              // collapsing each whitespace run to one space is assumed - confirm.
              text = WhitespaceRegex.Replace(text, " ");
              m_outstr = text;
          }

          if (firstN <= 0)
          {
              return "";
          }
          return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
      }

      #region Public methods

      /// <summary>
      /// Returns the leading characters of the page's plain text, link text included.
      /// </summary>
      /// <param name="firstN">Maximum number of characters to return.</param>
      /// <returns>Plain text, at most <paramref name="firstN"/> characters long.</returns>
      public string getContext(int firstN)
      {
          return getFirstNchar(M_html, firstN, true);
      }

      /// <summary>
      /// Returns links of this page whose address matches a regular expression.
      /// </summary>
      /// <param name="pattern">Regular expression applied to each link address.</param>
      /// <param name="count">Maximum number of links to return.</param>
      /// <returns>The matching links, in page order.</returns>
      public List<Link> getSpecialLinksByUrl(string pattern, int count)
      {
          return FilterLinks(pattern, count, true);
      }

      /// <summary>
      /// Returns links of this page whose text matches a regular expression.
      /// </summary>
      /// <param name="pattern">Regular expression applied to each link text.</param>
      /// <param name="count">Maximum number of links to return.</param>
      /// <returns>The matching links, in page order.</returns>
      public List<Link> getSpecialLinksByText(string pattern, int count)
      {
          return FilterLinks(pattern, count, false);
      }

      /// <summary>
      /// Searches the page's plain text with a regular expression.
      /// </summary>
      /// <param name="pattern">Regular expression containing at least one capture group.</param>
      /// <returns>The first capture group of the first match, or an empty string.</returns>
      public string getSpecialWords(string pattern)
      {
          if (m_outstr == "")
          {
              getContext(Int16.MaxValue);
          }
          Match mc = new Regex(pattern, DefaultOptions).Match(m_outstr);
          return mc.Success ? mc.Groups[1].Value : string.Empty;
      }

      #endregion

      // Collects up to 'count' links whose address (matchUrl) or text matches 'pattern'.
      private List<Link> FilterLinks(string pattern, int count, bool matchUrl)
      {
          // Links are parsed lazily; make sure they exist before filtering.
          getLinks();

          Regex filter = new Regex(pattern, DefaultOptions);
          List<Link> selected = new List<Link>();
          foreach (Link link in m_links)
          {
              if (selected.Count >= count)
              {
                  break;
              }
              string candidate = matchUrl ? link.NavigateUrl : link.Text;
              if (filter.IsMatch(candidate ?? ""))
              {
                  selected.Add(link);
              }
          }
          return selected;
      }

      #region Constructors

      /// <summary>
      /// Downloads and decodes the page. Any failure leaves the instance with empty
      /// content and <see cref="IsGood"/> set to false instead of throwing.
      /// </summary>
      /// <param name="_url">Absolute address of the page.</param>
      private void Init(string _url)
      {
          // Reset state first so every property is safe to read even if parsing fails.
          m_links = new List<Link>();
          m_html = "";
          m_outstr = "";
          m_title = "";
          m_good = false;

          try
          {
              m_uri = new Uri(_url);
              if (HasBinaryExtension(_url))
              {
                  return;
              }

              HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
              rqst.AllowAutoRedirect = true;
              rqst.MaximumAutomaticRedirections = 3;
              rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
              rqst.KeepAlive = true;
              rqst.Timeout = RequestTimeoutMs;

              // Cookies are shared per host across all instances.
              lock (webcookies)
              {
                  CookieContainer jar;
                  if (!webcookies.TryGetValue(m_uri.Host, out jar))
                  {
                      jar = new CookieContainer();
                      webcookies[m_uri.Host] = jar;
                  }
                  rqst.CookieContainer = jar;
              }

              using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
              {
                  string contentType = rsps.ContentType ?? "";
                  if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                      || rsps.ContentLength > MaxContentLength)
                  {
                      return;
                  }

                  byte[] raw;
                  using (Stream sm = rsps.GetResponseStream())
                  {
                      raw = ReadBody(sm, MaxContentLength);
                  }
                  if (raw == null)
                  {
                      // Body exceeded the size limit although no Content-Length announced it.
                      return;
                  }

                  m_html = DecodeHtml(raw, contentType);
                  m_pagesize = m_html.Length;
                  m_uri = rsps.ResponseUri;
                  m_good = true;
              }
          }
          catch (Exception ex)
          {
              // Best effort: a page that cannot be fetched is reported through IsGood.
              Console.WriteLine(ex.Message);
              m_good = false;
          }
      }

      // True when the address ends with an extension that is never HTML.
      private static bool HasBinaryExtension(string url)
      {
          return url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
              || url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
              || url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase);
      }

      // Reads the whole stream; returns null as soon as more than 'limit' bytes arrive.
      private static byte[] ReadBody(Stream stream, long limit)
      {
          using (MemoryStream buffer = new MemoryStream())
          {
              byte[] chunk = new byte[8192];
              int read;
              while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
              {
                  buffer.Write(chunk, 0, read);
                  if (buffer.Length > limit)
                  {
                      return null;
                  }
              }
              return buffer.ToArray();
          }
      }

      // Picks an encoding (HTTP header first, then a charset declared in the document,
      // then the system default) and decodes the raw bytes exactly once.
      private static string DecodeHtml(byte[] raw, string contentType)
      {
          Encoding fromHeader = FindEncoding(CharsetRegex.Match(contentType));
          if (fromHeader != null)
          {
              return Decode(raw, fromHeader);
          }

          string fallback = Decode(raw, Encoding.Default);

          // Charset names are ASCII, so an ASCII view of the bytes is enough to find one.
          Encoding declared = FindEncoding(CharsetRegex.Match(Encoding.ASCII.GetString(raw)));
          if (declared == null)
          {
              return fallback;
          }

          string decoded = Decode(raw, declared);
          return LooksMisdecoded(decoded, raw) ? fallback : decoded;
      }

      // Decodes through a StreamReader so that a byte-order mark is honoured and removed.
      private static string Decode(byte[] raw, Encoding encoding)
      {
          using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
          {
              return reader.ReadToEnd();
          }
      }

      // Maps a charset match to an Encoding; null when absent or unknown to the runtime.
      private static Encoding FindEncoding(Match charset)
      {
          if (!charset.Success)
          {
              return null;
          }
          try
          {
              return Encoding.GetEncoding(charset.Groups["cding"].Value.Trim());
          }
          catch (ArgumentException)
          {
              return null;
          }
          catch (NotSupportedException)
          {
              return null;
          }
      }

      // Counts replacement characters produced by decoding. Question marks that are
      // literally present in the source bytes (0x3F) are subtracted so that pages
      // full of query strings are not mistaken for decoding failures.
      private static bool LooksMisdecoded(string decoded, byte[] raw)
      {
          int suspicious = 0;
          foreach (char c in decoded)
          {
              if (c == '?' || c == '\uFFFD')
              {
                  suspicious++;
              }
          }
          foreach (byte b in raw)
          {
              if (b == 0x3F)
              {
                  suspicious--;
              }
          }
          return suspicious >= MisdecodeThreshold;
      }

      /// <summary>
      /// Creates the page object and immediately downloads the page.
      /// </summary>
      /// <param name="_url">Absolute address of the page; percent-escapes are decoded first.</param>
      public WebPage(string _url)
      {
          try
          {
              _url = Uri.UnescapeDataString(_url);
          }
          catch (ArgumentNullException)
          {
              // Deliberately ignored: Init reports a null address through IsGood.
          }
          catch (UriFormatException)
          {
              // Deliberately ignored: the address is tried as given.
          }
          Init(_url);
      }

      #endregion
  }

调用

[csharp] view plain copy
print ?
    // Usage example: the constructor downloads the page, the properties read from it.
    WebPage webInfo = new WebPage("网址");
    string text = webInfo.Context;   // all page content with HTML tags stripped
    string html = webInfo.M_html;    // page content including HTML tags
    // ... see the other public properties of WebPage (Title, Links, InsiteLinks, ...)

猜你在找的正则表达式相关文章