前端之家收集整理的这篇文章主要介绍了
正则表达式相关:C# 抓取网页类(获取网页中所有信息),
前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;
-
-
/// <summary>
/// Downloads a single web page and exposes its title, raw HTML, plain text and links.
/// </summary>
/// <remarks>
/// The download runs synchronously inside the constructor. Failures are not thrown to the
/// caller: the message is written to the console and <see cref="IsGood"/> becomes false.
/// </remarks>
public class WebPage
{
    #region Private members

    private const RegexOptions DefaultOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // One cookie container per host, shared by all WebPage instances; access is guarded by a lock.
    private static readonly Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    private static readonly Regex TitleRegex =
        new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", DefaultOptions);

    // Both patterns expose the target as group "url"; only the anchor pattern has group "text".
    // Group names are case-sensitive in .NET, so the names must match what getLinks() reads.
    private static readonly Regex AnchorRegex =
        new Regex(@"<a\shref\s*=""(?<url>[^""]*).*?>(?<text>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    private static readonly Regex FrameRegex =
        new Regex("<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>", RegexOptions.IgnoreCase);

    // Removes tags, whitespace, non-breaking-space entities, ampersands and quotes from link text.
    private static readonly Regex LinkTextNoiseRegex =
        new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", DefaultOptions);

    private static readonly Regex ScriptRegex = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", DefaultOptions);
    private static readonly Regex StyleRegex = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", DefaultOptions);
    private static readonly Regex SelectRegex = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", DefaultOptions);
    private static readonly Regex AnchorBlockRegex = new Regex(@"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", DefaultOptions);
    private static readonly Regex TagOrNbspRegex = new Regex("(<[^>]+?>)|&nbsp;", DefaultOptions);
    private static readonly Regex WhitespaceRunRegex = new Regex("(\\s)+", DefaultOptions);

    // Finds a charset declaration inside the HTML itself (e.g. in a meta tag).
    private static readonly Regex MetaCharsetRegex = new Regex("charset=(?<cding>[^=]+)?\"", RegexOptions.IgnoreCase);

    private Uri m_uri;            // address of the page (updated to the final address after redirects)
    private List<Link> m_links;   // links found on the page, filled lazily by getLinks()
    private string m_title;       // page title, filled lazily by the Title property
    private string m_html;        // decoded HTML source
    private string m_outstr;      // cached plain text, filled lazily by getFirstNchar()
    private bool m_good;          // false when the page could not be downloaded or was skipped
    private int m_pagesize;       // length of m_html in characters

    #endregion

    #region Properties

    /// <summary>
    /// Gets the absolute URL of this page, or an empty string when the URL could not be parsed. Read-only.
    /// </summary>
    public string URL
    {
        get { return m_uri == null ? string.Empty : m_uri.AbsoluteUri; }
    }

    /// <summary>
    /// Gets the trimmed content of the page's title element, or an empty string when none is found. Read-only.
    /// </summary>
    public string Title
    {
        get
        {
            if (string.IsNullOrEmpty(m_title))
            {
                Match mc = TitleRegex.Match(M_html);
                m_title = mc.Success ? mc.Groups["title"].Value.Trim() : "";
            }
            return m_title;
        }
    }

    /// <summary>
    /// Gets the HTML source of this page; never null.
    /// </summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// Gets every link (anchors and frames) found on this page. Read-only.
    /// </summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0)
            {
                getLinks();
            }
            return m_links;
        }
    }

    /// <summary>
    /// Gets the complete plain text of this page. Read-only.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Gets the size of this page, measured as the character count of the decoded HTML.
    /// </summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>
    /// Gets the links of this page whose URL starts with this page's own host (http or https).
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // Escape the host so that its dots are matched literally instead of as "any character".
            return getSpecialLinksByUrl("^https?://" + Regex.Escape(m_uri.Host), Int16.MaxValue);
        }
    }

    /// <summary>
    /// Gets a value indicating whether this page was downloaded successfully.
    /// </summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>
    /// Gets the host name of the site this page belongs to, or an empty string when the URL could not be parsed.
    /// </summary>
    public string Host
    {
        get { return m_uri == null ? string.Empty : m_uri.Host; }
    }

    #endregion

    #region Private helpers

    /// <summary>
    /// Extracts link information from the HTML source into the link list (only when the list is empty).
    /// </summary>
    /// <returns>The list of links of this page.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            collectLinks(AnchorRegex, true);
            collectLinks(FrameRegex, false);
        }
        return m_links;
    }

    /// <summary>
    /// Adds one link per match of <paramref name="regex"/> in the HTML source.
    /// </summary>
    /// <param name="regex">Pattern exposing the target in group "url" and, optionally, the caption in group "text".</param>
    /// <param name="hasText">True when the pattern captures a "text" group that should become the link text.</param>
    private void collectLinks(Regex regex, bool hasText)
    {
        for (Match match = regex.Match(M_html); match.Success; match = match.NextMatch())
        {
            Uri absolute;
            // Targets that cannot be resolved against the page address are skipped.
            if (!Uri.TryCreate(m_uri, match.Groups["url"].Value, out absolute))
            {
                continue;
            }

            Link link = new Link();
            link.Text = hasText ? LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "") : "";
            link.NavigateUrl = HttpUtility.UrlDecode(absolute.AbsoluteUri);
            m_links.Add(link);
        }
    }

    /// <summary>
    /// Extracts plain text from HTML and returns its first characters.
    /// </summary>
    /// <remarks>
    /// The stripped text is computed once and cached; on later calls <paramref name="instr"/> and
    /// <paramref name="withLink"/> are ignored and only <paramref name="firstN"/> takes effect.
    /// </remarks>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return, counted from the start.</param>
    /// <param name="withLink">Whether the text inside anchors is kept.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr ?? "";
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            text = TagOrNbspRegex.Replace(text, "");
            // Collapse every run of whitespace into a single space.
            m_outstr = WhitespaceRunRegex.Replace(text, " ");
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches <paramref name="pattern"/>.
    /// </summary>
    /// <param name="pattern">Regular expression, applied with Multiline and IgnoreCase.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <param name="byUrl">True to test the link URL, false to test the link text.</param>
    private List<Link> filterLinks(string pattern, int count, bool byUrl)
    {
        List<Link> matches = new List<Link>();
        if (count <= 0)
        {
            return matches;
        }

        // Built once instead of once per link.
        Regex regex = new Regex(pattern, DefaultOptions);
        foreach (Link link in Links)
        {
            string candidate = byUrl ? link.NavigateUrl : link.Text;
            if (regex.IsMatch(candidate))
            {
                matches.Add(link);
                if (matches.Count >= count)
                {
                    break;
                }
            }
        }
        return matches;
    }

    /// <summary>
    /// Resolves an encoding by name, falling back to <see cref="Encoding.Default"/> when the name is
    /// empty or unknown.
    /// </summary>
    private static Encoding getEncodingOrDefault(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return Encoding.Default;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Reads the response body and decodes it, using the charset from the Content-Type header when
    /// present and otherwise a charset declared inside the HTML.
    /// </summary>
    /// <param name="sm">Response body stream.</param>
    /// <param name="contentType">Value of the Content-Type header.</param>
    /// <returns>The decoded HTML.</returns>
    private static string readHtml(Stream sm, string contentType)
    {
        const string charsetKey = "charset=";
        int ix = contentType.IndexOf(charsetKey, StringComparison.OrdinalIgnoreCase);
        if (ix != -1)
        {
            // Keep only the charset token: drop trailing parameters, blanks and quotes.
            string name = contentType.Substring(ix + charsetKey.Length);
            int end = name.IndexOf(';');
            if (end != -1)
            {
                name = name.Substring(0, end);
            }
            name = name.Trim().Trim('"', '\'');

            // Depending on the site, the result may additionally need HttpUtility.HtmlDecode.
            using (StreamReader reader = new StreamReader(sm, getEncodingOrDefault(name)))
            {
                return reader.ReadToEnd();
            }
        }

        // No charset in the header: decode with the default encoding, look for a charset
        // declaration in the markup, then re-decode the same bytes with that encoding.
        string html;
        using (StreamReader reader = new StreamReader(sm, Encoding.Default))
        {
            html = reader.ReadToEnd();
        }
        Encoding declared = getEncodingOrDefault(MetaCharsetRegex.Match(html).Groups["cding"].Value);
        byte[] bytes = Encoding.Default.GetBytes(html.ToCharArray());
        string redecoded = declared.GetString(bytes);

        // Heuristic: many '?' characters suggest the re-decoding produced garbage, so fall back.
        if (redecoded.Split('?').Length > 100)
        {
            redecoded = Encoding.Default.GetString(bytes);
        }
        return redecoded;
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Extracts the first characters of the page's plain text, including link text.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(M_html, firstN, true);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in page order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text for a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the value returned.</param>
    /// <returns>The first capture group of the first match, or an empty string when nothing matches.</returns>
    public string getSpecialWords(string pattern)
    {
        Match mc = new Regex(pattern, DefaultOptions).Match(Context);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Downloads the page at <paramref name="_url"/> and fills the instance state.
    /// </summary>
    /// <param name="_url">Absolute address of the page.</param>
    private void Init(string _url)
    {
        // Reset state before anything can throw so that every property stays usable on failure.
        m_links = new List<Link>();
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = true;

        try
        {
            m_uri = new Uri(_url);

            // Known binary downloads are skipped without issuing a request.
            if (_url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            CookieContainer cookies;
            lock (webcookies)
            {
                if (!webcookies.TryGetValue(m_uri.Host, out cookies))
                {
                    cookies = new CookieContainer();
                    webcookies[m_uri.Host] = cookies;
                }
            }
            rqst.CookieContainer = cookies;

            // The using blocks release the connection even when reading or decoding throws.
            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";

                // Only text responses of at most 4 MB (1 << 22 bytes) are accepted.
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > (1 << 22))
                {
                    m_good = false;
                    return;
                }

                using (Stream sm = rsps.GetResponseStream())
                {
                    m_html = readHtml(sm, contentType);
                }
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
            }
        }
        catch (Exception ex)
        {
            // Any failure (bad URL, network error, unexpected response) marks the page as unusable.
            Console.WriteLine(ex.Message);
            m_good = false;
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately.
    /// </summary>
    /// <param name="_url">Address of the page; percent-escapes are decoded before use.</param>
    public WebPage(string _url)
    {
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Best effort: when unescaping fails the address is used as given; Init reports real errors.
        }
        Init(_url);
    }

    #endregion
}
调用
// Constructing the object downloads the page; "网址" is a placeholder for the page address.
WebPage webInfo = new WebPage("网址");
// Plain text of the page.
string plainText = webInfo.Context;
// HTML source of the page.
string rawHtml = webInfo.M_html;
- ...参考属性
原文链接:https://www.f2er.com/regex/358218.html