晚上起先和朋友们跑步去了,然后回来之后洗了个澡,打开VS新建项目发现都会弹出一个问题
然后就去找万能的度娘了,http://bbs.csdn.net/topics/390514964?page=1#post-395015041
25楼真相,卸载掉那2个补丁就可以了,不过在卸载第一个补丁的时候你需要停止他指出的那个服务。
我当初刚开始接触正则是去年公司主管让我去学,然后发了个网址给我:http://www.cnblogs.com/ie421/archive/2008/07/23/1249896.html
看完后受益颇大,下面就开始正题。
之所以要获取博客园的内容是因为博客园造就了我,而大家也都是在博客园里相识,所以我们就以博客园为例子。
下面上传的这个是当初主管给我的一个类,大家可以参考参考,我今天的内容用到了里面的GetString()这个方法。在运行之前要引用System.Web
using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;

namespace CnblogsSearch
{
    /// <summary>
    /// Thin convenience wrapper around <see cref="HttpWebRequest"/> that can issue
    /// GET/POST/HEAD requests, upload files as multipart/form-data, and return the
    /// response as a stream, a byte array, a decoded string or a file on disk.
    /// </summary>
    public class HttpClient
    {
        #region fields
        private bool keepContext;
        private string defaultLanguage = "zh-CN";
        private Encoding defaultEncoding = Encoding.UTF8;
        private string accept = "*/*";
        private string userAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
        private HttpVerb verb = HttpVerb.GET;
        private HttpClientContext context;
        private readonly List<HttpUploadingFile> files = new List<HttpUploadingFile>();
        private readonly Dictionary<string, string> postingData = new Dictionary<string, string>();
        private string url;
        private WebHeaderCollection responseHeaders;
        private int startPoint;
        private int endPoint;

        /// <summary>
        /// When true, a POST body is sent as multipart/form-data even if no file is attached.
        /// </summary>
        public bool boundaryed;

        private string encodingType = "utf-8";
        private int timeOut = 10000;
        #endregion

        #region events
        /// <summary>
        /// Raised by <see cref="GetBytes"/> after every chunk read from the response stream.
        /// </summary>
        public event EventHandler<StatusUpdateEventArgs> StatusUpdate;

        private void OnStatusUpdate(StatusUpdateEventArgs e)
        {
            // Copy to a local so the null check and the invocation see the same delegate.
            EventHandler<StatusUpdateEventArgs> handler = StatusUpdate;
            if (handler != null)
            {
                handler(this, e);
            }
        }
        #endregion

        #region properties
        /// <summary>
        /// Name of the encoding used to URL-encode form data (and to write the POST body
        /// when <see cref="encodeMemory"/> is true). It is overwritten with the charset
        /// detected by <see cref="GetString()"/> whenever one is found.
        /// </summary>
        public string EncodingType
        {
            get { return encodingType; }
            set { encodingType = value; }
        }

        /// <summary>
        /// Whether to request gzip-compressed transfer (adds "Accept-Encoding: gzip").
        /// </summary>
        public bool IsGzip { get; set; }

        /// <summary>
        /// Whether the POST body writer uses <see cref="EncodingType"/> instead of the
        /// StreamWriter default encoding.
        /// </summary>
        public bool encodeMemory { get; set; }

        /// <summary>
        /// Whether cookies and the referer are carried over automatically between requests.
        /// </summary>
        public bool KeepContext
        {
            get { return keepContext; }
            set { keepContext = value; }
        }

        /// <summary>
        /// Cookie container attached to every request created by this instance.
        /// </summary>
        public CookieContainer cookie;

        /// <summary>
        /// Preferred response language. The request code that would send it as
        /// Accept-Language is disabled, so the value is currently stored but not sent.
        /// </summary>
        public string DefaultLanguage
        {
            get { return defaultLanguage; }
            set { defaultLanguage = value; }
        }

        /// <summary>
        /// Encoding used by <see cref="GetString()"/> when no usable charset can be found
        /// in the HTTP headers or in the response body.
        /// </summary>
        public Encoding DefaultEncoding
        {
            get { return defaultEncoding; }
            set { defaultEncoding = value; }
        }

        /// <summary>
        /// Value assigned to <see cref="HttpWebRequest.Timeout"/> (milliseconds).
        /// </summary>
        public int TimeOut
        {
            get { return timeOut; }
            set { timeOut = value; }
        }

        /// <summary>
        /// Indicates whether a GET, POST or HEAD request is issued.
        /// </summary>
        public HttpVerb Verb
        {
            get { return verb; }
            set { verb = value; }
        }

        /// <summary>
        /// Files to upload. If not empty the request is automatically turned into a POST.
        /// </summary>
        public List<HttpUploadingFile> Files
        {
            get { return files; }
        }

        /// <summary>
        /// Extra form fields that may repeat the same key; appended to URL-encoded POST bodies.
        /// </summary>
        public List<RepeatPostData> repeatPostData { get; set; }

        /// <summary>
        /// Form fields to send. If not empty the request is automatically turned into a POST.
        /// </summary>
        public Dictionary<string, string> PostingData
        {
            get { return postingData; }
        }

        /// <summary>
        /// Gets or sets the address of the requested resource.
        /// </summary>
        public string Url
        {
            get { return url; }
            set { url = value; }
        }

        /// <summary>
        /// HTTP headers of the most recent response (null after <see cref="Reset"/>).
        /// </summary>
        public WebHeaderCollection ResponseHeaders
        {
            get { return responseHeaders; }
        }

        /// <summary>
        /// Gets or sets the value of the Accept header.
        /// </summary>
        public string Accept
        {
            get { return accept; }
            set { accept = value; }
        }

        /// <summary>
        /// Gets or sets the value of the User-Agent header.
        /// </summary>
        public string UserAgent
        {
            get { return userAgent; }
            set { userAgent = value; }
        }

        /// <summary>
        /// Gets or sets the cookies and referer applied to requests.
        /// </summary>
        public HttpClientContext Context
        {
            get { return context; }
            set { context = value; }
        }

        /// <summary>
        /// Gets or sets the first byte to fetch (resumable / multi-threaded downloads).
        /// </summary>
        public int StartPoint
        {
            get { return startPoint; }
            set { startPoint = value; }
        }

        /// <summary>
        /// Gets or sets the last byte to fetch. 0 means "everything from
        /// <see cref="StartPoint"/> to the end of the resource".
        /// </summary>
        public int EndPoint
        {
            get { return endPoint; }
            set { endPoint = value; }
        }
        #endregion

        #region constructors
        /// <summary>
        /// Creates a new instance without a target address.
        /// </summary>
        public HttpClient()
            : this(null)
        {
        }

        /// <summary>
        /// Creates a new instance.
        /// </summary>
        /// <param name="url">Address of the resource to fetch.</param>
        public HttpClient(string url)
            : this(url, null)
        {
        }

        /// <summary>
        /// Creates a new instance.
        /// </summary>
        /// <param name="url">Address of the resource to fetch.</param>
        /// <param name="context">Cookies and referer; a fresh context is created when null.</param>
        public HttpClient(string url, HttpClientContext context)
            : this(url, context, false)
        {
        }

        /// <summary>
        /// Creates a new instance.
        /// </summary>
        /// <param name="url">Address of the resource to fetch.</param>
        /// <param name="context">Cookies and referer; a fresh context is created when null.</param>
        /// <param name="keepContext">Whether cookies and referer are kept between requests.</param>
        public HttpClient(string url, HttpClientContext context, bool keepContext)
        {
            this.url = url;
            this.context = context ?? new HttpClientContext();
            this.keepContext = keepContext;
            cookie = new CookieContainer();
        }
        #endregion

        #region AttachFile
        /// <summary>
        /// Adds a file from disk to the upload list.
        /// </summary>
        /// <param name="fileName">Path of the file to upload.</param>
        /// <param name="fieldName">Form field name (the name in &lt;input type="file" name="fieldName"&gt;).</param>
        public void AttachFile(string fileName, string fieldName)
        {
            files.Add(new HttpUploadingFile(fileName, fieldName));
        }

        /// <summary>
        /// Adds in-memory content to the upload list.
        /// </summary>
        /// <param name="data">Content of the file to upload.</param>
        /// <param name="fileName">File name reported to the server.</param>
        /// <param name="fieldName">Form field name (the name in &lt;input type="file" name="fieldName"&gt;).</param>
        public void AttachFile(byte[] data, string fileName, string fieldName)
        {
            files.Add(new HttpUploadingFile(data, fileName, fieldName));
        }
        #endregion

        /// <summary>
        /// Clears PostingData, Files, repeatPostData, StartPoint, EndPoint, ResponseHeaders
        /// and IsGzip, and sets Verb back to GET. Call this (or reset the properties by hand)
        /// after a request that used them so the next request is not affected.
        /// </summary>
        public void Reset()
        {
            verb = HttpVerb.GET;
            files.Clear();
            postingData.Clear();
            responseHeaders = null;
            startPoint = 0;
            endPoint = 0;
            IsGzip = false;
            if (repeatPostData != null)
            {
                repeatPostData.Clear();
            }
        }

        /// <summary>
        /// Local IP address to bind outgoing connections to; null leaves the choice to the system.
        /// </summary>
        public string ip;

        private IPEndPoint BindIPEndPointCallback(ServicePoint servicePoint, IPEndPoint remoteEndPoint, int retryCount)
        {
            // Port 0 lets the OS pick any free local port on the requested address.
            return new IPEndPoint(IPAddress.Parse(ip), 0);
        }

        /// <summary>
        /// Raw Cookie header value added to non-HEAD requests when not empty.
        /// </summary>
        public string cookieStr = "";

        /// <summary>
        /// Builds the request from the current settings and, for POST, sends the body.
        /// </summary>
        private HttpWebRequest CreateRequest()
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

            req.CookieContainer = cookie;
            req.Accept = accept;
            req.UserAgent = userAgent;
            req.KeepAlive = true;
            req.AllowAutoRedirect = true;
            req.Timeout = TimeOut;

            if (IsGzip)
            {
                req.Headers.Add("Accept-Encoding", "gzip");
            }

            if (ip != null)
            {
                req.ServicePoint.BindIPEndPointDelegate = new BindIPEndPoint(BindIPEndPointCallback);
            }

            if (context.Cookies != null && req.CookieContainer != null)
            {
                req.CookieContainer.Add(context.Cookies);
            }

            if (!string.IsNullOrEmpty(context.Referer))
            {
                req.Referer = context.Referer;
            }

            if (verb == HttpVerb.HEAD)
            {
                req.Method = "HEAD";
                return req;
            }

            // Pending form data or files force a POST.
            if (postingData.Count > 0 || files.Count > 0)
            {
                verb = HttpVerb.POST;
            }

            if (cookieStr != "")
            {
                req.Headers.Add("Cookie", cookieStr);
            }

            // Headers can no longer be changed once the request stream has been opened,
            // so the Range header is set before the POST body is written.
            if (startPoint != 0 && endPoint != 0)
            {
                req.AddRange(startPoint, endPoint);
            }
            else if (startPoint != 0 && endPoint == 0)
            {
                req.AddRange(startPoint);
            }

            if (verb == HttpVerb.POST)
            {
                req.Method = "POST";

                using (MemoryStream memoryStream = new MemoryStream())
                {
                    // NOTE(review): Encoding.GetEncoding("utf-8") has a preamble, so with
                    // encodeMemory set the body presumably starts with a BOM - confirm
                    // whether target servers tolerate that.
                    StreamWriter writer = encodeMemory
                        ? new StreamWriter(memoryStream, Encoding.GetEncoding(EncodingType))
                        : new StreamWriter(memoryStream);

                    if (files.Count > 0 || boundaryed)
                    {
                        string boundary = Guid.NewGuid().ToString().Replace("-", "");
                        req.ContentType = "multipart/form-data; boundary=" + boundary;
                        WriteMultipartBody(writer, memoryStream, boundary);
                    }
                    else
                    {
                        req.ContentType = "application/x-www-form-urlencoded";
                        writer.Write(BuildFormBody());
                    }

                    writer.Flush();

                    using (Stream stream = req.GetRequestStream())
                    {
                        memoryStream.WriteTo(stream);
                    }
                }
            }

            return req;
        }

        /// <summary>
        /// Writes all form fields and attached files as multipart/form-data parts,
        /// followed by a single closing boundary.
        /// </summary>
        private void WriteMultipartBody(StreamWriter writer, MemoryStream memoryStream, string boundary)
        {
            const string newLine = "\r\n";

            foreach (KeyValuePair<string, string> field in postingData)
            {
                writer.Write("--" + boundary + newLine);
                writer.Write("Content-Disposition: form-data; name=\"{0}\"{1}{1}", field.Key, newLine);
                writer.Write(field.Value + newLine);
            }

            foreach (HttpUploadingFile file in files)
            {
                writer.Write("--" + boundary + newLine);
                writer.Write(
                    "Content-Disposition: form-data; name=\"{0}\"; filename=\"{1}\"{2}",
                    file.FieldName,
                    file.FileName,
                    newLine);
                writer.Write("Content-Type: image/jpeg" + newLine + newLine);

                // Flush the buffered part headers before writing raw bytes straight to the stream.
                writer.Flush();
                memoryStream.Write(file.Data, 0, file.Data.Length);
                writer.Write(newLine);
            }

            // The closing delimiter must appear exactly once, after the last part.
            writer.Write("--" + boundary + "--" + newLine);
        }

        /// <summary>
        /// Builds the application/x-www-form-urlencoded body from PostingData and repeatPostData.
        /// </summary>
        private string BuildFormBody()
        {
            Encoding formEncoding = Encoding.GetEncoding(EncodingType);
            StringBuilder sb = new StringBuilder();

            foreach (KeyValuePair<string, string> field in postingData)
            {
                sb.AppendFormat(
                    "{0}={1}&",
                    HttpUtility.UrlEncode(field.Key, formEncoding),
                    HttpUtility.UrlEncode(field.Value, formEncoding));
            }

            if (repeatPostData != null)
            {
                foreach (RepeatPostData item in repeatPostData)
                {
                    sb.AppendFormat(
                        "{0}={1}&",
                        HttpUtility.UrlEncode(item.key, formEncoding),
                        HttpUtility.UrlEncode(item.value, formEncoding));
                }
            }

            // Drop the trailing '&'.
            if (sb.Length > 0)
            {
                sb.Length--;
            }

            return sb.ToString();
        }

        /// <summary>
        /// Issues a new request and returns the response.
        /// Never raises <see cref="StatusUpdate"/>. The caller owns (and must dispose) the response.
        /// </summary>
        /// <returns>The corresponding <see cref="HttpWebResponse"/>.</returns>
        public HttpWebResponse GetResponse()
        {
            HttpWebRequest req = CreateRequest();
            HttpWebResponse res = (HttpWebResponse)req.GetResponse();

            responseHeaders = res.Headers;
            if (keepContext)
            {
                context.Cookies = res.Cookies;
                context.Referer = url;
                if (cookie != null)
                {
                    cookie.Add(context.Cookies);
                }
            }

            return res;
        }

        /// <summary>
        /// Issues a new request and returns the response body stream.
        /// Never raises <see cref="StatusUpdate"/>.
        /// </summary>
        /// <returns>Stream containing the response body.</returns>
        public Stream GetStream()
        {
            return GetResponse().GetResponseStream();
        }

        /// <summary>
        /// Final address of the last response read by <see cref="GetBytes"/> (after redirects).
        /// </summary>
        public string responseURL;

        /// <summary>
        /// Issues a new request and returns the response body as a byte array.
        /// Raises <see cref="StatusUpdate"/> after every chunk.
        /// </summary>
        /// <returns>Byte array containing the response body.</returns>
        public byte[] GetBytes()
        {
            using (HttpWebResponse res = GetResponse())
            using (Stream rs = res.GetResponseStream())
            using (MemoryStream memoryStream = new MemoryStream())
            {
                int length = (int)res.ContentLength;
                responseURL = res.ResponseUri.AbsoluteUri;

                byte[] buffer = new byte[0x100];
                int read;
                while ((read = rs.Read(buffer, 0, buffer.Length)) > 0)
                {
                    memoryStream.Write(buffer, 0, read);
                    OnStatusUpdate(new StatusUpdateEventArgs((int)memoryStream.Length, length));
                }

                return memoryStream.ToArray();
            }
        }

        /// <summary>
        /// Issues a new request and decodes the body using the charset from the HTTP headers,
        /// else the charset declared in the body, else <see cref="DefaultEncoding"/>.
        /// Raises <see cref="StatusUpdate"/>.
        /// </summary>
        /// <returns>The decoded string.</returns>
        public string GetString()
        {
            byte[] data = GetBytes();

            string contentEncoding = responseHeaders["Content-Encoding"];
            if (contentEncoding != null
                && contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                data = GZipDecompress(data);
            }

            string encodingName = GetEncodingFromHeaders() ?? GetEncodingFromBody(data);

            Encoding encoding;
            if (string.IsNullOrEmpty(encodingName))
            {
                encoding = defaultEncoding;
            }
            else
            {
                try
                {
                    encoding = Encoding.GetEncoding(encodingName);
                }
                catch (ArgumentException)
                {
                    // Unknown or malformed charset name: fall back rather than fail the request.
                    encoding = defaultEncoding;
                }
            }

            return encoding.GetString(data);
        }

        /// <summary>
        /// Issues a new request and decodes the body with the given encoding.
        /// Raises <see cref="StatusUpdate"/>.
        /// </summary>
        /// <param name="encoding">Encoding to decode with.</param>
        /// <returns>The decoded string.</returns>
        public string GetString(Encoding encoding)
        {
            byte[] data = GetBytes();
            return encoding.GetString(data);
        }

        /// <summary>
        /// Decompresses a gzip-compressed buffer.
        /// </summary>
        /// <param name="data">gzip-compressed bytes.</param>
        /// <returns>The decompressed bytes.</returns>
        private byte[] GZipDecompress(byte[] data)
        {
            using (MemoryStream output = new MemoryStream())
            using (MemoryStream input = new MemoryStream(data))
            using (GZipStream gZipStream = new GZipStream(input, CompressionMode.Decompress))
            {
                byte[] bytes = new byte[40960];
                int n;
                while ((n = gZipStream.Read(bytes, 0, bytes.Length)) != 0)
                {
                    output.Write(bytes, 0, n);
                }

                return output.ToArray();
            }
        }

        /// <summary>
        /// Extracts the token following the first "charset=" in <paramref name="text"/>.
        /// Surrounding quotes are skipped; the token ends at a quote, ';', '>', '/' or whitespace.
        /// </summary>
        /// <returns>The charset name, or null when none is present.</returns>
        private static string ExtractCharset(string text)
        {
            const string marker = "charset=";

            int markerIndex = text.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
            if (markerIndex == -1)
            {
                return null;
            }

            int start = markerIndex + marker.Length;
            while (start < text.Length
                   && (text[start] == '"' || text[start] == '\'' || char.IsWhiteSpace(text[start])))
            {
                start++;
            }

            int end = start;
            while (end < text.Length)
            {
                char c = text[end];
                if (c == '"' || c == '\'' || c == ';' || c == '>' || c == '/' || char.IsWhiteSpace(c))
                {
                    break;
                }
                end++;
            }

            return end > start ? text.Substring(start, end - start) : null;
        }

        /// <summary>
        /// Reads the charset from the Content-Type response header and, when found,
        /// stores it in <see cref="EncodingType"/>.
        /// </summary>
        /// <returns>The charset name, or null when the header carries none.</returns>
        private string GetEncodingFromHeaders()
        {
            if (responseHeaders == null)
            {
                return null;
            }

            string contentType = responseHeaders["Content-Type"];
            if (contentType == null)
            {
                return null;
            }

            string name = ExtractCharset(contentType);
            if (name != null)
            {
                EncodingType = name;
            }

            return name;
        }

        /// <summary>
        /// Looks for a "charset=" declaration in the body (read as ASCII) and, when found,
        /// stores it in <see cref="EncodingType"/>.
        /// </summary>
        /// <returns>The charset name, or null so the caller can fall back to DefaultEncoding.</returns>
        private string GetEncodingFromBody(byte[] data)
        {
            string dataAsAscii = Encoding.ASCII.GetString(data);

            string name = ExtractCharset(dataAsAscii);
            if (name != null)
            {
                EncodingType = name;
            }

            return name;
        }

        /// <summary>
        /// Issues a new HEAD request and returns the resource length.
        /// Calls <see cref="Reset"/> first, so pending PostingData, Files and Verb are discarded.
        /// </summary>
        /// <returns>The Content-Length reported by the server.</returns>
        public int HeadContentLength()
        {
            Reset();
            HttpVerb lastVerb = verb;
            verb = HttpVerb.HEAD;
            try
            {
                using (HttpWebResponse res = GetResponse())
                {
                    return (int)res.ContentLength;
                }
            }
            finally
            {
                // Restore the verb even when the request fails.
                verb = lastVerb;
            }
        }

        /// <summary>
        /// Issues a new request and saves the body to a file, overwriting an existing file.
        /// Raises <see cref="StatusUpdate"/>.
        /// </summary>
        /// <param name="fileName">Path of the file to write.</param>
        public void SaveAsFile(string fileName)
        {
            SaveAsFile(fileName, FileExistsAction.Overwrite);
        }

        /// <summary>
        /// Issues a new request and saves the body to a file.
        /// Raises <see cref="StatusUpdate"/>.
        /// </summary>
        /// <param name="fileName">Path of the file to write.</param>
        /// <param name="existsAction">What to do when the file already exists.</param>
        /// <returns>True when data was written to the target file.</returns>
        public bool SaveAsFile(string fileName, FileExistsAction existsAction)
        {
            byte[] data = GetBytes();

            switch (existsAction)
            {
                case FileExistsAction.Overwrite:
                    // WriteAllBytes truncates, so no stale bytes of a longer old file survive.
                    File.WriteAllBytes(fileName, data);
                    return true;

                case FileExistsAction.Append:
                    using (FileStream stream = new FileStream(fileName, FileMode.Append, FileAccess.Write))
                    {
                        stream.Write(data, 0, data.Length);
                    }
                    return true;

                default:
                    if (File.Exists(fileName))
                    {
                        return false;
                    }

                    File.WriteAllBytes(fileName, data);
                    return true;
            }
        }
    }

    /// <summary>
    /// Cookies and referer shared between requests.
    /// </summary>
    public class HttpClientContext
    {
        private CookieCollection cookies;
        private string referer;

        public CookieCollection Cookies
        {
            get { return cookies; }
            set { cookies = value; }
        }

        public string Referer
        {
            get { return referer; }
            set { referer = value; }
        }
    }

    /// <summary>
    /// A form field that may appear several times with the same key.
    /// </summary>
    public class RepeatPostData
    {
        public string key { get; set; }
        public string value { get; set; }
    }

    public enum HttpVerb
    {
        GET,
        POST,
        HEAD,
    }

    public enum FileExistsAction
    {
        Overwrite,
        Append,
        Cancel,
    }

    /// <summary>
    /// A file (name, form field and content) queued for a multipart upload.
    /// </summary>
    public class HttpUploadingFile
    {
        private string fileName;
        private string fieldName;
        private byte[] data;

        public string FileName
        {
            get { return fileName; }
            set { fileName = value; }
        }

        public string FieldName
        {
            get { return fieldName; }
            set { fieldName = value; }
        }

        public byte[] Data
        {
            get { return data; }
            set { data = value; }
        }

        /// <summary>
        /// Loads the whole file at <paramref name="fileName"/> into memory.
        /// </summary>
        /// <param name="fileName">Path of the file to upload.</param>
        /// <param name="fieldName">Form field name.</param>
        public HttpUploadingFile(string fileName, string fieldName)
        {
            this.fileName = fileName;
            this.fieldName = fieldName;
            // ReadAllBytes opens read-only and loops until the whole file is read.
            data = File.ReadAllBytes(fileName);
        }

        /// <summary>
        /// Wraps in-memory content.
        /// </summary>
        /// <param name="data">Content of the file to upload.</param>
        /// <param name="fileName">File name reported to the server.</param>
        /// <param name="fieldName">Form field name.</param>
        public HttpUploadingFile(byte[] data, string fileName, string fieldName)
        {
            this.data = data;
            this.fileName = fileName;
            this.fieldName = fieldName;
        }
    }

    /// <summary>
    /// Download progress reported through <see cref="HttpClient.StatusUpdate"/>.
    /// </summary>
    public class StatusUpdateEventArgs : EventArgs
    {
        private readonly int bytesGot;
        private readonly int bytesTotal;

        public StatusUpdateEventArgs(int got, int total)
        {
            bytesGot = got;
            bytesTotal = total;
        }

        /// <summary>
        /// Number of bytes downloaded so far.
        /// </summary>
        public int BytesGot
        {
            get { return bytesGot; }
        }

        /// <summary>
        /// Total size of the resource in bytes, as reported by Content-Length.
        /// </summary>
        public int BytesTotal
        {
            get { return bytesTotal; }
        }
    }
}
/// <summary>
/// Downloads the HTML source of a page.
/// </summary>
/// <param name="url">Page address. "http://" is prepended when it does not start with http:// or https://.</param>
/// <returns>The decoded page source.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="url"/> is null.</exception>
/// <exception cref="System.Net.WebException">The request still fails after the last attempt.</exception>
public string GetHtml(string url)
{
    if (url == null)
    {
        throw new ArgumentNullException("url");
    }

    // Bounded retry: the previous unbounded recursion overflowed the stack on a persistent failure.
    const int maxAttempts = 3;

    bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                     || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
    string address = hasScheme ? url : "http://" + url;

    for (int attempt = 1; ; attempt++)
    {
        try
        {
            HttpClient hc = new HttpClient();
            hc.Url = address;
            return hc.GetString();
        }
        catch (System.Net.WebException)
        {
            // Transient network failure: retry, then surface the error to the caller.
            if (attempt >= maxAttempts)
            {
                throw;
            }
        }
        catch (System.IO.IOException)
        {
            if (attempt >= maxAttempts)
            {
                throw;
            }
        }
    }
}
然后再观察每条随笔的规律,我们发现每条的开头是<div class="post_item_body">,结尾是<div class="clear">,那我们就可以根据这个规律来写出正则:
Regex regexContent = new Regex("<div class=\"post_item_body\">(?<content>.*?)<div class=\"clear\"></div>",RegexOptions.Singleline);
然后可以使用这个正则来获取我们需要匹配的内容了
// Download the cnblogs home page.
string Html = GetHtml("http://www.cnblogs.com/");

// One post runs from <div class="post_item_body"> to the following <div class="clear"></div>;
// Singleline lets '.' match across line breaks.
Regex regexContent = new Regex(
    "<div class=\"post_item_body\">(?<content>.*?)<div class=\"clear\"></div>",
    RegexOptions.Singleline);

// Take the "content" group of the first match (empty string when nothing matches).
Match firstPost = regexContent.Match(Html);
string blog = firstPost.Groups["content"].Value;
在这里我用到的正则匹配工具是Expresso,有需要的朋友可以留言。当然,如果我有什么地方写的不好的,欢迎各位指出。晚上就先到这里了,该洗洗睡了。