聚合搜索(四)

1.4 各搜索引擎专用类

搜索引擎专用类用来完成具体的搜索任务，例如，Baidu类用来执行Baidu搜索，Google类用来执行Google搜索。它们都继承自ISearch类，主要是执行一些正则表达式操作，把搜索结果匹配出来。搜索结果作为数据，保存在了XML文件中。最后，这个XML文件按照格式化文件result.xsl的格式把搜索结果显示出来。

Search()方法的大致思路是：首先调用GetPageString()方法把搜索的关键字、页码等信息发送到特定搜索引擎，并接收搜索引擎返回的信息。然后对这个信息进行解析，分析出搜索结果的每个记录，并记录在XML文件中。然后再对搜索结果的分页导航进行分析，转换成本系统的形式，并采用Base64编码，把结果记录在XML文件中。在Search()方法执行的最后，XML文件被格式化输出到客户端浏览器显示出来。下面是6大搜索引擎专用类的具体实现代码：

//Google搜索类 google.cs

using System;

using System.Collections.Generic;

using System.Text;

using System.Text.RegularExpressions;

using System.Xml;

/// <summary>
/// Search adapter for Google: fetches the result page, extracts the records
/// with regular expressions and returns them as an XML document rooted at
/// <c>search</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing was damaged (backslashes printed as '/',
/// string literals wrapped across lines, some arguments dropped). Escapes and
/// wrapped literals are restored here; every restored argument is marked below.
/// </remarks>
public class Google : ISearch
{
    /// <summary>
    /// Runs the search and converts the returned HTML into the result XML.
    /// </summary>
    /// <returns>
    /// A document whose root holds a <c>head</c> node (page style) and a
    /// <c>body</c> node containing <c>key</c>, <c>count</c>, one <c>item</c>
    /// per hit and a <c>pageSite</c> paging node.
    /// </returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        // Plain XmlDocument, as in the Baidu class; XmlDataDocument is obsolete
        // and no DataSet feature was used.
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // Page style: the head section minus scripts, meta tags and title.
        string style = Tools.Match(xmlstr, "<head>[\\s\\S]*?</head>").Value;
        style = Tools.delTagArray(style, "script,Meta,title", true);
        XmlNode xn = Tools.CreateNode(document, document.DocumentElement, "head");
        xn.InnerText = Tools.delTagArray(style, "head", false); // drop the head tag itself

        // NOTE(review): the parent argument was missing in the listing;
        // the document root is assumed, mirroring the head node above.
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Search keyword and total record count.
        XmlNode txt = Tools.CreateNode(document, body, "key");
        // NOTE(review): parent argument restored as body - confirm.
        XmlNode count = Tools.CreateNode(document, body, "count");
        string sou = Tools.Match(xmlstr, "(?<=符合<b>)[\\s\\S]*?(?=</b>的查询)").Value;
        string count2 = Tools.Match(xmlstr, "(?<=约有<b>)[\\s\\S]*?(?=</b>)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        MatchCollection mtc = Tools.MatchCollection(xmlstr, "<div class[\\s\\S]*?</div>");

        // One item node per hit.
        foreach (Match mt in mtc)
        {
            // NOTE(review): parent arguments of item/desc/title/sour/begin/end
            // were missing in the listing; restored by analogy with the
            // surviving CreateNode(document, item, "url") call.
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode link = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            // Content of the first table cell and of the h2 heading; each is
            // used twice below, so it is extracted once.
            string cell = Tools.Match(mt.Value, "(?<=<td[^>]*?>)[\\s\\S]*?(?=</td>)").Value;
            string heading = Tools.Match(mt.Value, "(?<=<h2[^>]*?>)[\\s\\S]*?(?=</h2>)").Value;

            MatchCollection itemc = Tools.MatchCollection(cell, "[\\s\\S]*?<br[^>]*?>");

            string ul = Tools.Match(heading, "<a[\\s\\S]*?</a>").Value;
            string u_li = Tools.Match(ul, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            link.InnerText = Tools.Replace(u_li, "^\"", ""); // strip a leading quote

            string u_t = Tools.delTagArray(ul, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            if (itemc.Count > 0)
            {
                desc.InnerText = Tools.delHtml(itemc[0].Value);
            }

            // Raw fragments of the hit.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            sour1.InnerText = heading;

            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            sour2.InnerText = Tools.delTagArray(cell, "td", false);

            // Markup that opens and closes a single hit.
            XmlNode begin = Tools.CreateNode(document, item, "begin");
            begin.InnerText = Tools.Match(mt.Value, "(?=<div[^>]*?>)[\\s\\S]*?(?=<h2)").Value;

            XmlNode end = Tools.CreateNode(document, item, "end");
            end.InnerText = Tools.Match(mt.Value, "(?<=</table>)[\\s\\S]*?(?<=</div>)").Value;
        }

        // Paging bar: rewrite every Google link to this system's own form,
        // carrying the original query string Base64-encoded.
        string page = Tools.Match(xmlstr, "(?=<div id=navbar class=n>)[\\s\\S]*?(?=<center>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Match(mt.Value, @"(?<=href=/search\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // No query found: replacing the bare "/search?" prefix would
                // corrupt every other link in the bar.
                continue;
            }

            page = page.Replace("/search?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        // NOTE(review): parent argument restored as body - confirm.
        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        page = Tools.delTagArray(page, "img", false);
        pageNv.InnerText = page;

        return document;
    }
}

//百度搜索类 baidu.cs

using System;

using System.Collections.Generic;

using System.Text;

using System.Xml;

using System.Text.RegularExpressions;

/// <summary>
/// Search adapter for Baidu: fetches the result page, extracts the records
/// with regular expressions and returns them as an XML document rooted at
/// <c>search</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing was damaged (backslashes printed as '/',
/// string literals wrapped across lines, arguments and closing braces dropped).
/// Every restored piece is marked below.
/// </remarks>
public class Baidu : ISearch
{
    /// <summary>
    /// Runs the search and converts the returned HTML into the result XML.
    /// </summary>
    /// <returns>
    /// A document whose root holds a <c>head</c> node (page style) and a
    /// <c>body</c> node containing <c>key</c>, <c>count</c>, one <c>item</c>
    /// per result table and a <c>pageSite</c> paging node.
    /// </returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // Page style: the head section without scripts.
        string style = Tools.Match(xmlstr, "<head>[\\s\\S]*?</head>").Value;
        style = Tools.delTagArray(style, "script", true);

        // NOTE(review): the parent argument and the "head" tag name were missing
        // in the listing; restored by analogy with the Google class.
        XmlNode xn = Tools.CreateNode(document, document.DocumentElement, "head");
        xn.InnerText = Tools.delTagArray(style, "head", false);

        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Search keyword and total record count.
        // NOTE(review): parent arguments restored as body - confirm.
        XmlNode txt = Tools.CreateNode(document, body, "key");
        XmlNode count = Tools.CreateNode(document, body, "count");

        string sou = Tools.Match(xmlstr,
            "(?<=<input name=wd size=\"35\" class=\"i\" value=\")[\\s\\S]*?(?=\" maxlength=\"100\")").Value;
        string count2 = Tools.Match(xmlstr, "(?<=找到相关网页[^\\d])[\\s\\S]*?(?=篇)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // Every hit is rendered as its own borderless table.
        MatchCollection mtc = Tools.MatchCollection(xmlstr,
            "<table border=\"0\" cellpadding=\"0\" cellspacing=\"0\">[\\s\\S]*?</table>");

        foreach (Match mt in mtc)
        {
            // NOTE(review): parent arguments restored (item under body, the
            // rest under item) - confirm.
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode link = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            // Split the first table cell into segments that each end in <br>.
            string cell = Tools.Match(mt.Value, "(?<=<td[^>]*?>)[\\s\\S]*?(?=</td>)").Value;
            MatchCollection itemc = Tools.MatchCollection(cell, "[\\s\\S]*?<br[^>]*?>");

            if (itemc.Count >= 3)
            {
                // Segment 0 carries the link and title, segment 1 the description.
                string u1 = Tools.Match(itemc[0].Value, "<a[\\s\\S]*?</a>").Value;
                string u_li = Tools.Match(u1, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;

                // NOTE(review): the pattern argument was missing; the leading-quote
                // pattern from the Google class is assumed.
                link.InnerText = Tools.Replace(u_li, "^\"", "");

                // NOTE(review): the tag-name argument was missing; "a" assumed.
                string u_t = Tools.delTagArray(u1, "a", false);
                title.InnerText = Tools.delHtml(u_t);

                desc.InnerText = Tools.delHtml(itemc[1].Value);

                // Raw text of every segment, with the br tag removed.
                foreach (Match mt1 in itemc)
                {
                    XmlNode sour1 = Tools.CreateNode(document, item, "sour");
                    sour1.InnerText = Tools.delTagArray(mt1.Value, "br", false);
                }

                // Markup that closes a single hit.
                XmlNode end = Tools.CreateNode(document, item, "end");
                end.InnerText = "</font><br/>";
            }
        }

        // Paging bar: rewrite every Baidu link to this system's own form,
        // carrying the original query string Base64-encoded.
        string page = Tools.Match(xmlstr, "<div class=\"p\">[\\s\\S]*?</div>").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Match(mt.Value, @"(?<=href=s\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // No query found: replacing the bare "s?" prefix would corrupt
                // the other links in the bar.
                continue;
            }

            page = page.Replace("s?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        // NOTE(review): parent argument restored as body - confirm.
        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;

        return document;
    }
}

//搜狗类 sogou.cs

using System;

using System.Collections.Generic;

using System.Text;

using System.Text.RegularExpressions;

using System.Xml;

using System.Web;

/// <summary>
/// Search adapter for Sogou: fetches the result page, extracts the records
/// with regular expressions and returns them as an XML document rooted at
/// <c>search</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing was damaged (backslashes printed as '/',
/// string literals wrapped across lines, arguments dropped, HTML markers
/// stripped from some patterns). Every restored or omitted piece is marked below.
/// </remarks>
public class Sogou : ISearch
{
    /// <summary>
    /// Runs the search and converts the returned HTML into the result XML.
    /// </summary>
    /// <returns>
    /// A document whose root holds a <c>head</c> node (page style) and a
    /// <c>body</c> node containing <c>count</c>, <c>key</c>, one <c>item</c>
    /// per hit and a <c>pageSite</c> paging node.
    /// </returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        // NOTE(review): the tag-list argument was missing; "script" is taken
        // from the original comment ("remove scripts").
        xmlstr = Tools.delTagArray(xmlstr, "script", true);

        // Plain XmlDocument, as in the Baidu class; XmlDataDocument is obsolete
        // and no DataSet feature was used.
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // Page style: the first style element.
        string style = Tools.Match(xmlstr, "(?=<style[^>]*?)[\\s\\S]*?(?<=</style>)").Value;

        // NOTE(review): parent arguments of all CreateNode calls were missing in
        // the listing; restored by analogy with the Google class (head/body
        // under the root, count/key/item/pageSite under body, the rest under item).
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        head.InnerText = style;

        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Total record count and search keyword.
        XmlNode count = Tools.CreateNode(document, body, "count");
        XmlNode txt = Tools.CreateNode(document, body, "key");

        string sou = Tools.Match(xmlstr,
            "(?<=<input name=\"query\" type=\"text\" class=\"query\" size=\"35\" tabindex=\"1\" value=\")[\\s\\S]*?(?=\"/>)").Value;
        string count2 = Tools.Match(xmlstr, "(?<=找到)[\\s\\S]*?(?=个网页)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // Result region: from the content div up to and including the pagebar opening tag.
        string xmlstr2 = Tools.Match(xmlstr,
            "(?<=<div id=\"content\">)[\\s\\S]*?(?<=<div id=\"pagebar\">)").Value;
        MatchCollection mtc = Tools.MatchCollection(xmlstr2, "(<div>)[\\s\\S]*?(</div>)");

        foreach (Match mt in mtc)
        {
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");
            XmlNode begin = Tools.CreateNode(document, item, "begin");
            XmlNode end = Tools.CreateNode(document, item, "end");

            // The h2 heading carries the link and the title.
            string ul = Tools.Match(mt.Value, "(?<=<h2>)[\\s\\S]*?(?=</h2>)").Value;
            string ul_li = Tools.Match(ul, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            url.InnerText = Tools.Replace(ul_li, "[\"']", ""); // strip quotes

            // NOTE(review): the tag-name argument was missing; "a" is taken from
            // the original comment ("remove hyperlink").
            string u_t = Tools.delTagArray(ul, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            // The first paragraph is the description.
            MatchCollection itemc = Tools.MatchCollection(mt.Value, "(?<=<p[^>]*?>)[\\s\\S]*?(?=</p>)");
            if (itemc.Count > 0)
            {
                // NOTE(review): the listing passed this text through a
                // Tools.Replace whose pattern was lost; it is emitted
                // unfiltered until the pattern is recovered.
                desc.InnerText = Tools.delHtml(itemc[0].Value);
            }

            // Heading markup with inline onclick handlers removed.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            string sout1str = ul; // same h2 content as extracted above
            // NOTE(review): pattern kept exactly as printed. Because '^' follows
            // "<br", it can only match the empty string, so this call leaves the
            // text unchanged; "<br[^>]*?>" was probably intended.
            sout1str = Tools.Replace(sout1str, "(<br^[>]*?)*", "");
            sour1.InnerText = Tools.Replace(sout1str, "(?=onclick=)[\\s\\S]*?(?<=;\")", "");

            // Everything after the heading, stored as CDATA.
            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            string content = Tools.Match(mt.Value, "(?<=</h2>)[\\s\\S]*?(?=</div>)").Value;
            // NOTE(review): a Tools.Replace on content whose pattern was lost in
            // the listing is omitted here.
            Tools.CreateCData(document, sour2, content);

            begin.InnerText = "<div>";  // opens a single hit
            end.InnerText = "</div>";   // closes a single hit
        }

        // NOTE(review): the markers inside this pattern were stripped in the
        // listing (printed as "(?=)[\s\S]*?(?=)"). The pagebar div is assumed
        // because the result-region pattern above ends at it - confirm against
        // the live page.
        string page = Tools.Match(xmlstr, "(?=<div id=\"pagebar\">)[\\s\\S]*?(?=</div>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "(?=<a[\\s\\S]*?>)[\\s\\S]*?(?<=</a>)");

        // Rewrite every paging link to this system's own form, carrying the
        // original query string Base64-encoded.
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Replace(mt.Value, "\"", "");
            s2 = Tools.Match(s2, @"(?<=href=\?)[\s\S]*?(?=>)").Value;
            if (s2.Length == 0)
            {
                // No query found: replacing a bare "?" would corrupt the
                // other links in the bar.
                continue;
            }

            page = page.Replace("?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;

        return document;
    }
}

//爱问搜索类 iask.cs

using System;

using System.Collections.Generic;

using System.Text;

using System.Text.RegularExpressions;

using System.Xml;

/// <summary>
/// Search adapter for Iask: fetches the result page, extracts the records
/// with regular expressions and returns them as an XML document rooted at
/// <c>search</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing of this class was badly damaged:
/// several statements were fused into one, arguments were dropped and the HTML
/// markers inside two patterns were stripped. Statements are reconstructed by
/// analogy with the Google class; the two stripped patterns are kept as printed
/// and must be restored before the class can return useful results.
/// </remarks>
public class Iask : ISearch
{
    /// <summary>
    /// Runs the search and converts the returned HTML into the result XML.
    /// </summary>
    /// <returns>
    /// A document whose root holds a <c>head</c> node (page style) and a
    /// <c>body</c> node containing <c>count</c>, <c>key</c>, one <c>item</c>
    /// per hit and a <c>pageSite</c> paging node.
    /// </returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        // Plain XmlDocument, as in the Baidu class; XmlDataDocument is obsolete
        // and no DataSet feature was used.
        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // NOTE(review): parent arguments of all CreateNode calls were missing;
        // restored by analogy with the Google class.
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Total record count and search keyword.
        XmlNode count = Tools.CreateNode(document, body, "count");
        // NOTE(review): the listing fused the "key" node creation with the
        // keyword extraction; both statements are reconstructed here.
        XmlNode txt = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr, "(?<=<title>)[\\s\\S]*?(?= - 爱问搜索)").Value;

        string count2 = Tools.Match(xmlstr,
            "(?<=找到 <span class=\"ar\">)[\\s\\S]*?(?=</span> 篇网页)").Value;
        count.InnerText = count2;
        txt.InnerText = sou;

        // Page style: the first style element of the page. (The listing first
        // extracted the head section, then immediately overwrote that value with
        // this match on the whole page; the dead assignment is dropped.)
        string style = Tools.Match(xmlstr, "(?=<style[^>]*?>)[\\s\\S]*?(?<=</style>)").Value;
        // NOTE(review): the tag-list argument was missing; "script" is assumed.
        style = Tools.delTagArray(style, "script", true);
        head.InnerText = style;

        // NOTE(review): the start/end markers of this pattern were stripped in
        // the listing. As printed it matches only the empty string, so no
        // records are found until the markers are restored.
        string xmlstr2 = Tools.Match(xmlstr, "(?<=)[\\s\\S]*?(?=)").Value;
        MatchCollection mtc = Tools.MatchCollection(xmlstr2, "<table[^>]*?>[\\s\\S]*?</table>");

        foreach (Match mt in mtc)
        {
            // Segments of the table, each ending in a br tag.
            MatchCollection itemc = Tools.MatchCollection(mt.Value, "[\\s\\S]*?(?<=<br[^>]*?>)");
            if (itemc.Count < 2)
            {
                // The listing indexed itemc[0] and itemc[1] unconditionally,
                // which throws on a table without two segments; skip it instead.
                continue;
            }

            // NOTE(review): the listing fused the creation of item/url/desc/title
            // into a single statement; reconstructed by analogy with Google.
            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            // Segment 0 carries the link and title, segment 1 the description.
            string ul = Tools.Match(itemc[0].Value, "<a[\\s\\S]*?</a>").Value;
            string u_li = Tools.Match(ul, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;

            // NOTE(review): the Replace pattern and the delTagArray call were
            // fused in the listing; reconstructed by analogy with Google.
            url.InnerText = Tools.Replace(u_li, "^\"", "");
            string u_t = Tools.delTagArray(ul, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            desc.InnerText = Tools.delHtml(itemc[1].Value);

            // Raw fragments: the anchor, then every segment after the first.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            sour1.InnerText = ul;

            XmlNode sour2 = Tools.CreateNode(document, item, "sour");
            StringBuilder rest = new StringBuilder();
            for (int j = 1; j < itemc.Count; j++)
            {
                rest.Append(itemc[j].Value);
            }
            sour2.InnerText = rest.ToString();
        }

        // NOTE(review): the start marker of this pattern was stripped in the
        // listing. As printed it captures everything from the start of the page
        // to the first closing table tag; restore the marker of the paging bar.
        string page = Tools.Match(xmlstr, "(?<=)[\\s\\S]*?(?<=</table>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");

        // Rewrite every paging link to this system's own form, carrying the
        // original query string Base64-encoded.
        // NOTE(review): the loop body was fused with the pageSite creation in the
        // listing; reconstructed by analogy with Google.
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Match(mt.Value, @"(?<=href=/s\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // No query found: replacing the bare "/s?" prefix would corrupt
                // the other links in the bar.
                continue;
            }

            page = page.Replace("/s?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;

        return document;
    }
}

//雅虎类 yahoo.cs

using System;

using System.Collections.Generic;

using System.Text;

using System.Xml;

using System.Text.RegularExpressions;

/// <summary>
/// Search adapter for Yahoo: fetches the result page, extracts the records
/// with regular expressions and returns them as an XML document rooted at
/// <c>search</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing of this class was badly damaged:
/// several statements were fused into one, arguments and closing braces were
/// dropped, and the paging-link rewrite was lost entirely. Statements are
/// reconstructed by analogy with the Google and Sogou classes; every
/// reconstruction is marked below.
/// </remarks>
public class Yahoo : ISearch
{
    /// <summary>
    /// Runs the search and converts the returned HTML into the result XML.
    /// </summary>
    /// <returns>
    /// A document whose root holds a <c>head</c> node (page style) and a
    /// <c>body</c> node containing <c>count</c>, <c>key</c>, one <c>item</c>
    /// per hit and a <c>pageSite</c> paging node.
    /// </returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        // Cut out the "pm r" block up to the next closing table tag.
        xmlstr = Tools.Replace(xmlstr, "(?=<div class=\"pm r\">)[\\s\\S]*?(?=</table>)", "");

        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // NOTE(review): the listing fused the head and body node creation and
        // dropped all parent arguments; reconstructed by analogy with Google.
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Total record count and search keyword.
        // NOTE(review): the listing fused the count/key node creation with the
        // keyword extraction; reconstructed here.
        XmlNode count = Tools.CreateNode(document, body, "count");
        XmlNode txt = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr, "(?<=<title>雅虎搜索_)[\\s\\S]*?(?=</title>)").Value;

        count.InnerText = Tools.Match(xmlstr, "(?<=共返回[^\\d])[\\s\\S]*?(?=项)").Value;
        txt.InnerText = sou;

        // Page style: head section without scripts, title and meta tags,
        // reduced to its style element.
        string style = Tools.Match(xmlstr, "(?<=<head>*?)[\\s\\S]*?(?<=</head>)").Value;
        // NOTE(review): the tag-list argument was missing; "script,title" is
        // taken from the original comment ("remove scripts and title").
        style = Tools.delTagArray(style, "script,title", true);
        style = Tools.delTagArray(style, "Meta", false);
        style = Tools.Match(style, "(?=<style>)[\\s\\S]*?(?<=</style>)").Value;
        head.InnerText = style;

        MatchCollection mtc = Tools.MatchCollection(xmlstr, "(<div class=\"i\">)[\\s\\S]*?(</table>)");

        foreach (Match mt in mtc)
        {
            XmlNode item = Tools.CreateNode(document, body, "item");

            // Markup between the opening div and the details table holds the link.
            string link = Tools.Match(mt.Value,
                "(?<=<div class=\"i\">)[\\s\\S]*?(?=<table cellspacing=\"0\">)").Value;
            string ul_li = Tools.Match(link, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;

            if (ul_li != "")
            {
                // NOTE(review): the listing fused the creation of url/desc/title
                // into one statement; reconstructed by analogy with Google.
                XmlNode url = Tools.CreateNode(document, item, "url");
                XmlNode desc = Tools.CreateNode(document, item, "desc");
                XmlNode title = Tools.CreateNode(document, item, "title");

                // NOTE(review): Replace pattern and tag name were missing;
                // restored by analogy with Google.
                url.InnerText = Tools.Replace(ul_li, "^\"", "");
                string u_t = Tools.delTagArray(link, "a", false);
                title.InnerText = Tools.delHtml(u_t);

                string de = Tools.Match(mt.Value,
                    "(?=<td class=\"d\">)[\\s\\S]*?(?<=<div class=\"rel\">)").Value;
                desc.InnerText = Tools.delHtml(de);

                // Markup that opens and closes a single hit.
                XmlNode begin = Tools.CreateNode(document, item, "begin");
                begin.InnerText = "<div class=\"i\">";

                XmlNode end = Tools.CreateNode(document, item, "end");
                end.InnerText = "</div>";

                // Raw fragments of the hit.
                XmlNode sour1 = Tools.CreateNode(document, item, "sour");
                // NOTE(review): the listing passed link through a Tools.Replace
                // whose pattern was lost; the text is emitted unfiltered until
                // the pattern is recovered.
                sour1.InnerText = link;

                // NOTE(review): the sour2 node creation and the two Replace
                // calls below were fused in the listing; reconstructed.
                XmlNode sour2 = Tools.CreateNode(document, item, "sour");
                string sourstr2 = Tools.Match(mt.Value,
                    "(?=<table cellspacing=\"0\">)[\\s\\S]*?(?<=</table>)").Value;
                sourstr2 = Tools.Replace(sourstr2, "(<a[^>]*?)[\\s\\S]*?(?<=- )", "");
                sour2.InnerText = Tools.Replace(sourstr2, "(?=onclick=)[\\s\\S]*?(?<=;\")", "");
            }
        }

        string page = Tools.Match(xmlstr, "(<div id=\"pg\">)[\\s\\S]*?(?<=</div>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "(?=<a[\\s\\S]*?>)[\\s\\S]*?(?<=</a>)");

        // Rewrite every paging link to this system's own form, carrying the
        // original query string Base64-encoded.
        // NOTE(review): the original href pattern and link prefix were lost in
        // the listing. This generic rewrite replaces any "path?query" href with
        // the local form - confirm against the live page.
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Replace(mt.Value, "\"", "");
            string target = Tools.Match(s2, @"(?<=href=)[^\s>]*\?[^\s>]*").Value;
            if (target.Length == 0)
            {
                continue; // anchor without a query string: nothing to rewrite
            }

            string query = target.Substring(target.IndexOf('?') + 1);
            page = page.Replace(target,
                "?nav_go_post=" + Tools.ToBase64(query) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;

        return document;
    }
}

//中搜搜索类 zhongsou.cs

using System;

using System.Collections.Generic;

using System.Text;

using System.Text.RegularExpressions;

using System.Xml;

/// <summary>
/// Search adapter for Zhongsou: fetches the result page, extracts the records
/// with regular expressions and returns them as an XML document rooted at
/// <c>search</c>.
/// </summary>
/// <remarks>
/// NOTE(review): the published listing of this class was badly damaged:
/// several statements were fused into one, arguments were dropped and some
/// variables (sou, u_t, sour2, pageNv) were used without a surviving
/// declaration. Statements are reconstructed by analogy with the Google class;
/// every reconstruction is marked below.
/// </remarks>
public class Zhongsou : ISearch
{
    /// <summary>
    /// Runs the search and converts the returned HTML into the result XML.
    /// </summary>
    /// <returns>
    /// A document whose root holds a <c>head</c> node (page style) and a
    /// <c>body</c> node containing <c>count</c>, <c>key</c>, one <c>item</c>
    /// per hit and a <c>pageSite</c> paging node.
    /// </returns>
    public override System.Xml.XmlDocument Search()
    {
        string xmlstr = GetPageString(); // raw HTML of the result page

        XmlDocument document = new XmlDocument();
        document.LoadXml("<search/>");

        // NOTE(review): parent arguments of all CreateNode calls were missing;
        // restored by analogy with the Google class.
        XmlNode head = Tools.CreateNode(document, document.DocumentElement, "head");
        XmlNode body = Tools.CreateNode(document, document.DocumentElement, "body");

        // Total record count and search keyword.
        XmlNode count = Tools.CreateNode(document, body, "count");
        // NOTE(review): the listing fused the "key" node creation with the
        // keyword extraction; both statements are reconstructed here.
        XmlNode txt2 = Tools.CreateNode(document, body, "key");
        string sou = Tools.Match(xmlstr, "(?<=<title>中搜网页_)[\\s\\S]*?(?=</title>)").Value;

        string count2 = Tools.Match(xmlstr, "(?<=找到)[\\s\\S]*?(?=条结果)").Value;
        count.InnerText = count2;
        txt2.InnerText = sou;

        // Page style.
        string style = Tools.Match(xmlstr, "(?=<head>)[\\s\\S]*?(?<=</head>)").Value;
        // NOTE(review): the listing fused two statements here into
        // "Tools.Match(style,true)". A style-element match followed by script
        // removal is assumed, mirroring the Iask class - confirm.
        style = Tools.Match(style, "(?=<style[^>]*?>)[\\s\\S]*?(?<=</style>)").Value;
        style = Tools.delTagArray(style, "script", true);
        head.InnerText = style;

        MatchCollection mtc = Tools.MatchCollection(xmlstr,
            "(?=<table cellspacing=\"0\" cellpadding=\"0\">)[\\s\\S]*?(?<=</table>)");

        foreach (Match mt in mtc)
        {
            // Segments of the table, each ending in a br tag.
            MatchCollection itemc = Tools.MatchCollection(mt.Value, "[\\s\\S]*?<br[^>]*?>");
            if (itemc.Count == 0)
            {
                // The listing indexed itemc[0] unconditionally, which throws on
                // a table without a br tag; skip it instead.
                continue;
            }

            XmlNode item = Tools.CreateNode(document, body, "item");
            XmlNode url = Tools.CreateNode(document, item, "url");
            XmlNode desc = Tools.CreateNode(document, item, "desc");
            XmlNode title = Tools.CreateNode(document, item, "title");

            // Segment 0 carries the link and title.
            string ul = Tools.Match(itemc[0].Value, "<a[\\s\\S]*?</a>").Value;

            // NOTE(review): the href extraction, the url assignment and the
            // delTagArray call were fused in the listing; reconstructed by
            // analogy with Google.
            string ul_li = Tools.Match(ul, "(?<=href=[\"]?).*?(?=[\"]?[\\s>])").Value;
            url.InnerText = Tools.Replace(ul_li, "^\"", "");
            string u_t = Tools.delTagArray(ul, "a", false);
            title.InnerText = Tools.delHtml(u_t);

            if (itemc.Count > 1)
            {
                desc.InnerText = Tools.delHtml(itemc[1].Value); // segment 1 is the description
            }

            // Anchor markup with the inline onmousedown handler removed.
            XmlNode sour1 = Tools.CreateNode(document, item, "sour");
            sour1.InnerText = Tools.Replace(ul, "(?=onmousedown=)[\\s\\S]*?(?<=\\)\")", "");

            // NOTE(review): the sour2 node creation was fused into the previous
            // statement in the listing; reconstructed.
            XmlNode sour2 = Tools.CreateNode(document, item, "sour");

            // Cell text without hyperlinks.
            string txt = Tools.Match(mt.Value, "(?=<td[^>]*?)[\\s\\S]*?(?<=</td>)").Value;
            txt = Tools.Replace(txt, "<a[\\s\\S]*?</a>", "");
            // NOTE(review): the tag-name argument was missing; "td" is assumed
            // by analogy with the Google class.
            txt = Tools.delTagArray(txt, "td", false);
            sour2.InnerText = txt;

            // Markup that opens and closes a single hit.
            XmlNode begin = Tools.CreateNode(document, item, "begin");
            begin.InnerText = "<table cellspacing=\"0\" cellpadding=\"0\"><tr><td class=\"f\">";

            XmlNode end = Tools.CreateNode(document, item, "end");
            end.InnerText = "</td></tr></table>";
        }

        string page = Tools.Match(xmlstr, "(<table ><tr><td class=db>)[\\s\\S]*?(</table>)").Value;
        MatchCollection mcpage = Tools.MatchCollection(page, "<a[^>]*?>[\\s\\S]*?</a>");

        // Rewrite every paging link to this system's own form, carrying the
        // original query string Base64-encoded.
        // NOTE(review): the loop body was fused with the pageSite creation in the
        // listing; reconstructed by analogy with Google.
        foreach (Match mt in mcpage)
        {
            string s2 = Tools.Match(mt.Value, @"(?<=href=p\?)[^\s>]*").Value;
            if (s2.Length == 0)
            {
                // No query found: replacing the bare "p?" prefix would corrupt
                // the other links in the bar.
                continue;
            }

            page = page.Replace("p?" + s2,
                "?nav_go_post=" + Tools.ToBase64(s2) + "&itemtype=" + ItemType);
        }

        XmlNode pageNv = Tools.CreateNode(document, body, "pageSite");
        pageNv.InnerText = page;

        return document;
    }
}

聚合搜索(四)

猜你在找的设计模式相关文章