一般是使用JSON与服务器端交互的,但是如果服务端无法提供JSON,我们通常使用HTML的解析API或者直接使用正则匹配来提取数据
我们要使用java实现上面的搜索引擎
package com.org; public class Book { private String href; private String title; private String src; public String getHref() { return href; } public void setHref(String href) { this.href = href; } public String getTitle() { return title; } public void setTitle(String title) { this.title = title; } public String getSrc() { return src; } public void setSrc(String src) { this.src = src; } @Override public String toString() { return "Book [href=" + href + ",title=" + title + ",src=" + src + "]"; } }
http获取整个html
package com.org; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.PrintStream; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; public class UtilNet { public static StringBuffer getContent(String url,String body){ URL http = null; HttpURLConnection conn = null; InputStream in = null; OutputStream out = null; PrintStream outPs = null; BufferedReader bufferIn = null; try { http = new URL(url); conn = (HttpURLConnection) http.openConnection(); conn.setRequestMethod("GET"); conn.setRequestProperty("Host","it-ebooks.info"); conn.setRequestProperty("Connection","keep-alive"); conn.setRequestProperty("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"); conn.setRequestProperty("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/35.0.1916.153 Safari/537.36 SE 2.X MetaSr 1.0"); conn.setRequestProperty("Referer",url); //conn.setRequestProperty("Accept-Encoding","gzip,deflate,sdch");这是进行压缩传输,浏览器会自动解压,而我们使用api所以不用该头文件,不然会乱码 conn.setRequestProperty("Accept-Language","zh-CN,zh;q=0.8"); conn.setDoInput(true); conn.setDoOutput(true); conn.connect(); out = conn.getOutputStream(); outPs = new PrintStream(out); //数据查询 outPs.print(body); in = conn.getInputStream(); bufferIn = new BufferedReader(new InputStreamReader(in,"utf8")); StringBuffer data2 = new StringBuffer(); String line = null; //读取数据 while((line = bufferIn.readLine())!=null){ data2.append(line + "\n"); } out.close(); outPs.close(); in.close(); conn.disconnect(); return data2; } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return null; } }
正则解析html,获取需要的数据
package com.org; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class UtilJson { public static List<Book> paraseHtml(String ct,String reg){ List<Book> books = new ArrayList<Book>(); Book bk = null; String line = null; //行 Pattern patternLine = Pattern.compile(reg,Pattern.CASE_INSENSITIVE); Matcher matcherLine = patternLine.matcher(ct); //href Pattern patternHref = Pattern.compile("/book/[0-9]*",Pattern.CASE_INSENSITIVE); //title Pattern patternTitle = Pattern.compile("'[a-zA-Z\\s]*'",Pattern.CASE_INSENSITIVE); //src Pattern patternSrc = Pattern.compile("/images/.*.jpg",Pattern.CASE_INSENSITIVE); while(matcherLine.find()){ bk = new Book(); line = matcherLine.group(0); //href title src Matcher matcherHref = patternHref.matcher(line); Matcher matcherTitle = patternTitle.matcher(line); Matcher matcherSrc = patternSrc.matcher(line); if(matcherHref.find()&&matcherTitle.find()&&matcherSrc.find()){ bk.setHref(matcherHref.group(0)); bk.setTitle(matcherTitle.group(0).replace("'","")); bk.setSrc(matcherSrc.group(0)); } books.add(bk); } return books; } }
package com.org;

import java.io.UnsupportedEncodingException;
import java.util.List;

import org.junit.Test;

/**
 * Smoke test that downloads a search result page and prints the books parsed
 * from it. It needs network access to the site in {@link #SEARCH_URL}.
 */
public class NetTest {

    /** Search page that is downloaded by the test. */
    private static final String SEARCH_URL = "http://it-ebooks.info/search/?q=java&type=title";

    /** Selects one anchor-with-image fragment per book in the downloaded page. */
    private static final String BOOK_LINK_REGEX = "<a\\shref=\"/book/[0-9]*/\"\\stitle='.*><img\\s.*</a>";

    /**
     * Downloads the search page, extracts the books and prints each one.
     * Fails with a descriptive error when the page cannot be downloaded.
     */
    @Test
    public void getHtml() throws UnsupportedEncodingException {
        StringBuffer buffer = UtilNet.getContent(SEARCH_URL, "");
        // getContent reports failures by returning null; fail clearly here
        // instead of hitting a NullPointerException on toString().
        if (buffer == null) {
            throw new AssertionError("Download failed: UtilNet.getContent returned null for " + SEARCH_URL);
        }

        List<Book> books = UtilJson.paraseHtml(buffer.toString(), BOOK_LINK_REGEX);
        for (Book bk : books) {
            System.out.println(bk.toString());
        }
    }
}