最近,在做一个Code Kata,突然想把自己CSDN博客上面所有的文章全部列出来,而且是先写测试,再写实现(传说中的TDD)。下面把它分享出来。笔者是基于org.htmlparser.htmlparser来进行页面解析的。如果大家需要用的话,请在pom.xml文件里面加入下面的依赖。
值得一提的是,在使用org.htmlparser.htmlparser的时候,选择恰当合理的Filter(过滤器)非常重要,如果使用得当,往往会事半功倍。下面把常用的16个Filter(过滤器)列出来。
- <dependency>
- <groupId>org.htmlparser</groupId>
- <artifactId>htmlparser</artifactId>
- <version>2.1</version>
- </dependency>
16个不同的Filter,也可以分为几类。
* 判断类Filter:
- TagNameFilter
- HasAttributeFilter
- HasChildFilter
- HasParentFilter
- HasSiblingFilter
- IsEqualFilter
* 逻辑运算Filter:
- AndFilter
- NotFilter
- OrFilter
- XorFilter
* 其他Filter:
- NodeClassFilter
- StringFilter
- LinkStringFilter
- LinkRegexFilter
- RegexFilter
- CssSelectorNodeFilter
#1 TDD中测试先行,测试程序部分
- package com.winneryum.csdn;
- import static org.junit.Assert.*;
- import java.util.List;
- import org.junit.Test;
- /**
-  * Smoke tests for CSDNPageParser: each test calls one listing method and checks
-  * that it returns at least one URL, then prints what was found.
-  */
- public class CSDNPageParserTest {
-     //http://blog.csdn.net/chancein007/
-     private static final String CSDN_ID = "chancein007";
-     /** The category listing for a blog ID must not be empty. */
-     @Test
-     public void testListAllCategoryURLByCSDNIdURL(){
-         List<String> categoryUrls = new CSDNPageParser(CSDN_ID).listAllCategoryURLsByCSDNId();
-         assertFalse(categoryUrls.isEmpty());
-         System.out.println(categoryUrls);
-     }
-     /** The article listing for one fixed category URL must not be empty. */
-     @Test
-     public void testListPagesByCategoryURLs(){
-         CSDNPageParser parser = new CSDNPageParser();
-         List<String> pageUrls = parser.listPagesByCategoryURL("http://blog.csdn.net//chancein007/article/category/2331239");
-         assertFalse(pageUrls.isEmpty());
-         System.out.println(pageUrls);
-     }
-     /** The combined article listing must not be empty; every URL is printed on its own line. */
-     @Test
-     public void testGetAllPageURLs(){
-         List<String> allPageUrls = new CSDNPageParser(CSDN_ID).getAllPageURLs();
-         assertFalse(allPageUrls.isEmpty());
-         for (String pageUrl : allPageUrls) {
-             System.out.println(pageUrl);
-         }
-     }
- }
#2 程序实现部分
- package com.winneryum.csdn;
- import java.util.ArrayList;
- import java.util.Hashtable;
- import java.util.List;
- import org.htmlparser.NodeFilter;
- import org.htmlparser.Parser;
- import org.htmlparser.filters.HasAttributeFilter;
- import org.htmlparser.filters.TagNameFilter;
- import org.htmlparser.http.ConnectionManager;
- import org.htmlparser.nodes.TagNode;
- import org.htmlparser.util.NodeList;
- import org.htmlparser.util.ParserException;
- /**
-  * Collects category URLs and article URLs of a CSDN blog by fetching pages with
-  * org.htmlparser and reading the href of anchor tags.
-  * A ParserException raised while fetching or parsing is printed, and the URLs
-  * collected up to that point are returned.
-  */
- public class CSDNPageParser {
-     /** Scheme and host that site-relative hrefs are appended to. */
-     public final static String CSDN_ROOT_URL="http://blog.csdn.net";
-     /** Encoding handed to the parser for every fetched page. */
-     private static final String PAGE_ENCODING = "UTF-8";
-     /**
-      * Position, among the nodes matching id="panel_Category", of the node whose links are read.
-      * NOTE(review): the second match (index 1) was hard-coded in the original without
-      * explanation; presumably it reflects the CSDN page markup - confirm against the live page.
-      */
-     private static final int CATEGORY_PANEL_INDEX = 1;
-     /** Blog owner's CSDN ID; null when the no-argument constructor was used. */
-     private final String csdnID;
-     /**
-      * Creates a parser bound to one blog.
-      *
-      * @param csdnID the CSDN account ID used to build the blog root URL
-      */
-     public CSDNPageParser(String csdnID) {
-         this.csdnID=csdnID;
-     }
-     /**
-      * Creates a parser without a blog ID. Only {@link #listPagesByCategoryURL(String)}
-      * is usable on such an instance; the ID-based methods throw IllegalStateException.
-      */
-     public CSDNPageParser() {
-         this.csdnID=null;
-     }
-     /** Builds the blog root URL: CSDN_ROOT_URL + "/" + csdnID + "/". */
-     private String getCSDNRootBlogURL(){
-         return CSDN_ROOT_URL+"/"+csdnID+"/";
-     }
-     /**
-      * Lists the category URLs found on the blog root page.
-      * Anchors are read from the node at CATEGORY_PANEL_INDEX among those with id="panel_Category".
-      *
-      * @return the category URLs; empty when the page has too few matching nodes or a ParserException occurred
-      * @throws IllegalStateException if this instance was created without a CSDN ID
-      */
-     public List<String> listAllCategoryURLsByCSDNId() {
-         if (csdnID == null) {
-             // Without an ID the root URL would become ".../null/", which is never a valid blog.
-             throw new IllegalStateException("csdnID is not set; use CSDNPageParser(String csdnID)");
-         }
-         List<String> categoryURLs=new ArrayList<String>();
-         try {
-             Parser pageParser = openPage(getCSDNRootBlogURL());
-             NodeFilter panelFilter = new HasAttributeFilter( "id","panel_Category" );
-             // Alternative left disabled in the original: AND this filter with NodeClassFilter(LinkTag.class).
-             NodeList panels = pageParser.extractAllNodesThatMatch(panelFilter);
-             // Guard the fixed index: a page with fewer matches yields no categories instead of a crash.
-             if (panels.size() > CATEGORY_PANEL_INDEX) {
-                 collectLinks(panels.elementAt(CATEGORY_PANEL_INDEX).toHtml(), categoryURLs);
-             }
-         } catch (ParserException e) {
-             // Best effort: report the failure and return whatever was collected so far.
-             e.printStackTrace();
-         }
-         return categoryURLs;
-     }
-     /**
-      * Lists the article URLs found on one category page by reading the anchors inside its h1 tags.
-      *
-      * @param categoryURL URL of the category page to fetch
-      * @return the article URLs; empty when no h1 anchors were found or a ParserException occurred
-      */
-     public List<String> listPagesByCategoryURL(String categoryURL) {
-         List<String> pageURLs=new ArrayList<String>();
-         try {
-             Parser pageParser = openPage(categoryURL);
-             NodeList headings = pageParser.extractAllNodesThatMatch(new TagNameFilter("h1"));
-             collectLinks(headings.toHtml(), pageURLs);
-         } catch (ParserException e) {
-             // Best effort: report the failure and return whatever was collected so far.
-             e.printStackTrace();
-         }
-         return pageURLs;
-     }
-     /**
-      * Lists the article URLs of every category of the blog, category by category.
-      * Results are concatenated without de-duplication, so a URL linked from more than
-      * one category page appears more than once.
-      *
-      * @return all article URLs collected from the category pages
-      * @throws IllegalStateException if this instance was created without a CSDN ID
-      */
-     public List<String> getAllPageURLs() {
-         List<String> allPageURLs=new ArrayList<String>();
-         for(String categoryURL:listAllCategoryURLsByCSDNId()){
-             allPageURLs.addAll(listPagesByCategoryURL(categoryURL));
-         }
-         return allPageURLs;
-     }
-     /**
-      * Creates a parser pointed at the given URL with the User-Agent request property set to "Firefox".
-      * The property is set on the ConnectionManager returned by the static Parser.getConnectionManager()
-      * call. NOTE(review): presumably that manager is shared by all Parser instances - confirm.
-      *
-      * @param url the page to fetch
-      * @return a parser ready to extract nodes from the page
-      * @throws ParserException if the parser cannot connect to or read the URL
-      */
-     @SuppressWarnings({"rawtypes", "unchecked"}) // raw Hashtable kept to match the library call as originally written
-     private Parser openPage(String url) throws ParserException {
-         Parser pageParser = new Parser();
-         ConnectionManager connectionManager=Parser.getConnectionManager ();
-         Hashtable hashTable=connectionManager.getRequestProperties();
-         hashTable.put("User-Agent","Firefox");
-         connectionManager.setRequestProperties(hashTable);
-         pageParser.setURL(url);
-         pageParser.setEncoding(PAGE_ENCODING);
-         return pageParser;
-     }
-     /**
-      * Parses an HTML fragment and appends the URL of every anchor that has an href to the target list.
-      *
-      * @param html the HTML fragment to scan; null or blank fragments are ignored
-      * @param target the list that receives the resolved URLs
-      * @throws ParserException if the fragment cannot be parsed
-      */
-     private void collectLinks(String html, List<String> target) throws ParserException {
-         if (html == null || html.trim().length() == 0) {
-             // Nothing matched upstream; skip parsing an empty fragment.
-             return;
-         }
-         Parser fragmentParser = new Parser(html);
-         NodeList anchors = fragmentParser.extractAllNodesThatMatch(new TagNameFilter("a"));
-         for(int i=0;i<anchors.size();i++){
-             TagNode anchor=(TagNode)anchors.elementAt(i);
-             String href = anchor.getAttribute("href");
-             if (href == null) {
-                 // An anchor without href would otherwise produce "http://blog.csdn.netnull".
-                 continue;
-             }
-             target.add(toAbsoluteUrl(href));
-         }
-     }
-     /**
-      * Resolves an href against CSDN_ROOT_URL. An href that already starts with
-      * "http://" or "https://" is returned unchanged; anything else is appended to the root.
-      */
-     private static String toAbsoluteUrl(String href) {
-         if (href.startsWith("http://") || href.startsWith("https://")) {
-             return href;
-         }
-         return CSDN_ROOT_URL+href;
-     }
- }
#3 注意事项
注意上面程序实现中设置User-Agent的这段代码:
- // Set the "User-Agent" request property to "Firefox" on the ConnectionManager obtained from Parser.
- ConnectionManager connectionManager=Parser.getConnectionManager ();
- Hashtable hashTable=connectionManager.getRequestProperties();
- hashTable.put("User-Agent","Firefox");
- connectionManager.setRequestProperties(hashTable);
如果没有这段代码,CSDN网站就会认为这是一个机器在访问CSDN网站,从而返回403 Forbidden的状态码,htmlparser随之抛出下面的异常。
org.htmlparser.util.ParserException: Exception getting input stream from http://blog.csdn.net/chancein007/ (Server returned HTTP response code: 403 for URL: http://blog.csdn.net/chancein007/).;
java.io.IOException: Server returned HTTP response code: 403 for URL: http://blog.csdn.net/chancein007/
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
at sun.net.www.protocol.http.HttpURLConnection$6.run(HttpURLConnection.java:1675)
at sun.net.www.protocol.http.HttpURLConnection$6.run(HttpURLConnection.java:1673)
at java.security.AccessController.doPrivileged(Native Method)
at sun.net.www.protocol.http.HttpURLConnection.getChainedException(HttpURLConnection.java:1671)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1244)
at org.htmlparser.lexer.Page.setConnection(Page.java:576)
at org.htmlparser.lexer.Page.<init>(Page.java:133)
at org.htmlparser.lexer.Lexer.<init>(Lexer.java:185)
at org.htmlparser.Parser.setConnection(Parser.java:419)
at org.htmlparser.Parser.setURL(Parser.java:448)
at com.winneryum.csdn.CSDNPageParser.listAllCategoryURLsByCSDNId(CSDNPageParser.java:38)
at com.winneryum.csdn.CSDNPageParser.getAllPageURLs(CSDNPageParser.java:96)
at com.winneryum.csdn.CSDNPageParserTest.testGetAllPageURLs(CSDNPageParserTest.java:34)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
at org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
at org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
at org.junit.runners.ParentRunner.run(ParentRunner.java:363)
at org.eclipse.jdt.internal.junit4.runner.JUnit4TestReference.run(JUnit4TestReference.java:86)
at org.eclipse.jdt.internal.junit.runner.TestExecution.run(TestExecution.java:38)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:459)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.runTests(RemoteTestRunner.java:678)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.run(RemoteTestRunner.java:382)
at org.eclipse.jdt.internal.junit.runner.RemoteTestRunner.main(RemoteTestRunner.java:192)
Caused by: java.io.IOException: Server returned HTTP response code: 403 for URL: http://blog.csdn.net/chancein007/
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1626)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:468)
at org.htmlparser.http.ConnectionManager.openConnection(ConnectionManager.java:661)
at org.htmlparser.http.ConnectionManager.openConnection(ConnectionManager.java:849)
... 27 more