使用正则表达式抓取网易云课堂中的数据

前端之家收集整理的这篇文章主要介绍了使用正则表达式抓取网易云课堂中的数据,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。

要抓取数据的页面如下:



代码

  1. package com.url;
  2.  
  3. import java.io.BufferedReader;
  4. import java.io.InputStreamReader;
  5. import java.net.URL;
  6. import java.net.URLConnection;
  7. import java.util.Vector;
  8. import java.util.regex.Matcher;
  9. import java.util.regex.Pattern;
  10.  
  /**
   * Small demo crawler: downloads a page, scans it line by line, and prints
   * selected fragments (links to the "about" page, the title text, and the text
   * found on lines carrying a {@code data-name=} attribute).
   *
   * <p>Every fetch method opens its own connection to the given URL, decodes the
   * response as UTF-8, and closes the reader when it is done.
   */
  public class PaChong {
    /** Connect and read timeout, in milliseconds, applied to every request. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Substring identifying a line that contains a link to the "about" page. */
    private static final String ABOUT_MARKER = "about/aboutus.htm#/about?";

    /** Prefix prepended to the relative link found in the page source. */
    private static final String SITE_ROOT = "http://study.163.com/";

    /** Matches runs of characters in the range U+4E00 to U+9FA5; compiled once and reused. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /** Links collected by {@link #FindOne(URL)}, in the order they were found. */
    static Vector<String> url1 = new Vector<>();

    /**
     * Opens {@code url} and returns a UTF-8 reader over the response body.
     * The caller owns the reader and must close it.
     */
    private static BufferedReader openReader(URL url) throws java.io.IOException {
      URLConnection conn = url.openConnection();
      // Bound the connect phase as well as the read phase so a dead host cannot hang the run.
      conn.setConnectTimeout(TIMEOUT_MILLIS);
      conn.setReadTimeout(TIMEOUT_MILLIS);
      return new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
    }

    /**
     * Scans the page at {@code url} for lines containing the "about" marker,
     * builds an absolute link from each one, stores it in {@link #url1}, and
     * prints it to standard output.
     *
     * <p>The link is cut at the first double quote that follows the marker; when
     * the line contains no such quote, the remainder of the line is kept as is.
     *
     * @param url page to download and scan
     * @throws Exception if the connection cannot be opened or read
     */
    public static void FindOne(URL url) throws Exception {
      try (BufferedReader reader = openReader(url)) {
        String line;
        while ((line = reader.readLine()) != null) {
          int index = line.indexOf(ABOUT_MARKER);
          if (index < 0) {
            continue;
          }
          String link = SITE_ROOT + line.substring(index);
          // Check for the closing quote explicitly instead of catching the
          // StringIndexOutOfBoundsException that substring(0, -1) would throw.
          int quote = link.indexOf('"');
          if (quote >= 0) {
            link = link.substring(0, quote);
          }
          url1.add(link);
          System.out.println(link);
        }
      }
    }

    /**
     * Prints the Chinese text of every line of the page at {@code url} that
     * contains {@code <title>}.
     *
     * @param url page to download and scan
     * @throws Exception if the connection cannot be opened or read
     */
    public static void FindTitle(URL url) throws Exception {
      try (BufferedReader reader = openReader(url)) {
        String line;
        while ((line = reader.readLine()) != null) {
          if (line.indexOf("<title>") >= 0) {
            System.out.println(PaChong.getChinese(line));
          }
        }
      }
    }

    /**
     * Prints the Chinese text of every line of the page at {@code url} that
     * contains {@code data-name=}.
     *
     * <p>Text is taken from the first double quote on the line onward; when the
     * line has no double quote, it is taken from the {@code data-name=} marker
     * onward instead of failing.
     *
     * @param url page to download and scan
     * @throws Exception if the connection cannot be opened or read
     */
    public static void FindContent(URL url) throws Exception {
      try (BufferedReader reader = openReader(url)) {
        String line;
        while ((line = reader.readLine()) != null) {
          int contentIndex = line.indexOf("data-name=");
          if (contentIndex < 0) {
            continue;
          }
          // substring(-1) would throw on a line without any double quote.
          int quote = line.indexOf('"');
          String content = line.substring(quote >= 0 ? quote : contentIndex);
          System.out.println(PaChong.getChinese(content));
        }
      }
    }

    /**
     * Extracts the Chinese characters from a piece of page source.
     *
     * @param paramValue text to scan; {@code null} is treated as empty
     * @return every run of characters in the range U+4E00 to U+9FA5, in order of
     *     appearance, each followed by a single space; empty when there is none
     */
    public static String getChinese(String paramValue) {
      if (paramValue == null) {
        return "";
      }
      StringBuilder out = new StringBuilder();
      Matcher matcher = CHINESE_RUN.matcher(paramValue);
      while (matcher.find()) {
        out.append(matcher.group(0)).append(' ');
      }
      return out.toString();
    }

    /**
     * Runs the three extraction passes against the course-search page, writing a
     * heading for each pass to standard error and the results to standard output.
     */
    public static void main(String[] args) throws Exception {
      URL url = new URL("http://study.163.com/courses-search?keyword=JAVA"); // page to crawl
      System.err.println("提取的相关介绍网页如下:");
      FindOne(url);
      System.err.println("提取的网页标题如下:");
      FindTitle(url);
      System.err.println("提取的网页内容如下:");
      FindContent(url);
    }
  }

截图:

猜你在找的正则表达式相关文章