要抓取数据的页面如下:
代码:
- package com.url;
- import java.io.BufferedReader;
- import java.io.InputStreamReader;
- import java.net.URL;
- import java.net.URLConnection;
- import java.util.Vector;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
/**
 * Small line-oriented page scraper.
 *
 * <p>Each {@code Find*} method downloads the given URL, scans the response one line at a
 * time for a fixed marker, and prints what it extracts to standard output. {@link #main}
 * runs all three against a study.163.com course-search page.
 *
 * <p>The page is fetched once per {@code Find*} call; nothing is cached between calls.
 */
public class PaChong {

    /**
     * Links collected by {@link #FindOne(URL)}, in the order they were found.
     * Entries accumulate across calls; the list is never cleared by this class.
     */
    static Vector<String> url1 = new Vector<>();

    /** Prefix prepended to every relative "about" link found by {@link #FindOne(URL)}. */
    private static final String SITE_ROOT = "http://study.163.com/";

    /** Substring that identifies an "about" link inside a line of page source. */
    private static final String ABOUT_LINK_MARKER = "about/aboutus.htm#/about?";

    /** Substring that identifies the line carrying the page title. */
    private static final String TITLE_TAG = "<title>";

    /** Attribute whose quoted value {@link #FindContent(URL)} extracts. */
    private static final String DATA_NAME_ATTR = "data-name=";

    /** Connect and read timeout, in milliseconds, applied to every request. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** One or more consecutive characters in the CJK range used by {@link #getChinese(String)}. */
    private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

    /**
     * Opens {@code url} and returns a UTF-8 reader over the response body.
     * The caller owns the reader and must close it.
     */
    private static BufferedReader openReader(URL url) throws java.io.IOException {
        URLConnection connection = url.openConnection();
        // Bound the connect phase as well as reads so an unreachable host cannot hang forever.
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        return new BufferedReader(new InputStreamReader(connection.getInputStream(), "UTF-8"));
    }

    /**
     * Finds "about" links in the page at {@code url}, prints each one, and appends it to
     * {@link #url1}.
     *
     * <p>For every line containing the about-link marker, the text from the marker up to
     * (not including) the next double quote is taken as the relative link. If the line has
     * no double quote after the marker, the remainder of the line is used unchanged. Only
     * the first marker on a line is considered.
     *
     * @param url page to download and scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindOne(URL url) throws Exception {
        try (BufferedReader reader = openReader(url)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int index = line.indexOf(ABOUT_LINK_MARKER);
                if (index < 0) {
                    continue;
                }
                String link = SITE_ROOT + line.substring(index);
                // Cut at the attribute's closing quote when present; checked explicitly
                // instead of relying on an exception from substring(0, -1).
                int quote = link.indexOf('"');
                if (quote >= 0) {
                    link = link.substring(0, quote);
                }
                url1.add(link);
                System.out.println(link);
            }
        }
    }

    /**
     * Prints the Chinese text of every line of the page at {@code url} that contains
     * {@code <title>}.
     *
     * @param url page to download and scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindTitle(URL url) throws Exception {
        try (BufferedReader reader = openReader(url)) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf(TITLE_TAG) >= 0) {
                    System.out.println(PaChong.getChinese(line));
                }
            }
        }
    }

    /**
     * Prints the Chinese text of every double-quoted {@code data-name} attribute value in
     * the page at {@code url}, one output line per attribute occurrence.
     *
     * <p>The value is located relative to the attribute itself, so quoted text belonging to
     * other attributes on the same line is not included. An occurrence with no opening
     * quote after it is skipped; an occurrence with no closing quote uses the rest of the
     * line.
     *
     * @param url page to download and scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindContent(URL url) throws Exception {
        try (BufferedReader reader = openReader(url)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int attr = line.indexOf(DATA_NAME_ATTR);
                while (attr >= 0) {
                    int open = line.indexOf('"', attr + DATA_NAME_ATTR.length());
                    if (open < 0) {
                        break; // no quoted value follows; nothing more to extract on this line
                    }
                    int close = line.indexOf('"', open + 1);
                    String value = (close < 0)
                            ? line.substring(open + 1)
                            : line.substring(open + 1, close);
                    System.out.println(PaChong.getChinese(value));
                    if (close < 0) {
                        break;
                    }
                    attr = line.indexOf(DATA_NAME_ATTR, close + 1);
                }
            }
        }
    }

    /**
     * Extracts the Chinese characters from a string using a regular expression.
     *
     * @param paramValue text to scan; {@code null} is treated as empty
     * @return every run of consecutive Chinese characters found, in order, each followed by
     *         a single space; the empty string if there are none
     */
    public static String getChinese(String paramValue) {
        if (paramValue == null) {
            return "";
        }
        StringBuilder found = new StringBuilder();
        Matcher matcher = CHINESE_RUN.matcher(paramValue);
        while (matcher.find()) {
            found.append(matcher.group(0)).append(' ');
        }
        return found.toString();
    }

    /**
     * Runs the three extraction passes against a fixed course-search page.
     * Section headers go to standard error; extracted data goes to standard output.
     *
     * @param args unused
     * @throws Exception if the page cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://study.163.com/courses-search?keyword=JAVA"); // page to crawl
        System.err.println("提取的相关介绍网页如下:");
        FindOne(url);
        System.err.println("提取的网页标题如下:");
        FindTitle(url);
        System.err.println("提取的网页内容如下:");
        FindContent(url);
    }
}
截图: