前面介绍了使用 Jsoup 解析 HTML。Jsoup 功能强大、使用方便,但需要引入第三方包。当只需要解析 HTML 中很少的内容时,可以考虑用正则表达式查找匹配的内容。下面给出一个简单的例子:爬取页面中 title 标签的文本内容,代码如下:
package com.home.parsehtml;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import android.app.Activity;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.view.View.OnClickListener;
import android.widget.Button;

/**
 * Activity that, when its button is clicked, downloads the page at
 * {@link #URL_STR} on a background thread and logs the text of the page's
 * {@code <title>} element, extracted with a regular expression.
 */
public class MainActivity extends Activity implements OnClickListener {

    /** Log tag used for error reporting. */
    private static final String TAG = "MainActivity";

    private static final String URL_STR = "http://vip.astro.sina.com.cn/iframe/astro/view/aries/day/";

    /** Upper bounds so the worker thread cannot block forever on a dead server. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 15000;

    /**
     * Matches a {@code <title>} element and captures its text in group 1.
     * <p>
     * The capture is reluctant ({@code .*?}) so it stops at the first closing
     * tag. {@code DOTALL} lets the title span several lines, and
     * {@code CASE_INSENSITIVE} accepts {@code <TITLE>}. Compiled once because
     * {@link Pattern} is immutable and thread-safe.
     */
    private static final Pattern TITLE_PATTERN = Pattern.compile(
            "<title\\b[^>]*>(.*?)</title\\s*>",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    private Button btn;

    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);
        btn = (Button) findViewById(R.id.main_btn);
        btn.setOnClickListener(this);
    }

    @Override
    public void onClick(View v) {
        if (v == btn) {
            // Network access must not run on the UI thread.
            new Thread(r).start();
        }
    }

    /** Background task: download the page, then parse and log its title. */
    Runnable r = new Runnable() {
        @Override
        public void run() {
            String content = getHtmlContent();
            doParse(content);
        }
    };

    /**
     * Downloads the HTML document at {@link #URL_STR}, decoding it as UTF-8.
     *
     * @return the text read so far, with each line terminated by
     *         {@code "\n"}; an empty or partial string if the request failed
     *         (the failure is logged, never thrown)
     */
    protected String getHtmlContent() {
        StringBuilder sb = new StringBuilder();
        HttpURLConnection conn = null;
        BufferedReader br = null;
        try {
            URL url = new URL(URL_STR);
            conn = (HttpURLConnection) url.openConnection();
            conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            conn.setReadTimeout(READ_TIMEOUT_MS);
            br = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), "utf-8"));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append("\n");
            }
        } catch (Exception e) {
            // Deliberately broad: this is the boundary of a best-effort
            // background fetch, and runtime failures must not kill the
            // worker thread.
            Log.e(TAG, "Failed to download " + URL_STR, e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    Log.e(TAG, "Failed to close response reader", e);
                }
            }
            if (conn != null) {
                // Release the underlying connection resources.
                conn.disconnect();
            }
        }
        return sb.toString();
    }

    /**
     * Extracts the content of the {@code <title>} tag with a regular
     * expression and logs it under the tag {@code "title"}. Nothing is logged
     * when no title element is found.
     *
     * @param content the HTML document text; may be {@code null} or empty
     */
    protected void doParse(String content) {
        String title = extractTitle(content);
        if (title != null) {
            Log.i("title", title);
        }
    }

    /**
     * Returns the trimmed text between the first {@code <title>} tag and its
     * closing {@code </title>} tag, or {@code null} if {@code html} is
     * {@code null} or contains no complete title element.
     */
    private static String extractTitle(String html) {
        if (html == null) {
            return null;
        }
        Matcher m = TITLE_PATTERN.matcher(html);
        return m.find() ? m.group(1).trim() : null;
    }
}