// 使用 goquery 抓取历史天气数据的 demo。数据量较大，目前按省份分别存储，每个省份保存为一个 CSV 文件。
package main

import (
	"encoding/csv"
	"fmt"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"code.google.com/p/mahonia"
	"github.com/PuerkitoBio/goquery"
)

const (
	// YEAR is the calendar year whose monthly history pages are downloaded.
	YEAR = 2013
	// SleepTime is the pause between two monthly page requests, in milliseconds.
	SleepTime = 100

	// siteURL is the scheme and host every scraped page lives under.
	siteURL = "http://www.tianqihoubao.com"
	// userAgent is sent with every request.
	userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,like Gecko) Ubuntu Chromium/39.0.2171.65 Chrome/39.0.2171.65 Safari/537.36"
	// requestTimeout bounds a single HTTP round trip so a stalled connection
	// cannot hang the whole crawl.
	requestTimeout = 30 * time.Second
)

// httpClient is shared by every request so connections can be reused.
var httpClient = &http.Client{Timeout: requestTimeout}

// main downloads one year of weather history for every city listed on the
// site and writes one CSV file per province. Provinces whose CSV file already
// exists are skipped, so an interrupted run can be resumed.
func main() {
	sc, cc := GetCity()
	for province, cities := range sc {
		if _, err := os.Stat(csvPath(province)); err == nil {
			continue // already exported by an earlier run
		}
		// Length 0 with a capacity hint: appending to a slice created with a
		// non-zero length would leave nil entries at the front.
		weatherInfoAll := make([]*WeaterInfo, 0, len(cities)*366)
		for _, city := range cities {
			log.Println("get", province, city)
			weatherInfoAll = append(weatherInfoAll, GetWeather(httpClient, province, city, cc[city])...)
		}
		if err := SaveToCSV(province, weatherInfoAll); err != nil {
			log.Println("save csv:", err)
		}
	}
}

// csvPath returns the output file name used for the province called name.
func csvPath(name string) string {
	return fmt.Sprintf("%d%s.csv", YEAR, name)
}

// fetchDocument GETs url with the crawler's User-Agent and referer headers and
// parses the response into a goquery document. A non-200 status is reported as
// an error.
func fetchDocument(client *http.Client, url string) (*goquery.Document, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	request.Header.Add("User-Agent", userAgent)
	request.Header.Add("referer", siteURL+"/")

	resp, err := client.Do(request)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		resp.Body.Close()
		return nil, fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}
	// NOTE(review): goquery documents NewDocumentFromResponse as closing the
	// response body on return, so it is not closed again here - confirm
	// against the vendored goquery version.
	return goquery.NewDocumentFromResponse(resp)
}

// GetCity scrapes the history index page and returns two maps:
// sc maps a province name to the names of its cities, and cc maps a city name
// to the href of its history page with ".html" removed. Both maps are nil
// when the index page cannot be fetched or parsed; the error is logged.
func GetCity() (sc map[string][]string, cc map[string]string) {
	document, err := fetchDocument(httpClient, siteURL+"/lishi/")
	if err != nil {
		log.Println("get city list:", err)
		return nil, nil
	}

	gbk := mahonia.NewDecoder("gbk")
	sc = make(map[string][]string)
	cc = make(map[string]string)
	document.Find(".citychk").Find("dl").Each(func(_ int, s *goquery.Selection) {
		province := gbk.ConvertString(s.Find("dt").Find("b").Text())
		cities := make([]string, 0, 20)
		s.Find("dd").Find("a").Each(func(_ int, se *goquery.Selection) {
			uri, exists := se.Attr("href")
			if !exists {
				return
			}
			name := gbk.ConvertString(se.Text())
			cities = append(cities, name)
			cc[name] = strings.Replace(uri, ".html", "", -1)
		})
		sc[province] = cities
	})
	return sc, cc
}

// WeaterInfo is one row of weather history: one day for one city.
type WeaterInfo struct {
	Province string
	City     string
	Date     string
	Info     string
	Temp     string
	Wind     string
}

// GetWeather downloads all twelve monthly pages of YEAR for one city and
// returns the rows found, in month order. province and city are copied into
// every returned record; name is the city's path as stored in the second map
// returned by GetCity. Months that fail to download contribute no rows.
func GetWeather(client *http.Client, province, city, name string) []*WeaterInfo {
	weatherInfoYear := make([]*WeaterInfo, 0, 380)
	for month := 1; month <= 12; month++ {
		url := fmt.Sprintf("%s%s/month/%d%02d.html", siteURL, name, YEAR, month)
		weatherInfoYear = append(weatherInfoYear, GetWeatherInfo(client, url, province, city)...)
		// Pause between requests to avoid hammering the site.
		time.Sleep(time.Millisecond * SleepTime)
	}
	return weatherInfoYear
}

// GetWeatherInfo downloads one monthly page and returns one record per table
// row, each tagged with province and city. It returns nil and logs the error
// when the page cannot be fetched or parsed.
func GetWeatherInfo(client *http.Client, url, province, city string) (weaterInfos []*WeaterInfo) {
	document, err := fetchDocument(client, url)
	if err != nil {
		log.Println("get weather page:", err)
		return nil
	}

	gbk := mahonia.NewDecoder("gbk")
	weaterInfos = make([]*WeaterInfo, 0, 31)
	document.Find("#content").Find("tbody").Find("tr").Each(func(row int, s *goquery.Selection) {
		// Skip the first row (presumably the table header).
		if row == 0 {
			return
		}
		info := &WeaterInfo{Province: province, City: city}
		s.Find("td").Each(func(col int, se *goquery.Selection) {
			switch col {
			case 0:
				// The date cell's text is read from its link.
				info.Date = gbk.ConvertString(se.Find("a").Text())
			case 1:
				info.Info = gbk.ConvertString(se.Text())
			case 2:
				info.Temp = gbk.ConvertString(se.Text())
			case 3:
				info.Wind = gbk.ConvertString(se.Text())
			}
		})
		weaterInfos = append(weaterInfos, info)
	})
	return weaterInfos
}

// SaveToCSV writes weatherInfos to the CSV file for the province called file.
// An existing file is left untouched and nil is returned. Nil records are
// skipped. The first error from creating, writing, flushing or closing the
// file is returned.
func SaveToCSV(file string, weatherInfos []*WeaterInfo) (err error) {
	filePath := csvPath(file)
	if _, statErr := os.Stat(filePath); statErr == nil {
		return nil
	}

	f, err := os.Create(filePath)
	if err != nil {
		return err
	}
	defer func() {
		// A failed Close can mean lost data, so surface it unless an earlier
		// error is already being returned.
		if closeErr := f.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	// UTF-8 byte order mark, presumably so spreadsheet tools detect the encoding.
	if _, err = f.WriteString("\xEF\xBB\xBF"); err != nil {
		return err
	}

	w := csv.NewWriter(f)
	if err = w.Write([]string{"省份", "城市", "日期", "天气状况", "气温", "风力风向"}); err != nil {
		return err
	}
	for _, info := range weatherInfos {
		if info == nil {
			continue
		}
		record := []string{
			TrimSpace(info.Province),
			TrimSpace(info.City),
			TrimSpace(info.Date),
			TrimSpace(info.Info),
			TrimSpace(info.Temp),
			TrimSpace(info.Wind),
		}
		if err = w.Write(record); err != nil {
			return err
		}
	}
	w.Flush()
	return w.Error()
}
// spaceStripper deletes every newline and space character in a single pass.
var spaceStripper = strings.NewReplacer("\n", "", " ", "")

// TrimSpace returns value with every newline ("\n") and space (" ") removed.
// Despite the name it strips these characters everywhere in the string, not
// only at the ends; other whitespace such as tabs is left untouched.
func TrimSpace(value string) string {
	return spaceStripper.Replace(value)
}
// 日志库已经删掉了，因为它写得不太好。当然，这份代码本身也算不上好，只是临时写的东西。