python爬虫最简单代码

前端之家收集整理的这篇文章主要介绍了Python爬虫最简单代码,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
感兴趣Python爬虫最简单代码的小伙伴,下面一起跟随编程之家 jb51.cc的小编来看看吧。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Minimal Python 2 crawler.

Logs in to the site with a cookie-aware opener, walks the paginated member
index, opens every member profile linked from each page, extracts the
contact fields with BeautifulSoup and stores one row per member in the
``ft_person`` MySQL table.
"""
import cookielib
import urllib2
from urllib import urlencode

# The package is spelled ``MySQLdb``; the previous ``MysqLdb`` spelling
# cannot be imported.
import MySQLdb
from bs4 import BeautifulSoup

# Parameterized statement: the driver quotes/escapes every value, so scraped
# text containing quotes can neither break the statement nor inject SQL.
# The previous statement listed 13 columns but supplied only 3 placeholders
# and 9 values, so building it always raised TypeError.
# NOTE(review): `chanpinjingli`, `zhiyejingli` and `gerenjieshao` are written
# as empty strings and `yuanid` receives the profile href - presumably what
# the original intended; confirm against the table definition.
INSERT_PERSON_SQL = (
    "INSERT INTO `ft_person`"
    "(`mingzi`,`shouji`,`qq`,`weibo`,`emai`,`wexin`,`chanpinjingli`,"
    "`zhiyejingli`,`gerenjieshao`,`yuanid`,`gongsi`,`zhiwu`,`diqu`) "
    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
)


def _text_or_empty(locate):
    """Return ``locate().get_text()``, or ``""`` when the element is missing.

    ``locate`` is a zero-argument callable that navigates to a tag. A missing
    intermediate tag surfaces as AttributeError (``None.find``) and a missing
    list position as IndexError; both mean "field not present on this page".
    """
    try:
        return locate().get_text()
    except (AttributeError, IndexError):
        return ""


def parse_person(detail):
    """Extract the member fields from the profile's detail container.

    :param detail: the ``div.detail-left`` tag of a profile page, or ``None``
        when the page does not contain it.
    :returns: dict with keys ``username``, ``gongsi``, ``zhiwu``, ``diqu``,
        ``shouji``, ``qq``, ``weibo``, ``email``, ``weixin``; every field
        that cannot be located is ``""``.
    """
    intro = detail.find('div', class_='introduce') if detail is not None else None
    local_divs = intro.find_all('div', class_='local') if intro is not None else []
    contacts = detail.find_all('div', class_='contacts') if detail is not None else []

    def contact(index):
        return _text_or_empty(lambda: contacts[index])

    return {
        'username': _text_or_empty(
            lambda: detail.find('div', class_='detail')
                          .find('div', class_='introduce')
                          .find('h4')
                          .find('span')),
        'gongsi': _text_or_empty(lambda: local_divs[0].find('a')),
        # Previously left unassigned when missing (``except: pass``), which
        # made the later use undefined or stale from the previous member.
        'zhiwu': _text_or_empty(lambda: local_divs[1]),
        # Previously ``find(...)[2]``, which indexes a Tag's attributes and
        # therefore always failed; the third "local" div is what is wanted.
        'diqu': _text_or_empty(lambda: local_divs[2].find('a')),
        # The contact blocks are positional on the profile page.
        'shouji': contact(0),
        'qq': contact(1),
        'weibo': contact(2),
        'email': contact(3),
        'weixin': contact(4),
    }


# Database
mydb = MySQLdb.connect(host="10.0.66.248", user="root", passwd="123456",
                       db="invCloudOA", charset="utf8")
cursor = mydb.cursor()
# Positional boolean works across driver versions.
mydb.autocommit(True)

try:
    # Build an opener that keeps the session cookie between requests.
    cookie = cookielib.CookieJar()
    handler = urllib2.HTTPCookieProcessor(cookie)
    conn = urllib2.build_opener(handler)

    # Login form parameters
    param = {
        'email': '88888888@qq.com',
        'password': '88888888',
        'target': 'http://aaaaaa.com/member',
    }

    # Send the login request; the session cookie is stored in the jar.
    resp = conn.open("http://aaaaaa.com/user/login", data=urlencode(param))

    # After logging in, start collecting data page by page.
    for i in xrange(1, 200):
        print("当前页码 %d" % i)
        try:
            resp_con = conn.open("http://aaaaaa.com/member/index/page/%d" % i, timeout=20)
            resp_string = resp_con.read()
            bs = BeautifulSoup(resp_string)
            a_list = bs.select('li[class="col-sm-6 col-md-4 col-lg-6"] a')
        except StandardError as ex:
            print(ex)
            print("采集第 %d 页失败!" % i)
            continue

        for a in a_list:
            href = a.get("href")
            if not href:
                # Anchor without a target: nothing to fetch.
                continue
            # Guard each member separately so one broken profile does not
            # abort the remaining members of the page.
            try:
                person_html = conn.open("http://aaaaaa.com%s" % href, timeout=20)
                person_soup = BeautifulSoup(person_html)
                person_detail_soup = person_soup.find(
                    'div', class_='col-sm-12 col-md-8 detail-left min-padding')
                person = parse_person(person_detail_soup)

                cursor.execute(INSERT_PERSON_SQL, (
                    person['username'], person['shouji'], person['qq'],
                    person['weibo'], person['email'], person['weixin'],
                    "", "", "", href,
                    person['gongsi'], person['zhiwu'], person['diqu'],
                ))
                print("采集成功 %s" % person['username'].encode('utf-8'))
            except StandardError as ex:
                print(ex)
                print("采集该人物失败 %s" % href)
finally:
    # Close the database resources even when login or crawling fails.
    cursor.close()
    mydb.close()
 

猜你在找的Python相关文章