国家统计局区划码爬取

前端之家收集整理的这篇文章主要介绍了国家统计局区划码爬取,前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。

目标数据

分享图片

 

Oracle 存储表结构

-- Create table
-- Target table for the crawler: one row per administrative division.
-- NOTE(review): the table is placed in the SYSTEM tablespace; user tables
-- normally belong in a dedicated tablespace -- confirm before running.
create table VILLAGE_CODE
(
  id                INTEGER,
  area_code         VARCHAR2(500),
  city_village_code VARCHAR2(500),
  area_name         VARCHAR2(500)
)
tablespace SYSTEM
  pctfree 10
  pctused 40
  initrans 1
  maxtrans 255
  storage
  (
    initial 64K
    next 1M
    minextents 1
    maxextents unlimited
  );
-- Add comments to the columns
-- The comment texts must be quoted string literals for COMMENT ON to parse.
-- id: auto-increment ID
comment on column VILLAGE_CODE.id
  is '自增ID';
-- area_code: statistical administrative-division code
comment on column VILLAGE_CODE.area_code
  is '统计用区划代码';
-- city_village_code: urban/rural classification code
comment on column VILLAGE_CODE.city_village_code
  is '城乡分类代码    ';
-- area_name: name of the division
comment on column VILLAGE_CODE.area_name
  is '名称';

 

爬取代码

#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: [email protected]
@file: areacode.py
@time: 2018/9/29 14:40
@desc:
'''

import urllib2,re
from time import sleep
from random import random
from config import DBSession


# HTTP headers sent with every request made by openUrl().
# The value must be the bare browser string: the header name is supplied by the
# dict key, so repeating "User-Agent: " inside the value sends a malformed UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
# Module-wide database session shared by every insertVillage() call.
# DBSession is imported from the project's config module
# (presumably a SQLAlchemy sessionmaker -- TODO confirm).
session = DBSession()


def insertVillage(code,name,city_village_code=-1):
    """Insert one division row into ``village_code`` and commit it.

    :param code: value stored in the ``area_code`` column.
    :param name: value stored in the ``area_name`` column.
    :param city_village_code: value stored in ``city_village_code``;
        defaults to -1 when the caller has no such value.
    """
    # Progress trace; the single parenthesised string prints the same on
    # Python 2 and Python 3.
    print("%s %s" % (code, city_village_code))
    # Bind parameters instead of %-formatting the scraped text into the SQL:
    # the values come from a web page and may contain quote characters.
    # NOTE(review): the ":name" bind style assumes ``session`` is a SQLAlchemy
    # session (which accepts textual SQL plus a parameter dict) -- confirm.
    session.execute(
        "insert into village_code(area_code,area_name,city_village_code) "
        "VALUES (:area_code, :area_name, :city_village_code)",
        {
            "area_code": code,
            "area_name": name,
            # The column is VARCHAR2, so store the textual form as before.
            "city_village_code": "%s" % (city_village_code,),
        }
    )
    session.commit()


def openUrl(url,type):
    """Download *url* and return its body decoded as GBK text.

    Fetching is best-effort: on any ordinary error the URL and *type* are
    appended to ``error.txt`` and ``None`` is returned, so the crawl continues.

    :param url: absolute URL to fetch.
    :param type: caller-supplied level tag (the parseCodeN functions pass 1-5);
        it is only written to the error log.
    :return: the decoded page text, or ``None`` if the fetch or decode failed.
    """
    try:
        # Random pause of up to half a second before each request.
        sleep(random()*0.5)
        request = urllib2.Request(url,headers=headers)
        response = urllib2.urlopen(request,timeout=10)
        try:
            return response.read().decode("gbk")
        finally:
            # Always release the connection, even if read/decode fails.
            response.close()
    except Exception:
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long crawl can still be stopped with Ctrl-C.
        with open("error.txt","a+") as f:
            # One failed URL per line: "<url>\t<level>".
            f.write(url + "\t" + str(type) + "\n")
        return None
    
    
def parseCode1(baseUrl,lastUrl):
    """Fetch the top-level index page and crawl every linked province page.

    Only the link target of each entry is used here; nothing is inserted
    into the database at this level.

    :param baseUrl: URL prefix the page's relative links are appended to.
    :param lastUrl: page path relative to *baseUrl*.
    """
    html = openUrl(baseUrl+lastUrl,1)
    if not html:
        return
    # The patterns use plain apostrophes; the typographic quotes that had
    # crept into them can never match the markup's attribute quoting.
    for tr in re.findall(r"<tr class='provincetr'>.+?</tr>",html):
        # Each match is (href, link text up to the <br/>).
        for td in re.findall(r"<a href='(.+?html)'>(.+?)<br/>",tr):
            parseCode2(baseUrl,td[0])
        

def parseCode2(baseUrl,lastUrl):
    """Fetch one province page, store its ``citytr`` rows and crawl each link.

    :param baseUrl: URL prefix the page's relative links are appended to.
    :param lastUrl: page path relative to *baseUrl*.
    """
    html = openUrl(baseUrl + lastUrl,2)
    if not html:
        return
    # Plain apostrophes are required for the patterns to match the markup.
    for tr in re.findall(r"<tr class='citytr'>.+?</tr>",html):
        # Each match is (href, text of first cell link, text of second cell link);
        # the two texts are stored as area_code and area_name respectively.
        for td in re.findall(r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",tr):
            insertVillage(td[1],td[2])
            parseCode3(baseUrl,td[0])
        

def parseCode3(baseUrl,lastUrl):
    """Fetch one city page, store its ``countytr`` rows and crawl each link.

    :param baseUrl: URL prefix that *lastUrl* is relative to.
    :param lastUrl: page path of the form ``<dir>/<page>.html``.
    """
    # Move the leading directory of lastUrl into baseUrl so that the links
    # found on this page (presumably relative to that directory -- confirm)
    # can be appended to baseUrl by the next level.
    parts = lastUrl.split("/")
    baseUrl = baseUrl + parts[0] + "/"
    lastUrl = "/".join(parts[1:])
    html = openUrl(baseUrl + lastUrl,3)
    if not html:
        return
    for tr in re.findall(r"<tr class='countytr'>.+?</tr>",html):
        # NOTE(review): the inner loop was lost from the published listing and
        # is reconstructed from the identical row handling in parseCode2.
        # Each match is (href, text of first cell link, text of second cell link).
        for td in re.findall(r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",tr):
            insertVillage(td[1],td[2])
            parseCode4(baseUrl,td[0])
        

def parseCode4(baseUrl,lastUrl):
    """Fetch one county page, store its ``towntr`` rows and crawl each link.

    :param baseUrl: URL prefix that *lastUrl* is relative to.
    :param lastUrl: page path of the form ``<dir>/<page>.html``.
    """
    # NOTE(review): the signature, URL handling and inner loop were fused or
    # lost in the published listing; they are reconstructed here from the
    # pattern used by parseCode3 -- confirm against the original script.
    parts = lastUrl.split("/")
    baseUrl = baseUrl + parts[0] + "/"
    lastUrl = "/".join(parts[1:])
    html = openUrl(baseUrl + lastUrl,4)
    if not html:
        return
    for tr in re.findall(r"<tr class='towntr'>.+?</tr>",html):
        # Each match is (href, text of first cell link, text of second cell link).
        for td in re.findall(r"<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",tr):
            insertVillage(td[1],td[2])
            parseCode5(baseUrl,td[0])
        

def parseCode5(baseUrl,lastUrl):
    """Fetch one town page and store every ``villagetr`` row it contains.

    This is the last level: rows are inserted and no further links followed.

    :param baseUrl: URL prefix that *lastUrl* is relative to.
    :param lastUrl: page path of the form ``<dir>/<page>.html``.
    """
    # NOTE(review): the signature and URL handling were fused in the published
    # listing; they are reconstructed from the pattern used by parseCode3 --
    # confirm against the original script.
    parts = lastUrl.split("/")
    baseUrl = baseUrl + parts[0] + "/"
    lastUrl = "/".join(parts[1:])
    html = openUrl(baseUrl + lastUrl,5)
    if not html:
        return
    for tr in re.findall(r"<tr class='villagetr'>.+?</tr>",html):
        # Three plain cells per row; the first and third are stored as
        # area_code and area_name, the middle one as city_village_code.
        for td in re.findall(r"<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>",tr):
            insertVillage(td[0],td[2],td[1])
        

if __name__=="__main__":
    # Entry point: start the crawl at index.html under the 2017 directory.
    # Both values must be quoted string literals for the script to parse.
    baseUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
    parseCode1(baseUrl,"index.html")

猜你在找的Oracle相关文章