国家统计局区划码爬取

目标数据

oracle存储表格

-- Create table
-- Holds one row per scraped administrative division: its statistical code,
-- its urban/rural classification code, and its name.
-- NOTE(review): the id column is commented as an auto-increment ID, but no
-- identity, sequence or trigger is defined in this script -- confirm one
-- exists elsewhere, otherwise inserts that omit id leave it NULL.
-- NOTE(review): SYSTEM is normally reserved for Oracle's data dictionary;
-- confirm that placing an application table there is intended.
CREATE TABLE village_code
(
  id                INTEGER,
  area_code         VARCHAR2(500),
  city_village_code VARCHAR2(500),
  area_name         VARCHAR2(500)
)
TABLESPACE system
  PCTFREE 10
  PCTUSED 40
  INITRANS 1
  MAXTRANS 255
  STORAGE
  (
    INITIAL 64K
    NEXT 1M
    MINEXTENTS 1
    MAXEXTENTS UNLIMITED
  );
-- Add comments to the columns
-- The comment text must be an ordinary single-quoted string literal; the
-- typographic quotes previously used here are not valid SQL delimiters.
-- id: auto-increment ID
comment on column VILLAGE_CODE.id
  is '自增ID';
-- area_code: division code used for statistics
comment on column VILLAGE_CODE.area_code
  is '统计用区划代码';
-- city_village_code: urban/rural classification code
comment on column VILLAGE_CODE.city_village_code
  is '城乡分类代码    ';
-- area_name: name of the division
comment on column VILLAGE_CODE.area_name
  is '名称';

爬取代码

#!/usr/bin/env python
# encoding: utf-8
'''
@author: lurenjia
@contact: [email protected]
@file: areacode.py
@time: 2018/9/29 14:40
@desc:
'''

import urllib2,re
from time import sleep
from random import random
from config import DBSession


# HTTP request headers sent with every page fetch.
# The value must be the bare user-agent string: the header name is already
# supplied by the dict key, so repeating "User-Agent: " inside the value
# would send a malformed header.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
# Module-level database session built from config.DBSession; insertVillage
# executes its INSERT and commits through this object.
session = DBSession()


def insertVillage(code,name,city_village_code='-1'):
    """Insert one division row into village_code and commit it.

    code: division code, stored in area_code.
    name: division name, stored in area_name.
    city_village_code: urban/rural classification code; defaults to '-1'
        for callers that have no such code (e.g. parseCode2).
    """
    print("%s %s" % (code, city_village_code))
    # Values are passed as bind parameters rather than %-formatted into the
    # statement: the old format string had three placeholders but only two
    # arguments (name was missing), and scraped text containing a quote would
    # have broken or altered the SQL.
    # NOTE(review): assumes `session` is a SQLAlchemy Session, which accepts
    # ":name" placeholders with a dict of values -- confirm against
    # config.DBSession.
    session.execute(
        "insert into village_code(area_code,area_name,city_village_code) "
        "VALUES (:area_code,:area_name,:city_village_code)",
        {
            "area_code": code,
            "area_name": name,
            "city_village_code": city_village_code,
        },
    )
    session.commit()


def openUrl(url,type):
    """Fetch `url` and return the response body decoded from GBK.

    url: absolute URL of the page to download.
    type: page-level marker supplied by the caller (1-5 in this file); it is
        only written to the error log next to a failing URL.

    Returns the decoded page text, or None when the request or the decoding
    fails. Failures are appended to error.txt so they can be retried later.
    """
    try:
        # Random pause of up to half a second before each request.
        sleep(random()*0.5)
        request = urllib2.Request(url,headers=headers)
        return urllib2.urlopen(request,timeout=10).read().decode('gbk')
    except Exception:
        # Catch Exception rather than everything, and do not return from a
        # finally block: either would swallow KeyboardInterrupt/SystemExit
        # and make a long crawl impossible to stop with Ctrl-C.
        with open('error.txt','a+') as f:
            f.write(url+'                   '+str(type)+'\n')
        return None
    
    
def parseCode1(baseUrl,lastUrl):
    """Walk the top-level index page and descend into every linked page.

    baseUrl: directory URL that the page's relative links are resolved against.
    lastUrl: file name of the index page, relative to baseUrl.

    Nothing is inserted at this level; each link found in a 'provincetr' row
    is handed to parseCode2.
    """
    html = openUrl(baseUrl+lastUrl,1)
    if html:
        # The patterns must use plain ASCII quotes to match the attribute
        # quoting in the page markup; typographic quotes never match.
        for tr in re.findall("<tr class='provincetr'>.+?</tr>",html):
            for td in re.findall("<a href='(.+?html)'>(.+?)<br/>",tr):
                parseCode2(baseUrl,td[0])
        

def parseCode2(baseUrl,lastUrl):
    """Parse a page of 'citytr' rows, store each row, then descend.

    baseUrl: directory URL that lastUrl is relative to.
    lastUrl: relative link to the page, as captured by parseCode1.

    Each matched row yields (href, code, name); the code and name are
    inserted and the href is passed on to parseCode3.
    """
    html = openUrl(baseUrl + lastUrl,2)
    if html:
        # Plain ASCII quotes are required for the patterns to match the
        # attribute quoting in the page markup.
        for tr in re.findall("<tr class='citytr'>.+?</tr>",html):
            for td in re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",tr):
                insertVillage(td[1],td[2])
                parseCode3(baseUrl,td[0])
        

def parseCode3(baseUrl,lastUrl):
    """Parse a page of 'countytr' rows, store each row, then descend.

    baseUrl: directory URL that lastUrl is relative to.
    lastUrl: relative link of the form '<dir>/<file>.html'.

    NOTE(review): the published listing of this function was truncated (the
    inner loop and the insert were missing, leaving invalid syntax). The row
    handling below is reconstructed from the identical pattern in parseCode2
    -- confirm against the real page markup.
    """
    # Links on the fetched page are relative to its own directory, so move
    # the leading path component of lastUrl into baseUrl before fetching.
    baseUrl = baseUrl + lastUrl.split('/')[0] + '/'
    lastUrl = '/'.join(lastUrl.split('/')[1:])
    html = openUrl(baseUrl + lastUrl,3)
    if html:
        for tr in re.findall("<tr class='countytr'>.+?</tr>",html):
            for td in re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",tr):
                insertVillage(td[1],td[2])
                parseCode4(baseUrl,td[0])
        

def parseCode4(baseUrl,lastUrl):
    """Parse a page of 'towntr' rows, store each row, then descend.

    baseUrl: directory URL that lastUrl is relative to.
    lastUrl: relative link of the form '<dir>/<file>.html'.

    NOTE(review): the published listing of this function was truncated (the
    signature read `def parseCode4(baseUrl,4)` and `html` was never assigned).
    The signature, URL handling and row handling are reconstructed from
    parseCode2/parseCode3 -- confirm against the real page markup.
    """
    # Links on the fetched page are relative to its own directory, so move
    # the leading path component of lastUrl into baseUrl before fetching.
    baseUrl = baseUrl + lastUrl.split('/')[0] + '/'
    lastUrl = '/'.join(lastUrl.split('/')[1:])
    html = openUrl(baseUrl + lastUrl,4)
    if html:
        for tr in re.findall("<tr class='towntr'>.+?</tr>",html):
            for td in re.findall("<a href='(.+?html)'>(.+?)</a></td><td><a href='.+?'>(.+?)</a>",tr):
                insertVillage(td[1],td[2])
                parseCode5(baseUrl,td[0])
        

def parseCode5(baseUrl,lastUrl):
    """Parse a page of 'villagetr' rows and store each row.

    baseUrl: directory URL that lastUrl is relative to.
    lastUrl: relative link to the page.

    Each row has three plain cells; the first is inserted as the code, the
    third as the name and the second as city_village_code. This is the last
    level, so no further pages are followed.

    NOTE(review): the published listing was truncated (the signature read
    `def parseCode5(baseUrl,5)` and `html` was never assigned); the signature
    and the fetch line are reconstructed from the sibling functions.
    """
    html = openUrl(baseUrl + lastUrl,5)
    if html:
        # Plain ASCII quotes are required for the pattern to match the
        # attribute quoting in the page markup.
        for tr in re.findall("<tr class='villagetr'>.+?</tr>",html):
            for td in re.findall("<td>(.+?)</td><td>(.+?)</td><td>(.+?)</td>",tr):
                insertVillage(td[0],td[2],td[1])
        

# Script entry point: start the crawl from the 2017 index page.
# The string literals use plain ASCII quotes; the typographic quotes in the
# published listing are a syntax error.
if __name__=="__main__":
    baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    parseCode1(baseUrl,'index.html')

国家统计局区划码爬取

目标数据

oracle存储表格

爬取代码

猜你在找的Oracle相关文章