-
Notifications
You must be signed in to change notification settings - Fork 59
/
Copy pathgetArea.py
113 lines (95 loc) · 4.7 KB
/
getArea.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# -*- coding: UTF-8 -*-
import json
import os
import ssl
import time
# import requests
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
# config #
year = '2020' # 年份,目前国家统计局官网有2009-2019年的数据
level = 5 # 可选:3|5 获取的层级 3层为省-市-区 最多5级省-市-区-县(街道)-乡镇(居委会)
digit = 12 # 可选:6|12 行政区划代码位数 层级为3级时通常使用6位代码 如110000,层级为5级时使用12位代码 如 110000000000
head_url = "42" # 可选:index|各省行政区划前两位 要从哪开始获取 index为全国所有省份 要获取单独的省份修改为省行政区划的前两位
# config #
context = ssl._create_unverified_context()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}
mainUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/'
attrsClassArr = ['provincetr', 'citytr', 'countytr', 'towntr', 'villagetr']
def is_CN(word):
for ch in word:
if '\u4e00' <= ch <= '\u9fff':
return True
return False
def retry(times):
def wrapper(func):
def inner_wrapper(*args, **kwargs):
i = 0
while i < times:
try:
if i > 0:
print('正在尝试第%d次请求' % (i+1))
return func(*args, **kwargs)
except:
# 此处打印日志 func.__name__ 为say函数
print('%s 请求失败!' % (args[0]))
# print("logdebug: {}()".format(func.__name__))
i += 1
return inner_wrapper
return wrapper
@retry(5)
def fetchAreaData(url, index, attrsClass, area):
if index < level:
try:
with urllib.request.urlopen(url, context=context, timeout=3) as response:
soup = BeautifulSoup(
response.read(), 'html.parser', fromEncoding="gb18030")
tag = soup.find_all(attrs={"class": attrsClass})
for tg in tag:
villageArr = []
for tgc in tg.children:
if tgc.a:
if is_CN(tgc.a.get_text()):
print('正在请求', '/'.join(url.split('/')
[0:-1]) + '/' + tgc.a.attrs['href'], tgc.a.get_text())
if index + 1 == level:
area.append(
{"value": tgc.a.attrs['href'].split('/')[-1].replace('.html', '').ljust(digit, '0'), "label": tgc.a.get_text()})
else:
content = {"value": tgc.a.attrs['href'].split(
'/')[-1].replace('.html', '').ljust(digit, '0'), "label": tgc.a.get_text(), "children": []}
area.append(content)
fetchAreaData(
'/'.join(url.split('/')[0:-1]) + '/' + tgc.a.attrs['href'], index+1, attrsClassArr[index+1], area[len(area)-1]['children'])
elif attrsClass == 'villagetr':
villageArr.append(tgc.get_text())
if len(villageArr) > 0:
area.append(
{"value": villageArr[0], "label": villageArr[2]})
except urllib.request.URLError as e:
print("出现异常: "+str(e))
position = 0
areas = []
if head_url != 'index':
position = 1
fetchAreaData(mainUrl+year+'/'+head_url+'.html',
position, attrsClassArr[position], areas)
with open(head_url+'_'+year+'_'+"_level_" + str(level)+'.json', "w", encoding='utf-8') as json_file:
json.dump(areas, json_file, ensure_ascii=False)
print("写入%s完成" % (head_url+'_'+year+'_'+"_level_" + str(level)+'.json'))
# code = ['11', '12', '13', '14', '15', '21', '22','23', '31', '32', '33', '34', '35', '36', '37',
# '41', '42', '43', '44', '45', '46', '50', '51', '52', '53', '54', '61', '62', '63', '64', '65']
# for index in range(24):
# areas = []
# head_url = code[index]
# position = 0
# if head_url != 'index':
# position = 1
# fetchAreaData(mainUrl+year+'/'+head_url+'.html',
# position, attrsClassArr[position], areas)
# with open('province/level_5/'+head_url+'_'+year+'_'+"_level_" + str(level)+'.json', "w", encoding='utf-8') as json_file:
# json.dump(areas, json_file, ensure_ascii=False)
# print("写入%s完成" % (head_url+'_'+year+'_'+"_level_" + str(level)+'.json'))