爬取国家统计局区划代码(多进程版)
程序员文章站
2022-03-02 21:03:55
...
现在奉上多进程版本
import os
import time
from multiprocessing.pool import Pool
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Request headers passed to requests.get by the fetch functions in this file;
# the only entry is a desktop-Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/83.0.4103.116 Safari/537.36'
    ),
}
def getOneLevelCodeName(originUrl='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'):
    """Fetch the index page and return one row per first-level (province) link.

    Args:
        originUrl: URL of the index page to download.

    Returns:
        A list of ``(code, name, nextUrl)`` tuples, one for every ``<a>`` found
        inside an element with class ``provincetr``:

        * ``code``    - the first two characters of the link's ``href``
          followed by ten zeros,
        * ``name``    - ``'中国-'`` followed by the link text,
        * ``nextUrl`` - the ``href`` resolved against ``originUrl``.
    """
    web = requests.get(originUrl, headers=headers)
    # Decode the body with the encoding requests detects from its content.
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    # urljoin resolves the relative href against the page URL, so this works
    # for index URLs of any length (a fixed-width slice of originUrl only
    # works when the directory prefix is exactly 54 characters long).
    return [
        (province['href'][0:2] + '0000000000', '中国-' + province.text, urljoin(originUrl, province['href']))
        for table in soup.select('.provincetr')
        for province in table.select('a')
    ]
def getSecondLevelCodeName(proLevelName=None, url='None'):
    """Fetch a first-level page and return one row per second-level (city) entry.

    Args:
        proLevelName: Name path of the parent entry; it is prefixed to every
            returned name.
        url: URL of the parent's page. The string ``'None'`` means "no page".

    Returns:
        The string ``'None'`` when ``proLevelName`` is None or ``url`` is
        ``'None'`` (no request is made). Otherwise a list of
        ``(code, name, nextUrl)`` tuples, one per element with class
        ``citytr``: ``code`` is the first 12 characters of the row text,
        ``name`` is ``proLevelName + '-'`` plus the rest of the row text, and
        ``nextUrl`` is the row's first link resolved against ``url``.

    Raises:
        IndexError: if a ``citytr`` row contains no ``<a>`` element.
    """
    if proLevelName is None or url == 'None':
        return 'None'

    web = requests.get(url, headers=headers)
    # Decode the body with the encoding requests detects from its content.
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.citytr'):
        rowText = tag.text
        code, name = rowText[0:12], rowText[12:]
        if name == '市辖区':
            # The generic "市辖区" label is disambiguated by prefixing the
            # last segment of the parent's name path.
            name = proLevelName.split('-')[-1] + name
        # urljoin resolves the relative href against the page URL instead of
        # relying on the page URL having a fixed-length directory prefix.
        nextUrl = urljoin(url, tag.select('a')[0]['href'])
        retList.append((code, proLevelName + '-' + name, nextUrl))
    return retList
def getSecondLevelCodeNamePackage(proLevelNameAndNextUrl):
    """Single-argument adapter for getSecondLevelCodeName.

    Takes one row (an indexable whose element 1 is the name path and whose
    element 2 is the page URL) and forwards those two elements, so the lookup
    can be driven by ``pool.map``, which passes each item as one argument.
    """
    parentName = proLevelNameAndNextUrl[1]
    pageUrl = proLevelNameAndNextUrl[2]
    return getSecondLevelCodeName(parentName, pageUrl)
def getThirdLevelCodeName(proLevelName=None, url='None'):
    """Fetch a second-level page and return one row per third-level (county) entry.

    Args:
        proLevelName: Name path of the parent entry; it is prefixed to every
            returned name.
        url: URL of the parent's page. The string ``'None'`` means "no page".

    Returns:
        The string ``'None'`` when ``proLevelName`` is None or ``url`` is
        ``'None'`` (no request is made). Otherwise a list of
        ``(code, name, nextUrl)`` tuples, one per element with class
        ``countytr``. ``code`` is the first 12 characters of the row text.
        For a row that has a link, ``name`` is ``proLevelName + '-'`` plus the
        rest of the row text and ``nextUrl`` is the link resolved against
        ``url``. For a row without a usable link, the last segment of
        ``proLevelName`` is additionally inserted before the row's own name
        and ``nextUrl`` is the string ``'None'``.
    """
    if proLevelName is None or url == 'None':
        return 'None'

    web = requests.get(url, headers=headers)
    # Decode the body with the encoding requests detects from its content.
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.countytr'):
        rowText = tag.text
        code, name = rowText[0:12], rowText[12:]
        try:
            # Only the link lookup is expected to fail: IndexError when the
            # row has no <a>, KeyError when the <a> has no href. Anything
            # else (including KeyboardInterrupt) must propagate.
            href = tag.select('a')[0]['href']
        except (IndexError, KeyError):
            retList.append((code, proLevelName + '-' + proLevelName.split('-')[-1] + name, 'None'))
        else:
            # urljoin resolves the relative href against the page URL instead
            # of relying on the page URL having a fixed-length prefix.
            retList.append((code, proLevelName + '-' + name, urljoin(url, href)))
    return retList
def getThirdLevelCodeNamePackage(proLevelNameAndNextUrl):
    """Single-argument adapter for getThirdLevelCodeName.

    Takes one row (an indexable whose element 1 is the name path and whose
    element 2 is the page URL) and forwards those two elements, so the lookup
    can be driven by ``pool.map``, which passes each item as one argument.
    """
    parentName = proLevelNameAndNextUrl[1]
    pageUrl = proLevelNameAndNextUrl[2]
    return getThirdLevelCodeName(parentName, pageUrl)
def getFourthLevelCodeName(proLevelName=None, url='None'):
    """Fetch a third-level page and return one row per fourth-level (town) entry.

    Args:
        proLevelName: Name path of the parent entry; it is prefixed to every
            returned name.
        url: URL of the parent's page. The string ``'None'`` means "no page".

    Returns:
        ``[('None', 'None', 'None')]`` (a single placeholder row) when
        ``proLevelName`` is None or ``url`` is ``'None'``; no request is made.
        Otherwise a list of ``(code, name, nextUrl)`` tuples, one per element
        with class ``towntr``: ``code`` is the first 12 characters of the row
        text, ``name`` is ``proLevelName + '-'`` plus the rest of the row
        text, and ``nextUrl`` is the row's first link resolved against ``url``.

    Raises:
        IndexError: if a ``towntr`` row contains no ``<a>`` element.
    """
    if proLevelName is None or url == 'None':
        return [('None', 'None', 'None')]

    web = requests.get(url, headers=headers)
    # Decode the body with the encoding requests detects from its content.
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.towntr'):
        rowText = tag.text
        # urljoin resolves the relative href against the page URL instead of
        # relying on the page URL having a fixed-length directory prefix.
        nextUrl = urljoin(url, tag.select('a')[0]['href'])
        retList.append((rowText[0:12], proLevelName + '-' + rowText[12:], nextUrl))
    return retList
def getFourthLevelCodeNamePackage(proLevelNameAndNextUrl):
    """Single-argument adapter for getFourthLevelCodeName.

    Takes one row (an indexable whose element 1 is the name path and whose
    element 2 is the page URL) and forwards those two elements, so the lookup
    can be driven by ``pool.map``, which passes each item as one argument.
    """
    parentName = proLevelNameAndNextUrl[1]
    pageUrl = proLevelNameAndNextUrl[2]
    return getFourthLevelCodeName(parentName, pageUrl)
if __name__ == '__main__':
    # Crawl the four levels in order (each level's rows are the inputs for the
    # next level's page fetches), then write every row to one Excel sheet.

    # Choose the number of worker processes to suit the machine.
    # The with-block guarantees the workers are torn down even if a level
    # raises part-way; all map() results are fully collected before it exits.
    with Pool(processes=8) as pool:
        # First level: fetched directly in this process.
        oneLevel = getOneLevelCodeName('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html')

        # Second level: one page per first-level row; flatten the per-page lists.
        time.sleep(2)
        secondLevel = [row for rows in pool.map(getSecondLevelCodeNamePackage, oneLevel) for row in rows]

        # Third level.
        time.sleep(2)
        thirdLevel = [row for rows in pool.map(getThirdLevelCodeNamePackage, secondLevel) for row in rows]

        # Fourth level.
        time.sleep(2)
        fourthLevel = [row for rows in pool.map(getFourthLevelCodeNamePackage, thirdLevel) for row in rows]

    pd_oneLevel = pd.DataFrame(oneLevel)
    pd_secondLevel = pd.DataFrame(secondLevel)
    pd_thirdLevel = pd.DataFrame(thirdLevel)
    pd_fourthLevel = pd.DataFrame(fourthLevel)
    pd_allLevel = pd.concat([pd_oneLevel, pd_secondLevel, pd_thirdLevel, pd_fourthLevel])
    pd_allLevel.columns = ['区划代码', '名称', '下一级网址']

    # Save into the current working directory. This is the same location the
    # earlier realpath('__file__')[0:-8] expression produced, stated directly
    # instead of by slicing a literal file name off a resolved path.
    pd_allLevel.to_excel(os.path.join(os.getcwd(), '区划代码及名称.xlsx'), index=False)
区划代码下载传送门(0积分):https://download.csdn.net/download/Dongfnag_HU/12852876
上一篇: 爬虫2
下一篇: Java爬统计局12位区划代码