欢迎您访问程序员文章站!本站旨在为大家提供和分享程序员计算机编程知识!
您现在的位置是: 首页

爬取国家统计局区划代码(多进程版)

程序员文章站 2022-03-02 21:03:55
...

现在奉上多进程版本

 

import os
import time
from multiprocessing.pool import Pool
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent string; kept as a named constant so the header
# mapping below stays on a single line.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/83.0.4103.116 Safari/537.36'
)

# HTTP request headers passed to every requests.get call in this script.
headers = {'User-Agent': _USER_AGENT}


# Fetch the first-level (province) codes, names and next-level links
def getOneLevelCodeName(originUrl='http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html',
                        timeout=30):
    """Scrape the first-level rows from the index page.

    Args:
        originUrl: URL of the index page whose ``.provincetr`` rows are read.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the script forever.

    Returns:
        A list of ``(code, name, next_url)`` tuples, one per link found in the
        ``.provincetr`` rows. ``code`` is the first two characters of the
        link's ``href`` followed by ten zeros, ``name`` is the link text
        prefixed with ``'中国-'``, and ``next_url`` is the ``href`` resolved
        against ``originUrl``.
    """
    web = requests.get(originUrl, headers=headers, timeout=timeout)  # download the page
    web.encoding = web.apparent_encoding  # decode with the detected encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    oneLevelWebUrl = []
    for table in soup.select('.provincetr'):
        for province in table.select('a'):
            href = province['href']
            # urljoin resolves the relative href against the page URL, so the
            # result no longer depends on the page URL having a fixed length.
            oneLevelWebUrl.append(
                (href[0:2] + '0000000000', '中国-' + province.text, urljoin(originUrl, href)))

    return oneLevelWebUrl


# Fetch the second-level (city) codes, names and next-level links
def getSecondLevelCodeName(proLevelName=None, url='None', timeout=30):
    """Scrape the ``.citytr`` rows of one first-level page.

    Args:
        proLevelName: Dash-separated name of the parent level; it is prefixed
            to every name returned.
        url: URL of the page to scrape.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The string ``'None'`` when ``proLevelName`` is ``None`` or ``url`` is
        the string ``'None'``. Otherwise a list of ``(code, name, next_url)``
        tuples, where ``code`` is the first 12 characters of the row text,
        ``name`` is ``proLevelName`` joined with the rest of the row text, and
        ``next_url`` is the row's first link resolved against ``url``.
    """
    if proLevelName is None or url == 'None':
        return 'None'

    web = requests.get(url, headers=headers, timeout=timeout)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.citytr'):
        text = tag.text
        code, name = text[0:12], text[12:]
        if name == '市辖区':
            # A bare "市辖区" row is prefixed with the last component of the
            # parent name so the entry is distinguishable.
            name = proLevelName.split('-')[-1] + name
        # urljoin replaces the fixed-length slice of ``url`` used previously.
        retList.append((code, proLevelName + '-' + name, urljoin(url, tag.select('a')[0]['href'])))

    return retList


# Second-level fetch, wrapped to take a single argument
def getSecondLevelCodeNamePackage(proLevelNameAndNextUrl):
    """Call ``getSecondLevelCodeName`` with items 1 and 2 of the given record.

    Takes one ``(code, name, next_url)`` sequence so it can be used as the
    single-argument callable of ``pool.map``.
    """
    parentName = proLevelNameAndNextUrl[1]
    nextUrl = proLevelNameAndNextUrl[2]
    return getSecondLevelCodeName(parentName, nextUrl)


# Fetch the third-level (county) codes, names and next-level links
def getThirdLevelCodeName(proLevelName=None, url='None', timeout=30):
    """Scrape the ``.countytr`` rows of one second-level page.

    Args:
        proLevelName: Dash-separated name of the parent level; it is prefixed
            to every name returned.
        url: URL of the page to scrape.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The string ``'None'`` when ``proLevelName`` is ``None`` or ``url`` is
        the string ``'None'``. Otherwise a list of ``(code, name, next_url)``
        tuples. Rows without a usable link get the string ``'None'`` as
        ``next_url`` and have the last component of ``proLevelName`` prepended
        to their own name.
    """
    if proLevelName is None or url == 'None':
        return 'None'

    web = requests.get(url, headers=headers, timeout=timeout)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.countytr'):
        text = tag.text
        code, name = text[0:12], text[12:]
        try:
            # Only the link lookup can legitimately fail: IndexError when the
            # row has no <a>, KeyError when the <a> has no href.
            href = tag.select('a')[0]['href']
        except (IndexError, KeyError):
            retList.append((code, proLevelName + '-' + proLevelName.split('-')[-1] + name, 'None'))
        else:
            # urljoin replaces the fixed-length slice of ``url`` used previously.
            retList.append((code, proLevelName + '-' + name, urljoin(url, href)))

    return retList


# Third-level fetch, wrapped to take a single argument
def getThirdLevelCodeNamePackage(proLevelNameAndNextUrl):
    """Call ``getThirdLevelCodeName`` with items 1 and 2 of the given record.

    Takes one ``(code, name, next_url)`` sequence so it can be used as the
    single-argument callable of ``pool.map``.
    """
    parentName = proLevelNameAndNextUrl[1]
    nextUrl = proLevelNameAndNextUrl[2]
    return getThirdLevelCodeName(parentName, nextUrl)


# Fetch the fourth-level (town) codes, names and next-level links
def getFourthLevelCodeName(proLevelName=None, url='None', timeout=30):
    """Scrape the ``.towntr`` rows of one third-level page.

    Args:
        proLevelName: Dash-separated name of the parent level; it is prefixed
            to every name returned.
        url: URL of the page to scrape.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``[('None', 'None', 'None')]`` when ``proLevelName`` is ``None`` or
        ``url`` is the string ``'None'``. Otherwise a list of
        ``(code, name, next_url)`` tuples, where ``code`` is the first 12
        characters of the row text, ``name`` is ``proLevelName`` joined with
        the rest of the row text, and ``next_url`` is the row's first link
        resolved against ``url``.
    """
    if proLevelName is None or url == 'None':
        return [('None', 'None', 'None')]

    web = requests.get(url, headers=headers, timeout=timeout)
    web.encoding = web.apparent_encoding
    soup = BeautifulSoup(web.text, 'html.parser')

    retList = []
    for tag in soup.select('.towntr'):
        text = tag.text
        # urljoin replaces the fixed-length slice of ``url`` used previously.
        retList.append(
            (text[0:12], proLevelName + '-' + text[12:], urljoin(url, tag.select('a')[0]['href'])))

    return retList


# Fourth-level fetch, wrapped to take a single argument
def getFourthLevelCodeNamePackage(proLevelNameAndNextUrl):
    """Call ``getFourthLevelCodeName`` with items 1 and 2 of the given record.

    Takes one ``(code, name, next_url)`` sequence so it can be used as the
    single-argument callable of ``pool.map``.
    """
    parentName = proLevelNameAndNextUrl[1]
    nextUrl = proLevelNameAndNextUrl[2]
    return getFourthLevelCodeName(parentName, nextUrl)


if __name__ == '__main__':
    # Number of worker processes; tune to the machine running the script.
    # The context manager releases the pool even if one of the stages raises.
    with Pool(processes=8) as pool:
        # First level
        oneLevel = getOneLevelCodeName('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html')

        # Second level: one task per first-level record, results flattened
        time.sleep(2)
        secondLevel = [row for rows in pool.map(getSecondLevelCodeNamePackage, oneLevel) for row in rows]

        # Third level
        time.sleep(2)
        thirdLevel = [row for rows in pool.map(getThirdLevelCodeNamePackage, secondLevel) for row in rows]

        # Fourth level
        time.sleep(2)
        fourthLevel = [row for rows in pool.map(getFourthLevelCodeNamePackage, thirdLevel) for row in rows]

    # Stack all four levels into one table with named columns.
    pd_allLevel = pd.concat(
        [pd.DataFrame(level) for level in (oneLevel, secondLevel, thirdLevel, fourthLevel)])
    pd_allLevel.columns = ['区划代码', '名称', '下一级网址']

    # Save into the current working directory. The previous
    # realpath('__file__')[0:-8] expression used the quoted literal
    # '__file__', so it also resolved to the working directory, only by
    # stripping that 8-character dummy file name.
    pd_allLevel.to_excel(os.path.join(os.getcwd(), '区划代码及名称.xlsx'), index=False)

区划代码下载传送(0积分):https://download.csdn.net/download/Dongfnag_HU/12852876