国家统计局2020年统计用区划代码全量采集
程序员文章站
2022-03-02 21:03:49
...
公司业务需要,把相关数据采集了一遍,在此记录。
采集链接:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html(可修改对应年限)
使用了scrapy框架,这里直接贴spider代码,存储代码因人而异就不贴了。
import scrapy
import re
from china_city_spider.items import ChinaCitySpiderItem
class ChinaCitySpider(scrapy.Spider):
    """Crawl the NBS (stats.gov.cn) statistical administrative-division code tables.

    Starts at the yearly index page (province list) and follows the
    drill-down links (province -> city -> county -> town -> village),
    yielding one ``ChinaCitySpiderItem`` per division encountered.
    """

    name = 'china_city'
    # Target statistical year. Change this to crawl another year's tables;
    # the site publishes one directory per year.
    year = '2020'
    base_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{year}/'

    def start_requests(self):
        """Kick off the crawl from the yearly index (province list) page."""
        # dont_filter: the index page must always be fetched, even if the
        # dupefilter has seen the URL before (e.g. on a resumed crawl).
        yield scrapy.Request(self.base_url, meta={'home_page': True}, dont_filter=True)

    def parse(self, response, **kwargs):
        """Dispatch to the home-page or drill-down parser.

        ``meta['level']`` tracks the division depth (1 = province) and
        ``meta['from_code']`` carries the parent division's code
        (0 for top-level provinces).
        """
        level = response.meta.get('level', 1)
        from_code = response.meta.get('from_code', 0)
        if response.meta.get('home_page'):
            yield from self._parse_home(response, level, from_code)
        else:
            yield from self._parse_sub_level(response, level, from_code)

    def _parse_home(self, response, level, from_code):
        # Home-page rules: each <td> holds one province link such as "11.html";
        # the numeric file stem is the province code.
        tr_list = response.xpath("//table[contains(@class,'table')]//tr[contains(@class,'tr')]")
        for tr in tr_list:
            for td in tr.xpath("./td"):
                code = td.re(r"(\d+)\.html")
                if not code:
                    # Not a division cell (no numeric .html link) — skip.
                    continue
                code = code[0]
                name = td.xpath(".//a/text()").get()
                item = self.create_item(code, from_code, name, level)
                yield ChinaCitySpiderItem(**item)
                uri = td.xpath(".//a/@href").get()
                if uri:
                    # Province links are relative to the yearly index directory.
                    yield scrapy.Request(
                        self.base_url + uri,
                        callback=self.parse,
                        meta={'level': level + 1, 'from_code': code},
                    )

    def _parse_sub_level(self, response, level, from_code):
        # Drill-down pages: each row's cells are [code, ..., name] — the first
        # text node is the division code and the last is the division name
        # (village rows carry an extra urban-rural classification cell between).
        tr_list = response.xpath("//table[contains(@class,'table')]//tr[contains(@class,'tr')]")
        for tr in tr_list:
            content = tr.xpath(".//td//text()").getall()
            if not content:
                self.logger.error(f'Can\'t get code and name: {response.url}')
                continue
            code = content[0]
            name = content[-1]
            item = self.create_item(code, from_code, name, level)
            yield ChinaCitySpiderItem(**item)
            uri = tr.xpath(".//a/@href").get()
            if uri:
                # hrefs here are relative to the current page's directory, so
                # resolve against everything up to the last '/' of response.url.
                url = re.search(r'.*/', response.url).group() + uri
                yield scrapy.Request(url, callback=self.parse, meta={'level': level + 1, 'from_code': code})

    def create_item(self, code, from_code, name, level):
        """Build the plain-dict payload for ``ChinaCitySpiderItem``.

        ``id`` keeps the code as the original string, ``code`` as an int;
        ``ctype`` is the division depth (1 = province).
        """
        item = {
            'id': code,
            'parent_id': from_code,
            'cname': name,
            'ctype': level,
            'code': int(code),
        }
        return item
全量约 67 万条数据。