
Python: Scraping Province/City/District Data from the National Bureau of Statistics


 Notes:

  1. The full dataset cannot currently be scraped in one pass, because frequent requests to the site will time out.
  2. It is recommended to configure several proxies and rotate through them while scraping (see the sketch after this list).
  3. If you only scrape down to the district level, no proxy is needed.
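
Note 2 recommends proxy rotation because the site starts timing out under frequent requests; the script below only ships with its proxy code commented out. Here is a minimal sketch of what rotation plus retry could look like with urllib2 (Python 2, as in the script). The fetch_with_rotation helper and the proxy addresses are placeholders of mine, not part of the original script:

# -*- coding: utf-8 -*-
import urllib2

# Placeholder proxy list -- replace with your own working HTTP proxies.
PROXIES = ["114.244.112.220:8118", "114.244.112.221:8118"]

def fetch_with_rotation(url, retries=3):
    # Try up to `retries` times, switching to the next proxy after each failure.
    for attempt in range(retries):
        proxy = PROXIES[attempt % len(PROXIES)]
        opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxy}))
        try:
            return opener.open(url, timeout=10).read().decode('GBK')
        except Exception:
            continue  # timeout or proxy error; rotate and retry
    raise IOError("all retries failed for " + url)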
# -*- coding: utf-8 -*-
import urllib2
import sys

from bs4 import BeautifulSoup as bs

# The pages are GBK-encoded; set the default encoding so decoded unicode
# strings can be written to the output files without explicit encoding.
reload(sys)
sys.setdefaultencoding('GBK')


def get_url_content(url):
    # Fetch a page with browser-like headers and decode it from GBK.
    i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
                 "Referer": 'http://www.baidu.com'}
    req = urllib2.Request(url, headers=i_headers)
    # proxies = {"http": "114.244.112.220:8118"}  # set the proxy you want to use
    # proxy_s = urllib2.ProxyHandler(proxies)
    # opener = urllib2.build_opener(proxy_s)
    # urllib2.install_opener(opener)
    return urllib2.urlopen(req, timeout=10).read().decode('GBK')
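
The script targets Python 2 (urllib2, reload(sys)). For reference, a sketch of the same helper under Python 3, where urllib2's functionality lives in urllib.request, might look like this (get_url_content_py3 is a hypothetical name of mine):

import urllib.request

def get_url_content_py3(url):
    # Same headers and timeout as the Python 2 helper above.
    headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
               "Referer": "http://www.baidu.com"}
    req = urllib.request.Request(url, headers=headers)
    return urllib.request.urlopen(req, timeout=10).read().decode('gbk')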


# Entry page: the 2017 administrative-division code index, which lists all provinces.
html_data = get_url_content('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')
# print(html_data)

sup = bs(html_data, 'html.parser')

# CSS classes that stats.gov.cn uses for table rows at each administrative level.
attrs = {"class": "provincetr"}
city_attrs = {"class": "citytr"}
area_attrs = {"class": "countytr"}
town_attrs = {"class": "towntr"}

baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
cityTxtName = "./city.txt"
areaTxtName = "./area.txt"
townTxtName = "./town.txt"

# Open the three output files, clearing anything left from a previous run
# ('w' mode truncates on open, which is more portable than 'a+' plus truncate()).
city_file = open(cityTxtName, 'w')
area_file = open(areaTxtName, 'w')
town_file = open(townTxtName, 'w')
# Level 1: provinces. Each <a> on the index page links to that province's city list.
data_tables = sup.find_all('tr', attrs)
for mytr in data_tables:
    mya = mytr.find_all('a')
    for sa in mya:
        print(sa['href'])       # e.g. "11.html"
        print(sa.contents[0])   # province name
        print(sa['href'][:-5])  # province code prefix, e.g. "11"
        add = sa['href'][:-5]
        # Level 2: cities within the province.
        city_data = get_url_content(baseUrl + sa['href'])
        citys = bs(city_data, 'html.parser').find_all('tr', city_attrs)
        for city in citys:
            city_a = city.find_all('a')
            # Column 0 holds the statistical code, column 1 the city name.
            city_file.write(city_a[0].contents[0] + "\t" + city_a[1].contents[0] + "\n")
            print(baseUrl + city_a[0]['href'])
            # Level 3: counties/districts within the city.
            area_data = get_url_content(baseUrl + city_a[0]['href'])
            areas = bs(area_data, 'html.parser').find_all('tr', area_attrs)
            for area in areas:
                area_a = area.find_all('a')
                print(area_a)
                if area_a:
                    area_file.write(area_a[0].contents[0] + "\t" + area_a[1].contents[0] + "\n")
                    print(baseUrl + add + '/' + area_a[0]['href'])
                    # Level 4: towns/streets. County hrefs are relative to the
                    # province directory, hence the `add` prefix.
                    town_data = get_url_content(baseUrl + add + '/' + area_a[0]['href'])
                    towns = bs(town_data, 'html.parser').find_all('tr', town_attrs)
                    for town in towns:
                        town_a = town.find_all('a')
                        if town_a:
                            town_file.write(town_a[0].contents[0] + "\t" + town_a[1].contents[0] + "\n")
                        else:
                            # Guard against rows without links, mirroring the
                            # county-level handling below.
                            town_td = town.find_all('td')
                            town_file.write(town_td[0].contents[0] + "\t" + town_td[1].contents[0] + "\n")
                else:
                    # Some county rows carry no links (e.g. municipal districts);
                    # read the code and name straight from the <td> cells.
                    area_td = area.find_all('td')
                    print(area_td)
                    area_file.write(area_td[0].contents[0] + "\t" + area_td[1].contents[0] + "\n")
print('Data scraping finished!')
city_file.close()
area_file.close()
town_file.close()
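
The three output files are plain tab-separated code/name pairs, one region per line. As a quick usage check, one of them can be read back into a dict like this (a sketch assuming the GBK-encoded output produced by the script above):

# Each line of city.txt is "statistical code<TAB>city name", GBK-encoded.
citys = {}
with open("./city.txt") as f:
    for line in f:
        code, name = line.rstrip("\n").decode('GBK').split("\t")
        citys[code] = name
print("loaded %d citys" % len(citys))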

 

Reposted from: https://my.oschina.net/u/2984386/blog/1863100