Scraping Province/City/District Data from the National Bureau of Statistics with Python
Notes:
- The data cannot be scraped in full in a single run, because frequent requests cause the site to time out.
- To fetch everything, it is recommended to configure several proxies and rotate through them (see the sketch below).
- If you only scrape down to the district level, no proxy is needed.
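A minimal sketch of the proxy rotation suggested above, assuming the same Python 2 / urllib2 environment as the script below; the proxy addresses are placeholders and fetch_with_proxies is a hypothetical helper, not part of the original post:

# -*- coding: utf-8 -*-
import itertools
import urllib2

# Placeholder proxies -- replace with your own working HTTP proxies.
PROXIES = ["114.244.112.220:8118", "127.0.0.1:8080"]
proxy_pool = itertools.cycle(PROXIES)

def fetch_with_proxies(url, retries=3):
    """Try the URL through successive proxies until one succeeds."""
    for _ in range(retries):
        proxy = next(proxy_pool)
        opener = urllib2.build_opener(urllib2.ProxyHandler({"http": proxy}))
        try:
            return opener.open(url, timeout=10).read().decode('GBK')
        except Exception:
            pass  # this proxy failed or timed out; rotate to the next
    raise IOError("all proxies failed for %s" % url)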
# -*- coding: utf-8 -*-
import sys
import urllib2
from bs4 import BeautifulSoup as bs

# Force the default encoding to GBK so the unicode strings scraped from
# the site can be written to the output files without explicit encoding.
reload(sys)
sys.setdefaultencoding('GBK')
def get_url_content(url):
    """Fetch a page and decode it from GBK, the encoding the site uses."""
    i_headers = {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1) Gecko/20090624 Firefox/3.5",
                 "Referer": 'http://www.baidu.com'}
    req = urllib2.Request(url, headers=i_headers)
    # proxies = {"http": "114.244.112.220:8118"}  # the proxy you want to use
    # proxy_s = urllib2.ProxyHandler(proxies)
    # opener = urllib2.build_opener(proxy_s)
    # urllib2.install_opener(opener)
    return urllib2.urlopen(req, timeout=10).read().decode('GBK')
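# Optional sketch (not in the original post): a retry wrapper around
# get_url_content. Frequent requests make the site time out, so retrying
# with a short pause can save an otherwise lost run.
import time

def get_url_content_retry(url, retries=3, pause=2):
    for attempt in range(retries):
        try:
            return get_url_content(url)
        except Exception:
            if attempt == retries - 1:
                raise          # out of retries; surface the error
            time.sleep(pause)  # back off briefly before trying again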
html_data = get_url_content('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')
# print(html_data)
soup = bs(html_data, 'html.parser')
# CSS classes the stats.gov.cn pages use for each administrative level
attrs = {"class": "provincetr"}     # province rows
city_attrs = {"class": "citytr"}    # city rows
area_attrs = {"class": "countytr"}  # district/county rows
town_attrs = {"class": "towntr"}    # town rows
baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
cityTxtName = "./city.txt"
areaTxtName = "./area.txt"
townTxtName = "./town.txt"
# Open the output files in write mode so any previous contents are discarded.
city_file = open(cityTxtName, 'w')
area_file = open(areaTxtName, 'w')
town_file = open(townTxtName, 'w')
data_tables = soup.find_all('tr', attrs)
for mytr in data_tables:
    mya = mytr.find_all('a')
    for sa in mya:
        print(sa['href'])      # e.g. 11.html
        print(sa.contents[0])  # province name
        add = sa['href'][:-5]  # strip '.html' to get the province code
        print(add)
        # Province page: lists the cities of this province
        city_data = get_url_content(baseUrl + sa['href'])
        citys = bs(city_data, 'html.parser').find_all('tr', city_attrs)
        for city in citys:
            city_a = city.find_all('a')
            city_file.write(city_a[0].contents[0] + "\t" + city_a[1].contents[0] + "\n")
            print(baseUrl + city_a[0]['href'])
            # City page: lists the districts/counties of this city
            area_data = get_url_content(baseUrl + city_a[0]['href'])
            areas = bs(area_data, 'html.parser').find_all('tr', area_attrs)
            for area in areas:
                area_a = area.find_all('a')
                print(area_a)
                if area_a:
                    area_file.write(area_a[0].contents[0] + "\t" + area_a[1].contents[0] + "\n")
                    print(baseUrl + add + '/' + area_a[0]['href'])
                    # District page: lists the towns of this district
                    town_data = get_url_content(baseUrl + add + '/' + area_a[0]['href'])
                    towns = bs(town_data, 'html.parser').find_all('tr', town_attrs)
                    for town in towns:
                        town_a = town.find_all('a')
                        town_file.write(town_a[0].contents[0] + "\t" + town_a[1].contents[0] + "\n")
                else:
                    # Some rows have no links (no further subdivision); the
                    # code and name sit in plain <td> cells instead.
                    area_td = area.find_all('td')
                    print(area_td)
                    area_file.write(area_td[0].contents[0] + "\t" + area_td[1].contents[0] + "\n")
print('Data scraping complete!')
city_file.close()
area_file.close()
town_file.close()
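Each output file is plain tab-separated text, one table row per line, with the two cells of each row joined by a tab. A minimal sketch of loading one back into a dict (load_codes is a hypothetical helper, not part of the original post, and assumes the GBK default encoding set up above):

def load_codes(path):
    """Read one of the tab-separated output files into a dict."""
    codes = {}
    with open(path) as f:
        for line in f:
            first, second = line.rstrip('\n').split('\t')
            codes[first] = second
    return codes

print(len(load_codes('./city.txt')))  # number of city rows scraped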
Reposted from: https://my.oschina.net/u/2984386/blog/1863100