[python爬虫]爬取天气网全国所有县市的天气数据
程序员文章站
2022-07-14 17:00:07
...
所要用到的库
import requests
from lxml import etree
import xlwt
访问URL
这里我们要用 XPath 来解析数据,所以先把响应文本用 lxml 的 etree.HTML 解析成 HTML 元素树,再把它返回。
没学过 XPath 的可以看看这篇博客,写得还是很详细的
xpath链接
https://blog.csdn.net/u013332124/article/details/80621638
def ask_url(url, timeout=10):
    """Download *url* and return it parsed as an lxml HTML element tree.

    :param url: address of the page to fetch.
    :param timeout: seconds handed to ``requests.get`` as its ``timeout``
        argument. requests applies no timeout unless one is given, so
        without it a stalled connection would block the crawl indefinitely.
    :return: the result of ``etree.HTML`` applied to the response text,
        ready for XPath queries.
    """
    # Send a desktop-browser User-Agent instead of the requests default.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
    }
    response = requests.get(url, headers=header, timeout=timeout)
    # Parse the body as HTML so that callers can query it with XPath.
    html = etree.HTML(response.text)
    return html
解析数据
所要爬取的数据有以下几项:城市名、天气现象、风向、风力、最高温、最低温。
先从入口页面取出每个省份链接的路径部分(URL 的尾部),拼接到站点根地址上,然后逐个访问各省份的页面。
def get_data(url):
    """Collect per-city weather columns from every province page linked by *url*.

    The page at *url* is fetched with ``ask_url`` and the ``href`` of every
    province link in its ``lqcontentBoxheader`` block is followed. From each
    province page six XPath queries are run and their results appended to six
    running lists.

    :param url: index page that carries the province links.
    :return: a list of six lists, in this order: city names, weather,
        wind direction, wind force, maximum temperature, minimum temperature.

    NOTE(review): each column is filled independently, so rows only line up
    across columns if every query yields the same number of items on every
    province page - this is not checked here.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = 'http://www.weather.com.cn'
    first_tab = '//div[@class="hanml"]/div[1]'
    table = first_tab + '//div[@class="conMidtab3"]'

    # (XPath query, whether the text needs its encoding repaired), listed in
    # the same order as the returned columns. The two temperature columns
    # are stored exactly as the parser returns them.
    columns = (
        (first_tab + '//td[contains(@width, "83") and contains(@height, "23")]/a[1]/text()', True),
        (table + '//td[@width="89"]/text()', True),
        (table + '//td[@width="162"]/span[1]/text()', True),
        (table + '//td[@width="162"]/span[@class="conMidtabright"]/text()', True),
        (table + '//td[@width="92"]/text()', False),
        (table + '//td[@width="86"]/text()', False),
    )
    data = [[] for _ in columns]

    html = ask_url(url)
    province_url = html.xpath('//div[@class="lqcontentBoxheader"]//a[@target="_blank"]/@href')
    for href in province_url:
        # urljoin yields the same URL as plain concatenation for hrefs that
        # start with "/", and still builds a valid URL for other href forms.
        province_html = ask_url(urljoin(base_url, href))
        for column, (query, needs_repair) in zip(data, columns):
            values = province_html.xpath(query)
            if needs_repair:
                column.extend(_repair_text(value) for value in values)
            else:
                column.extend(values)
    return data


def _repair_text(text):
    """Undo a Latin-1 mis-decoding of UTF-8 text; return other text unchanged."""
    try:
        # When every character fits in one byte this is the same byte
        # round trip the previous raw_unicode_escape hack performed.
        return text.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The text was already decoded correctly (or is not UTF-8 at all);
        # leave it alone instead of corrupting it or raising.
        return text
保存数据
保存成excel文件
def save_data(data, save_path):
    """Write the collected weather columns to an Excel workbook.

    :param data: sequence whose items 0-5 hold the values for the six
        header columns, in header order.
    :param save_path: destination handed to ``workbook.save``.
    :return: None
    """
    workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = workbook.add_sheet('天气', cell_overwrite_ok=True)

    col = ('城市名', '天气', '风向', '风力', '最高温', '最低温')

    # Row 0 carries the column titles; one progress line is printed per title.
    for column_index, title in enumerate(col):
        sheet.write(0, column_index, title)
        print("正在下载第%d列数据" % (column_index + 1))

    # Each column's values go directly below its title, starting at row 1.
    for column_index in range(len(col)):
        for row_index, value in enumerate(data[column_index], start=1):
            sheet.write(row_index, column_index, value)

    workbook.save(save_path)
得到的结果截图
完整的代码稍后给链接
上一篇: Python爬取天气网历史天气数据
下一篇: Python 爬取历史天气数据