爬取中国天气网的数据(城市,最高温,最低温)
程序员文章站
2022-03-05 11:50:47
...
1.使用正则
import requests
import re
import random
import csv
import time
# Request disguise: a pool of User-Agent headers and HTTP(S) proxies, from
# which one of each is picked at random for this run.
#
# Every User-Agent value must be non-empty and latin-1 encodable: the HTTP
# stack encodes header values as latin-1, so pasted labels containing an
# en dash (e.g. "... - Windows") would raise UnicodeEncodeError whenever
# that entry was selected. The strings below are therefore the bare UA
# tokens, with the fused browser labels and the empty entry removed.
headers_choice = [
    {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)'},
    {'User-Agent': 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5'},
    {'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'},
    {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'},
]

# NOTE(review): these look like public free proxies and may no longer be
# reachable - confirm or replace before relying on them.
proxy_choice = [
    {'http': '54.214.52.181:80'},
    {'http': '34.93.243.67:80'},
    {'http': '175.42.128.103:9999'},
    {'https': '58.220.95.80:9401'},
]

# One header set and one proxy mapping are chosen once and reused for every
# request made by the script.
headers = random.choice(headers_choice)
proxy = random.choice(proxy_choice)
# Region codes appended to the base URL to build each forecast page address
# (http://www.weather.com.cn/textFC/<code>.shtml).
all_cregion = ['hb', 'db', 'hd', 'hz', 'hn', 'xb', 'xn', 'gat']

# Destination CSV file (unchanged from the original script).
OUTPUT_PATH = r'D:\pycharm\PycharmProjects\test\pic/' + '天气网爬虫.csv'

# Seconds to wait for the server/proxy before giving up, so that a dead
# proxy cannot hang the script indefinitely.
REQUEST_TIMEOUT = 10

# Pause between region pages, in seconds.
REQUEST_DELAY = 0.5

# Column names of the CSV file and keys of every scraped row.
FIELDNAMES = ['地区', '最高温', '最低温']

# The three patterns rely on the cell widths used in the page markup:
# the city link is followed by a width-89 cell, the high temperature sits
# in a width-92 cell and the low temperature in a width-86 cell that
# precedes the final "last" cell of the row.
REGION_RE = re.compile(r'target="_blank">(.*?)</a></td>\n<td width="89">')
TOP_RE = re.compile(r'</span></td>\n<td width="92">(.*?)</td>')
LOW_RE = re.compile(r'<td width="86">(.*?)</td>\n<td width="49" class="last">')


def parse_weather(html: str) -> list:
    """Extract (city, high, low) rows from one region page.

    The three patterns are matched independently and their results are
    paired by position, so a city name that occurs more than once keeps
    the temperatures of its own row instead of those of its first
    occurrence. If the patterns yield different numbers of matches, the
    result is truncated to the shortest one.

    Args:
        html: Decoded HTML text of a region page.

    Returns:
        A list of dicts keyed by '地区', '最高温' and '最低温', in page order.
    """
    cities = REGION_RE.findall(html)
    highs = TOP_RE.findall(html)
    lows = LOW_RE.findall(html)
    return [
        {'地区': city, '最高温': high, '最低温': low}
        for city, high, low in zip(cities, highs, lows)
    ]


def fetch_html(url: str) -> str:
    """Download *url* with the module-level headers/proxy and decode it as UTF-8."""
    response = requests.get(
        url, headers=headers, proxies=proxy, timeout=REQUEST_TIMEOUT
    )
    return response.content.decode('utf-8')


def scrape(region_codes) -> list:
    """Fetch every region page once and collect the rows of all of them.

    Args:
        region_codes: Iterable of region codes such as 'hb' or 'gat'.

    Returns:
        The accumulated row dicts of all regions, in fetch order.
    """
    rows = []
    for code in region_codes:
        url = 'http://www.weather.com.cn/textFC/' + code + '.shtml'
        # A single download per page feeds all three patterns.
        for row in parse_weather(fetch_html(url)):
            rows.append(row)
            print(f'正在写入第{len(rows)}个城市的数据')
        # Be polite to the server between pages.
        time.sleep(REQUEST_DELAY)
    return rows


def save_csv(rows, path) -> None:
    """Write *rows* to *path* as a UTF-8 CSV file with a header line."""
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(rows)


def main() -> None:
    """Scrape all configured regions and store the result as CSV."""
    save_csv(scrape(all_cregion), OUTPUT_PATH)


if __name__ == '__main__':
    main()
效果: