Scraping Air Quality Data with Python
1. Download chromedriver.exe
Download link: http://chromedriver.storage.googleapis.com/index.html
No installation is needed after downloading.
The driver version must match your installed Chrome version; for a version-matching reference, see: https://blog.csdn.net/mmayanshuo/article/details/78962398
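If you are unsure whether the downloaded driver matches your browser, a minimal sketch like the one below can help: it prints the chromedriver version so you can compare it with the Chrome version shown at chrome://version. It assumes the driver was saved as C:\chromedriver.exe, the same path used in the script further down.
import subprocess

# Print the chromedriver version; compare it with the Chrome version
# shown at chrome://version in the browser's address bar.
# Assumes the driver sits at C:\chromedriver.exe (same path as in the script below).
print(subprocess.check_output([r"C:\chromedriver.exe", "--version"]).decode().strip())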
2. Code
# -*- coding: utf-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import time
import random
import requests
# Fetch a list of proxy IPs from the free proxy listing page
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
# Pick one proxy IP at random from the list
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
# ======================================== Proxy IP ==============================================
url = 'http://www.xicidaili.com/nn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
ip_list = get_ip_list(url, headers=headers)
proxies = get_random_ip(ip_list)
# ===================================== Fetch the outer <a> tags (city list) ======================================
url_host = 'https://www.aqistudy.cn/historydata/'
httpproxy_handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(httpproxy_handler)
req = urllib.request.Request(url_host)
html = opener.open(req)  # request the page through the proxy opener
html_code = html.read()
plain_text = str(html_code, 'utf-8')
soup = BeautifulSoup(plain_text, 'lxml')
soups_div = soup.find_all('div', {'class': 'all'})
soups_a = soups_div.pop().find_all('a')
# ===================================== Fetch the inner <a> tags (month list) ======================================
url_host2 = 'https://www.aqistudy.cn/historydata/daydata.php?city=%E9%83%91%E5%B7%9E&month=201909'
html2 = requests.get(url_host2)
html2.encoding = 'utf-8'
html_code2 = html2.text
soup2 = BeautifulSoup(html_code2, 'lxml')
soups_div2 = soup2.find_all('ul', {'class': 'unstyled1'})
soups_a2 = soups_div2.pop().find_all('a')
# ===================================== Loop over pages and collect the data ============================================
def aqi_get(url_host, citys, months):
    chromedriver = r'C:\chromedriver.exe'  # path where chromedriver.exe was saved
    driver = webdriver.Chrome(executable_path=chromedriver)
    air_quality = pd.DataFrame(columns=['city', 'month', 'AQI', 'level', 'PM25', 'PM10', 'SO2', 'CO', 'NO2', 'O3_8h'])
    for city in citys:
        for month in months:
            for hh in soups_a:
                for mm in soups_a2:
                    if hh.text == city and mm.text == month:
                        url1 = url_host + 'daydata.php?city=' + city + '&month=' + month[0:4] + month[5:7]
                        driver.get(url1)
                        print(url1)
                        html2 = driver.page_source
                        bf1 = BeautifulSoup(html2, 'lxml')
                        result = bf1.find_all('tr')
                        # if the table has not been rendered yet, keep re-reading the page
                        while len(result) < 10:
                            time.sleep(1)  # brief pause before re-reading the page source
                            html2 = driver.page_source
                            bf1 = BeautifulSoup(html2, 'lxml')
                            result = bf1.find_all('tr')
                        for row in result[1:]:
                            td = row.find_all('td')
                            ss = []
                            for tt in td:
                                ss.append(tt.text)
                            # prepend the city name and append the row to the results DataFrame
                            air_quality.loc[len(air_quality)] = [city] + ss
                        # pause briefly to reduce the risk of anti-scraping blocks
                        # time.sleep(8)
    driver.close()
    return air_quality
# citys = ['郑州', '平顶山', '洛阳', '安阳']
months = ['2019年07月', '2019年08月', '2019年09月']
citys = ['郑州', '开封', '许昌', '洛阳', '平顶山', '三门峡', '南阳', '信阳', '安阳', '濮阳', '商丘', '鹤壁', '焦作', '驻马店', '周口', '新乡', '漯河', '济源']
now = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time()))
air_quality = aqi_get(url_host, citys, months)
air_quality.to_excel(r"D:\air_quality_day" + now + ".xlsx", index=False)  # write the results to an Excel file
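Before kicking off the full 18-city run, a quick sanity check on a single city and month can confirm that chromedriver and the page selectors still work. The values below are arbitrary test inputs, not part of the original script:
# optional sanity check: scrape one city for one month and inspect the first rows
test_df = aqi_get(url_host, ['郑州'], ['2019年09月'])
print(test_df.head())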