
Scraping air quality data with Python


1. Download chromedriver.exe

Download link: http://chromedriver.storage.googleapis.com/index.html

No installation is needed after downloading; just note where you save the executable.

The chromedriver version must match your installed Chrome version; for a version-mapping reference, see: https://blog.csdn.net/mmayanshuo/article/details/78962398
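
A quick way to confirm that the driver and the installed Chrome actually match is to launch the browser once through Selenium. The sketch below is only a smoke test; the path C:\chromedriver.exe is an example, point it at wherever you saved the executable:

from selenium import webdriver

# Smoke test: if the chromedriver version matches the installed Chrome,
# the browser opens, loads the page, and closes without a SessionNotCreatedException.
driver = webdriver.Chrome(executable_path=r'C:\chromedriver.exe')  # example path, adjust as needed
driver.get('https://www.aqistudy.cn/historydata/')
driver.quit()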

2. Code

# -*- coding: utf-8 -*-

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
import time
import random
import requests


# Fetch a list of proxy IPs from the proxy-listing page
def get_ip_list(url, headers):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list


# Pick one proxy at random and wrap it in a proxies dict
def get_random_ip(ip_list):
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies


#======================================== Proxy IPs ==============================================
url = 'http://www.xicidaili.com/nn/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
ip_list = get_ip_list(url, headers=headers)
proxies = get_random_ip(ip_list)


#===================================== Fetch the outer <a> tags (the city list) ======================================
url_host = 'https://www.aqistudy.cn/historydata/'
httpproxy_handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(httpproxy_handler)
req = urllib.request.Request(url_host)
html = opener.open(req)  # open through the proxy opener built above
html_code = html.read()
plain_text = str(html_code, 'utf-8')
soup = BeautifulSoup(plain_text, 'lxml')
soups_div = soup.find_all('div', {'class': 'all'})
soups_a = soups_div.pop().find_all('a')

#===================================== Fetch the inner <a> tags (the month list) ======================================
url_host2 = 'https://www.aqistudy.cn/historydata/daydata.php?city=%E9%83%91%E5%B7%9E&month=201909'
html2 = requests.get(url_host2)
html2.encoding = 'utf-8'
html_code2 = html2.text
soup2 = BeautifulSoup(html_code2, 'lxml')
soups_div2 = soup2.find_all('ul', {'class': 'unstyled1'})
soups_a2 = soups_div2.pop().find_all('a')
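
# Note: the city parameter in url_host2 above is URL-encoded ('%E9%83%91%E5%B7%9E' decodes to '郑州').
# Optional sketch of building the same URL without hard-coding the percent-encoded bytes;
# 'url_host2_built' is just an illustrative name and is not used by the rest of the script.
from urllib.parse import quote
url_host2_built = 'https://www.aqistudy.cn/historydata/daydata.php?city=' + quote('郑州') + '&month=201909'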


#===================================== Loop over cities and months, scraping each page ============================================
def aqi_get(url_host, citys, months):
    chromedriver = r'C:\chromedriver.exe'  # path to the downloaded chromedriver.exe
    driver = webdriver.Chrome(executable_path=chromedriver)
    air_quality = pd.DataFrame(columns=['city', 'month', 'AQI', 'level', 'PM25', 'PM10', 'SO2', 'CO', 'NO2', 'O3_8h'])
    for city in citys:
        for month in months:
            # Only build the URL if both the city and the month actually appear on the site
            url1 = None
            for hh in soups_a:
                for mm in soups_a2:
                    if hh.text == city and mm.text == month:
                        url1 = url_host + 'daydata.php?city=' + city + '&month=' + month[0:4] + month[5:7]
            if url1 is None:
                continue
            driver.get(url1)
            print(url1)
            html2 = driver.page_source
            bf1 = BeautifulSoup(html2, 'lxml')
            result = bf1.find_all('tr')
            # The table is rendered by JavaScript; keep re-reading the page until the rows appear
            while len(result) < 10:
                time.sleep(1)
                html2 = driver.page_source
                bf1 = BeautifulSoup(html2, 'lxml')
                result = bf1.find_all('tr')
            for row in result[1:]:
                td = row.find_all('td')
                ss = []
                for tt in td:
                    ss.append(tt.text)
                sss = [city] + ss
                s = pd.Series(sss, index=air_quality.columns)
                air_quality = air_quality.append(s, ignore_index=True)
            # Pause between pages to reduce the chance of triggering anti-scraping measures (uncomment if needed)
            # time.sleep(8)
    driver.close()
    return air_quality



#citys=['郑州', '平顶山', '洛阳','安阳']
months=['2019年07月','2019年08月','2019年09月']
citys=['郑州','开封','许昌','洛阳','平顶山','三门峡','南阳','信阳','安阳','濮阳','商丘','鹤壁','焦作','驻马店','周口','新乡','漯河','济源']
now = time.strftime("%Y-%m-%d-%H-%M-%S",time.localtime(time.time()))
air_quality=aqi_get(url_host,citys,months)
air_quality.to_excel(r"D:\air_quality_day" + now + ".xls")
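
Note: writing an .xls file with pandas relies on the xlwt package, and newer pandas versions have dropped .xls support entirely. If that causes trouble, a minimal alternative (assuming openpyxl is installed) is to export .xlsx instead:

# Alternative export: .xlsx via openpyxl instead of .xls via xlwt (assumes openpyxl is installed)
air_quality.to_excel(r"D:\air_quality_day" + now + ".xlsx", index=False)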