python 爬取中国天气网(济南)数据
程序员文章站
2022-07-14 17:59:27
...
图片所示位置的数据,可以在页面上右击、查看网页源代码找到,里边有这些数据,用正则表达式提取出来:
dd = re.findall(r'{"od":{"od0".*', html)[0][:-1],具体看代码
# 济南天气
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime
import json
import time
import random
import requests
import re
from lxml import etree
import pymysql
from DBUtils.PooledDB import PooledDB
class Weather_china:
    """Scrape hourly observations from weather.com.cn pages into MySQL.

    Each page embeds a JSON object (starting with ``{"od":{"od0"``) holding
    the publish timestamp (``od0``) and a list of hourly records (``od2``).
    For every record the ``weather_china`` table is checked: an existing
    (now_time, city) row gets its ``air_quality`` refreshed, a missing one is
    queued and bulk-inserted at the end of the page.
    """

    # Candidate User-Agent headers; one is chosen at random per request.
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
               {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
               {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"},
               ]

    # Text that opens the embedded observation JSON in the page source.
    _OBSERVATION_MARKER = '{"od":{"od0"'

    def __init__(self):
        """Open a pooled MySQL connection and a cursor used by all methods."""
        self.host = '127.0.0.1'
        self.pool = PooledDB(creator=pymysql, maxcached=5, maxshared=5, host=self.host, user='root',
                             passwd='123', db="test01", port=3306,
                             charset="utf8")
        self.conn = self.pool.connection()
        self.cursor = self.conn.cursor()

    def _fetch(self, url):
        """Download *url* and return its text decoded as UTF-8, or None on failure."""
        try:
            # A timeout keeps one unresponsive page from hanging the whole crawl.
            res = requests.get(url, headers=random.choice(self.headers), verify=False, timeout=10)
            res.raise_for_status()
        except requests.RequestException as e:
            print("request failed for %s: %s" % (url, e))
            return None
        res.encoding = "utf-8"
        return res.text

    @classmethod
    def _extract_observations(cls, html):
        """Return the embedded observation JSON as a dict, or None if absent/invalid.

        The JSON is decoded starting at the marker and decoding stops at the
        end of the object, so whatever follows it on the line is ignored.
        """
        start = html.find(cls._OBSERVATION_MARKER)
        if start < 0:
            return None
        try:
            payload, _ = json.JSONDecoder().raw_decode(html, start)
        except ValueError:
            return None
        return payload

    @staticmethod
    def _observation_time(stamp, hour_text):
        """Build the ``now_time`` key for one hourly record.

        :param stamp: publish time as ``YYYYMMDDHH`` (only the first ten
            characters are read).
        :param hour_text: the record's hour exactly as given in ``od21``.
        :return: ``"YYYY-MM-D HH"`` -- month zero-padded, day not padded, hour
            copied verbatim. Records whose hour is later than the publish hour
            belong to the previous calendar day.
        :raises ValueError: if *stamp* or *hour_text* is not numeric/valid.
        """
        day = datetime.date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))
        if int(hour_text) > int(stamp[8:10]):
            # Real date arithmetic so the 1st of a month rolls back correctly
            # instead of producing day "0".
            day -= datetime.timedelta(days=1)
        return '%04d-%02d-%d %s' % (day.year, day.month, day.day, hour_text)

    def parseUrl(self, url):
        """Fetch one city page and update/insert its hourly observations.

        Pages that cannot be downloaded or that contain no observation JSON
        are reported and skipped.
        """
        html = self._fetch(url)
        if html is None:
            return
        datas = self._extract_observations(html)
        if datas is None:
            print("no observation data found in %s" % url)
            return

        # Current city: concatenation of all text nodes in the breadcrumb bar.
        crumbs = etree.HTML(html).xpath('//div[@class="crumbs fl"]//text()')
        city = ''.join(text.strip() for text in crumbs)
        print(city)

        # Drop the last four characters of the publish stamp; the first ten
        # characters are then read as YYYYMMDDHH.
        stamp = datas["od"]["od0"][:-4]

        # Record fields used below (sample record from the page:
        # {"od21":"16","od22":"29","od23":"61","od24":"东北风",
        #  "od25":"1","od26":"0.0","od27":"53","od28":"90"}):
        #   od21 hour of the record      -> now_time
        #   od22 temperature             -> temperature
        #   od23 stored as               -> wind_direction
        #   od25 wind power              -> wind_power
        #   od26 precipitation           -> precipitation
        #   od27 relative humidity       -> relative_humidity
        #   od28 air quality             -> air_quality
        # NOTE(review): in the sample od23 is numeric while od24 holds the
        # direction name; confirm od23 is the intended wind_direction source.
        pending = []
        # The final element of od2 is skipped.
        for data in datas['od']['od2'][:-1]:
            now_time = self._observation_time(stamp, data['od21'])
            air_quality = data['od28']

            # Does a row for this hour and city already exist? Parameters are
            # passed separately so scraped text can never alter the SQL.
            number = self.cursor.execute(
                "select * from weather_china where now_time=%s and city=%s",
                (now_time, city))
            if number == 1:
                try:
                    self.cursor.execute(
                        "update weather_china set air_quality=%s where now_time=%s and city=%s",
                        (air_quality, now_time, city))
                    self.conn.commit()
                    print("更新成功")
                except Exception as e:
                    # Best-effort update: undo, report, and continue with the next record.
                    self.conn.rollback()
                    print("更新失败")
                    print(e)
            elif number == 0:
                pending.append((now_time, city, data['od22'], data['od23'], data['od25'],
                                data['od26'], data['od27'], air_quality))
        self.toMysql(pending)

    def toMysql(self, L):
        """Bulk-insert the row tuples in *L* and empty the list on success.

        Each tuple is (now_time, city, temperature, wind_direction,
        wind_power, precipitation, relative_humidity, air_quality). An empty
        list is a no-op. On failure the transaction is rolled back, the error
        is printed, and *L* is left untouched.
        """
        if not L:
            return
        sql = """INSERT INTO weather_china(now_time, city, temperature, wind_direction, wind_power, precipitation, relative_humidity,
        air_quality) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
        try:
            self.cursor.executemany(sql, L)
            self.conn.commit()
            print("写入成功")
            del L[:]
        except Exception as e:
            self.conn.rollback()
            print(e)

    def workOn(self):
        """Crawl every configured district page, pausing briefly between requests."""
        urls = ['http://www.weather.com.cn/weather1d/101120101.shtml#around2',  # Jinan urban area
                'http://www.weather.com.cn/weather1d/101121601.shtml#around2',  # Laiwu
                'http://www.weather.com.cn/weather1d/101120102.shtml#around2',  # Changqing
                'http://www.weather.com.cn/weather1d/101120107.shtml#input',  # Lixia
                'http://www.weather.com.cn/weather1d/101120111.shtml#input',  # Licheng
                'http://www.weather.com.cn/weather1d/101120109.shtml#input',  # Huaiyin
                'http://www.weather.com.cn/weather1d/101120110.shtml#input',  # Tianqiao
                'http://www.weather.com.cn/weather1d/101120108.shtml#input',  # Shizhong
                'http://www.weather.com.cn/weather1d/101120104.shtml#input',  # Zhangqiu
                'http://www.weather.com.cn/weather1d/101120106.shtml#input',  # Jiyang
                'http://www.weather.com.cn/weather1d/101121603.shtml#input',  # Gangcheng
                'http://www.weather.com.cn/weather1d/101120103.shtml#input',  # Shanghe
                'http://www.weather.com.cn/weather1d/101120105.shtml#input']  # Pingyin
        for url in urls:
            time.sleep(0.1)
            self.parseUrl(url)
if __name__ == '__main__':
    # Script entry point: constructing the spider opens the MySQL connection,
    # then workOn() crawls every configured page.
    Weather_china().workOn()