欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

python 爬取中国天气网(济南)数据

程序员文章站 2022-07-14 17:59:27
...

要爬取的是图片所示位置的数据。在页面上右击并查看网页源代码,可以看到这些数据就嵌在源码里,用正则表达式提取出来即可:

 dd = re.findall(r'{"od":{"od0".*', html)[0][:-1],具体看代码

python 爬取中国天气网(济南)数据

# 济南天气
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import datetime
import json
import time
import random

import requests
import re
from lxml import etree
import pymysql
from DBUtils.PooledDB import PooledDB


class Weather_china:
    """Scrape hourly weather observations from weather.com.cn into MySQL.

    Each page embeds a JSON object (``{"od":{"od0": ...}}``) holding the
    hourly records.  ``parseUrl`` extracts it, updates the air quality of
    rows that already exist in the ``weather_china`` table and bulk-inserts
    the rows that do not.
    """

    # Pool of User-Agent headers; one is picked at random for every request.
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"},
               {"User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"},
               {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"},
               ]

    # Seconds to wait on the weather site, so one stalled request cannot
    # hang the whole crawl.
    request_timeout = 10

    def __init__(self):
        """Open a pooled MySQL connection and a cursor shared by all methods."""
        self.host = '127.0.0.1'
        self.pool = PooledDB(creator=pymysql, maxcached=5, maxshared=5, host=self.host, user='root',
                             passwd='123', db="test01", port=3306,
                             charset="utf8")
        self.conn = self.pool.connection()
        self.cursor = self.conn.cursor()

    @staticmethod
    def _extract_observations(html):
        """Return the observation JSON embedded in *html* as a dict.

        The object is located by its ``{"od":{"od0"`` prefix and taken up to
        the end of that line; the final character of the match is dropped
        before decoding (presumably a trailing ``;`` -- verify against the
        live page).

        Raises:
            IndexError: no such object is present in the page.
            ValueError: the matched text is not valid JSON.
        """
        raw = re.findall(r'{"od":{"od0".*', html)[0][:-1]
        return json.loads(raw)

    @staticmethod
    def _observation_time(stamp, hour_label):
        """Build the ``now_time`` key for one hourly record.

        Args:
            stamp: page timestamp as ``YYYYMMDDHH`` (only the first ten
                characters are read).
            hour_label: the record's hour of day as delivered by the site.

        Returns:
            ``"YYYY-MM-D HH"``: month zero-padded, day not padded, hour
            copied verbatim -- the format of the rows already stored.

        Records whose hour is later than the page's current hour belong to
        the previous calendar day.  Real date arithmetic is used so that the
        first day of a month rolls back to the last day of the previous
        month (or year) instead of producing day 0.
        """
        day = datetime.date(int(stamp[0:4]), int(stamp[4:6]), int(stamp[6:8]))
        current_hour = int(stamp[8:10])
        if int(hour_label) > current_hour:
            day -= datetime.timedelta(days=1)
        return '%04d-%02d-%d %s' % (day.year, day.month, day.day, hour_label)

    def parseUrl(self, url):
        """Fetch one city page and store its hourly observations.

        For every record, an existing row (same ``now_time`` and ``city``)
        only gets its ``air_quality`` refreshed; missing rows are collected
        and handed to ``toMysql`` in one batch.  If more than one row
        matches, the record is left untouched.

        Args:
            url: address of a weather.com.cn city page.
        """
        header1 = random.choice(self.headers)
        res = requests.get(url, headers=header1, verify=False, timeout=self.request_timeout)
        res.encoding = "utf-8"
        html = res.text
        datas = self._extract_observations(html)

        # Current city: the breadcrumb text nodes joined without whitespace.
        parse = etree.HTML(html)
        crumbs = parse.xpath('//div[@class="crumbs fl"]//text()')
        city = ''.join(piece.strip() for piece in crumbs)
        print(city)

        # Dropping the last four characters leaves YYYYMMDDHH.
        stamp = datas["od"]["od0"][:-4]

        # The final element of od2 is skipped, as in the original code.
        records = datas['od']['od2'][:-1]

        # Record fields, per the original author's notes:
        #   od21 hour, od22 temperature, od23 wind direction,
        #   od25 wind force, od26 precipitation, od27 relative humidity,
        #   od28 air quality.
        # NOTE(review): the author's sample record shows a number in od23
        # and a wind-direction name in od24 -- confirm od23 is the intended
        # source for wind_direction.
        pending = []
        for data in records:
            now_time = self._observation_time(stamp, data['od21'])
            temperature = data['od22']
            wind_direction = data['od23']
            wind_power = data['od25']
            precipitation = data['od26']
            relative_humidity = data['od27']
            air_quality = data['od28']

            # Check whether this record already exists.  Values are passed
            # as parameters so scraped text can never alter the statement.
            number = self.cursor.execute(
                "select * from weather_china where now_time=%s and city=%s",
                (now_time, city))
            if number == 1:
                try:
                    self.cursor.execute(
                        "update weather_china set air_quality=%s where now_time=%s and city=%s",
                        (air_quality, now_time, city))
                    self.conn.commit()
                    print("更新成功")
                except Exception as e:
                    self.conn.rollback()
                    print("更新失败")
                    # Show the cause as well, like toMysql does.
                    print(e)
            elif number == 0:
                pending.append((now_time, city, temperature, wind_direction, wind_power, precipitation,
                                relative_humidity, air_quality))

        self.toMysql(pending)

    def toMysql(self, L):
        """Bulk-insert observation rows into ``weather_china``.

        Args:
            L: list of 8-tuples ``(now_time, city, temperature,
                wind_direction, wind_power, precipitation,
                relative_humidity, air_quality)``.  The list is emptied in
                place after a successful insert; an empty list is a no-op.

        Failures are rolled back and printed rather than raised.
        """
        if not L:
            return
        sql = """INSERT INTO weather_china(now_time, city, temperature, wind_direction, wind_power, precipitation, relative_humidity,
                          air_quality) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
        try:
            self.cursor.executemany(sql, L)
            self.conn.commit()
            print("写入成功")
            del L[:]
        except Exception as e:
            self.conn.rollback()
            print(e)

    def workOn(self):
        """Crawl every configured city page, pausing briefly between requests."""
        urls = ['http://www.weather.com.cn/weather1d/101120101.shtml#around2',  # Jinan urban area
                'http://www.weather.com.cn/weather1d/101121601.shtml#around2',  # Laiwu
                'http://www.weather.com.cn/weather1d/101120102.shtml#around2',  # Changqing
                'http://www.weather.com.cn/weather1d/101120107.shtml#input',  # Lixia
                'http://www.weather.com.cn/weather1d/101120111.shtml#input',  # Licheng
                'http://www.weather.com.cn/weather1d/101120109.shtml#input',  # Huaiyin
                'http://www.weather.com.cn/weather1d/101120110.shtml#input',  # Tianqiao
                'http://www.weather.com.cn/weather1d/101120108.shtml#input',  # Shizhong
                'http://www.weather.com.cn/weather1d/101120104.shtml#input',  # Zhangqiu
                'http://www.weather.com.cn/weather1d/101120106.shtml#input',  # Jiyang
                'http://www.weather.com.cn/weather1d/101121603.shtml#input',  # Gangcheng
                'http://www.weather.com.cn/weather1d/101120103.shtml#input',  # Shanghe
                'http://www.weather.com.cn/weather1d/101120105.shtml#input']  # Pingyin

        for url in urls:
            time.sleep(0.1)
            self.parseUrl(url)


if __name__ == '__main__':
    # Script entry point: build the scraper (which opens the database
    # connection) and crawl every configured city page once.
    Weather_china().workOn()

相关标签: python基础