
Python [Web Scraper] / iCIBA Daily Sentence / Batch Scraping by Date

Talk is cheap, show me the Code.
OK, ... here you go.

The code is hosted on Gitee:

https://gitee.com/leviathan-litan/Discovery_Data_Web

Here is the code:

# coding:utf-8

# Describe
"""
Author: Adamhuan
Blog: http://www.d-prototype.com
Goal: scrape web page data
"""

# Import

# Data analysis
import pandas as pd

# Operating system / filesystem
import os

# Time handling
import time

# Regular expressions
import re

# JSON handling
import json

# HTTP or HTTPS
from urllib import request
import requests

# HTML parser
from bs4 import BeautifulSoup

# Variable

# Target URL
url_address = ""

# Class
class Dig_Data_Web:

    # Class Attribute
    # -- Object
    obj_request = None
    obj_response = None
    obj_html = None

    # -- Variable
    headers = ""
    string_url = ""

    # -- Path
    path_script_base = os.getcwd()
    path_download_base = os.path.dirname(path_script_base) + "/download"
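    # Note: path_download_base resolves to a "download" folder that sits in
    # the parent of the current working directory (dirname of the cwd)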

    def __init__(self, str_url=""):

        # Display / Intro
        print("************************")
        print("Script: web data scraper")
        print("------------------")
        print("Current path: [" + self.path_script_base + "]")
        print("Download path: [" + self.path_download_base + "]")
        print("************************")

        # Target URL
        self.string_url = str_url

        # Custom header so requests look like a regular browser,
        # reducing the chance of being blocked by anti-scraping checks
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}

        # Initialize the objects needed for scraping
        if str_url != "" and str_url is not None:
            # Response data
            self.obj_request = request.Request(url=self.string_url, headers=self.headers)
            self.obj_response = request.urlopen(self.obj_request)
            self.obj_html = self.obj_response.read()
            # HTML parser
            self.obj_bs = BeautifulSoup(self.obj_html, "html.parser")
        else:
            print("!!! Warning !!! ---> [URL] is empty")

        print()

    def __del__(self):
        pass

    # ============================
    # File operations

    # Check a path and create it if it does not exist
    def path_check(self, str_path):

        if not os.path.exists(str_path):
            os.makedirs(str_path)

        # End.

    # File / write
    def file_write(self, obj_source, target_path, file_type):

        # Binary mode for raw bytes (e.g. images), text mode for strings
        if file_type == "file":
            mode, encoding = 'wb', None
        elif file_type == "text":
            mode, encoding = 'w', 'utf-8'
        else:
            raise ValueError("Unknown file_type: " + str(file_type))

        with open(target_path, mode, encoding=encoding) as obj_file:
            obj_file.write(obj_source)

        # End.
    # ============================
    # Resource operations / basics

    # Fetch a URL and return the raw response bytes
    def obj_url(self, resource_url):

        obj_resp = request.urlopen(url=resource_url)
        obj_url = obj_resp.read()

        # Return.
        return obj_url

    # ============================
    # Resource operations / rules

    # Find [m3u8]
    """
    str_html_label: HTML tag name
    str_attrs_key: name of an attribute on that tag
    str_attrs_value: regex matched against the attribute's value
    """
    def find_m3u8(self, str_html_label="", str_attrs_key="", str_attrs_value=""):

        # variable
        path_download_current_base = self.path_download_base + "/m3u8"

        # Path / create
        self.path_check(str_path=path_download_current_base)

        # Do Find
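        # BeautifulSoup accepts a compiled regex as an attribute filter, so this
        # returns every matching tag whose attribute value matches the pattern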
        data_find_result_m3u8 = self.obj_bs.findAll(str_html_label,attrs={str_attrs_key: re.compile(str_attrs_value)})

        # Display
        print("找到的【m3u8】:")
        for item in data_find_result_m3u8:
            print("==================")
            print(item.get('data-normal'))

        # Return
        return data_find_result_m3u8

        # End.

    # Find [img]
    """
    str_html_label: HTML tag name
    str_attrs_key: name of an attribute on that tag
    str_attrs_value: regex matched against the attribute's value
    """
    def find_img(self, str_html_label="", str_attrs_key="", str_attrs_value=""):

        # variable
        path_download_current_base = self.path_download_base + "/img"

        # Path / create
        self.path_check(str_path=path_download_current_base)

        # Do Find
        data_find_result_img = self.obj_bs.findAll(str_html_label, attrs={str_attrs_key: re.compile(str_attrs_value)})

        # Display
        print("找到的【图片】:")
        for item in data_find_result_img:
            print("==================")
            print(item.get('file'))

        # Return
        return data_find_result_img

        # End.

    # ============================
    # These methods create their own requests, independent of the one
    # built when the class is initialized

    # Fetch [iciba.com/daily sentence]
    """
    Download the daily-sentence image plus the English and Chinese text
    for a given date (defaults to today).
    """
    def get_iciba_daily(self, str_date=""):

        # Variable

        # Current system date
        str_date_current = time.strftime("%Y-%m-%d", time.localtime())

        # Base date
        date_base = None
        date_base_formatted = None

        if str_date == "":
            date_base = time.strptime(str_date_current, "%Y-%m-%d")
            date_base_formatted = str_date_current
        else:
            date_base = time.strptime(str_date, "%Y-%m-%d")
            date_base_formatted = str_date

        # Date components used to build the download path

        str_date_current_year = time.strftime("%Y", date_base)
        str_date_current_month = time.strftime("%Y-%m", date_base)

        # Paths
        path_download_current_base = self.path_download_base + "/iciba.com"
        path_download_current = path_download_current_base + "/" + str_date_current_year + "/" + str_date_current_month + "/" + date_base_formatted
        # Files
        file_download_current_image = path_download_current + "/" + date_base_formatted + ".jpg"
        file_download_current_content_en = path_download_current + "/" + date_base_formatted + "_content_en.txt"
        file_download_current_content_zh = path_download_current + "/" + date_base_formatted + "_content_zh.txt"

        # Path / create
        self.path_check(str_path=path_download_current)

        # Target address
        # str_url = "http://news.iciba.com/views/dailysentence/daily.html#!/detail/title/2020-01-20"
        # str_url = "http://open.iciba.com/dsapi/"
        str_url = "http://sentence.iciba.com/?&c=dailysentence&m=getdetail&title=" + date_base_formatted

        # Response data (a separate request, unrelated to the one created at init time)
        obj_resp = requests.get(str_url)
        obj_resp_data = obj_resp.text

        # JSON
        data_html_json = json.loads(obj_resp_data)
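        # The response is expected to be a JSON object; only the three fields
        # below are used. An abridged, assumed shape:
        # {
        #     "content":  "<English sentence>",
        #     "note":     "<Chinese translation>",
        #     "picture2": "http://.../<image>.jpg",
        #     ...
        # }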

        # Data fields
        data_picture = data_html_json.get('picture2')
        data_content_en = data_html_json.get('content')
        data_content_zh = data_html_json.get('note')

        # Download the image bytes
        obj_data_picture = self.obj_url(resource_url=data_picture)

        # Write to files
        # -- Image
        self.file_write(obj_source=obj_data_picture, target_path=file_download_current_image, file_type="file")
        # -- English
        self.file_write(obj_source=data_content_en, target_path=file_download_current_content_en, file_type="text")
        # -- Chinese
        self.file_write(obj_source=data_content_zh, target_path=file_download_current_content_zh, file_type="text")

        # Display
        print("Path: [" + path_download_current + "]")
        print("Date: [" + date_base_formatted + "]")
        print("Image: [" + data_picture + "]")
        print("English: [" + data_content_en + "]")
        print("Chinese: [" + data_content_zh + "]")

    # Fetch [iciba.com/daily sentence] over a range of dates
    """
    Walk every day between date_begin and date_end (inclusive)
    and fetch the daily sentence for each one.
    """
    def get_iciba_daily_by_DateRange(self, date_begin, date_end):

        # Variable
        date_array = pd.date_range(date_begin, date_end)
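        # pd.date_range defaults to daily frequency and includes both endpoints,
        # so this yields one pandas Timestamp per calendar day in the range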

        # Loop over each day in the range
        for date_item in date_array:

            # Variable
            date_item_formatted = date_item.strftime("%Y-%m-%d")

            # Do
            self.get_iciba_daily(str_date=date_item_formatted)

            # Display
            print("-------")
            print(date_item_formatted)

# Section: main()
if __name__ == "__main__":

    # Examples
    # obj_dig_data_web = Dig_Data_Web(str_url=url_address)
    # obj_dig_data_web.find_m3u8(str_html_label="div",str_attrs_key="class",str_attrs_value="playerWrap ckplayerPlugin")
    # obj_dig_data_web.find_img(str_html_label="img", str_attrs_key="id",str_attrs_value="detail-banner-img")

    # Scrape [iciba.com / iCIBA daily sentence]
    obj_dig_data_web = Dig_Data_Web()
    obj_dig_data_web.get_iciba_daily_by_DateRange('2019/12/3','2020/1/20')

# Section: End
# Finished
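
To fetch a single date rather than a range, the same class can be called directly. A minimal sketch (the date string must be in YYYY-MM-DD form, which is the format get_iciba_daily parses):

# Single-day sketch, reusing the Dig_Data_Web class defined above
obj_dig_data_web = Dig_Data_Web()
obj_dig_data_web.get_iciba_daily(str_date="2020-01-20")

Given how the paths are built (a download folder in the parent of the working directory), each date then lands in its own directory, e.g.:

download/iciba.com/2020/2020-01/2020-01-20/
    2020-01-20.jpg
    2020-01-20_content_en.txt
    2020-01-20_content_zh.txt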

The output looks like this:

[Screenshot: run output]

And that's it, ...

I won't walk through the code line by line.

Leave a comment if you have any questions.