
Python [Web Scraper] / iCIBA Daily Sentence / Batch Scraping by Date

Talk is cheap, show me the Code.
OK, ... here you go.

The code is hosted on Gitee:

https://gitee.com/leviathan-litan/Discovery_Data_Web

Here is the code:

# coding:utf-8

# Describe
"""
Author: Adamhuan
Blog: http://www.d-prototype.com
Goal: scrape web page data
"""

# Import

# Data analysis
import pandas as pd

# Operating system / filesystem
import os

# Time handling
import time

# Regular expressions
import re

# JSON handling
import json

# HTTP or HTTPS
from urllib import request
import requests

# HTML parser
from bs4 import BeautifulSoup

# Variable

# Target URL
url_address = ""

# Class
class Dig_Data_Web:

    # Class Attribute
    # -- Object
    obj_request = None
    obj_response = None
    obj_html = None

    # -- Variable
    headers = ""
    string_url = ""

    # -- Path
    path_script_base = os.getcwd()
    path_download_base = os.path.dirname(path_script_base) + "/download"
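    # Note: path_download_base resolves to a "download" folder that sits in
    # the parent of the current working directory (dirname of the cwd)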

    def __init__(self, str_url=""):

        # Display / Intro
        print("************************")
        print("Script: web data scraper")
        print("------------------")
        print("Current path: [" + self.path_script_base + "]")
        print("Download path: [" + self.path_download_base + "]")
        print("************************")

        # Target URL
        self.string_url = str_url

        # Custom header so requests look like a regular browser,
        # reducing the chance of being blocked by anti-scraping checks
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}

        # Initialize the objects needed for scraping
        if str_url != "" and str_url is not None:
            # Response data
            self.obj_request = request.Request(url=self.string_url, headers=self.headers)
            self.obj_response = request.urlopen(self.obj_request)
            self.obj_html = self.obj_response.read()
            # HTML parser
            self.obj_bs = BeautifulSoup(self.obj_html, "html.parser")
        else:
            print("!!! Warning !!! ---> [URL] is empty")

        print()

    def __del__(self):
        pass

    # ============================
    # File operations

    # Check a path and create it if it does not exist
    def path_check(self, str_path):

        if not os.path.exists(str_path):
            os.makedirs(str_path)

        # End.

    # File / write
    def file_write(self, obj_source, target_path, file_type):

        # Binary mode for raw bytes (e.g. images), text mode for strings
        if file_type == "file":
            mode, encoding = 'wb', None
        elif file_type == "text":
            mode, encoding = 'w', 'utf-8'
        else:
            raise ValueError("Unknown file_type: " + str(file_type))

        with open(target_path, mode, encoding=encoding) as obj_file:
            obj_file.write(obj_source)

        # End.
    # ============================
    # Resource operations / basics

    # Fetch a URL and return the raw response bytes
    def obj_url(self, resource_url):

        obj_resp = request.urlopen(url=resource_url)
        obj_url = obj_resp.read()

        # Return.
        return obj_url

    # ============================
    # Resource operations / rules

    # Find [m3u8]
    """
    str_html_label: HTML tag name
    str_attrs_key: name of an attribute on that tag
    str_attrs_value: regex matched against the attribute's value
    """
    def find_m3u8(self, str_html_label="", str_attrs_key="", str_attrs_value=""):

        # variable
        path_download_current_base = self.path_download_base + "/m3u8"

        # Path / create
        self.path_check(str_path=path_download_current_base)

        # Do Find
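        # BeautifulSoup accepts a compiled regex as an attribute filter, so this
        # returns every matching tag whose attribute value matches the pattern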
        data_find_result_m3u8 = self.obj_bs.findAll(str_html_label,attrs={str_attrs_key: re.compile(str_attrs_value)})

        # Display
        print("找到的【m3u8】:")
        for item in data_find_result_m3u8:
            print("==================")
            print(item.get('data-normal'))

        # Return
        return data_find_result_m3u8

        # End.

    # Find [img]
    """
    str_html_label: HTML tag name
    str_attrs_key: name of an attribute on that tag
    str_attrs_value: regex matched against the attribute's value
    """
    def find_img(self, str_html_label="", str_attrs_key="", str_attrs_value=""):

        # variable
        path_download_current_base = self.path_download_base + "/img"

        # Path / create
        self.path_check(str_path=path_download_current_base)

        # Do Find
        data_find_result_img = self.obj_bs.findAll(str_html_label, attrs={str_attrs_key: re.compile(str_attrs_value)})

        # Display
        print("找到的【图片】:")
        for item in data_find_result_img:
            print("==================")
            print(item.get('file'))

        # Return
        return data_find_result_img

        # End.

    # ============================
    # These methods create their own requests, independent of the one
    # built when the class is initialized

    # Fetch [iciba.com/daily sentence]
    """
    Download the daily-sentence image plus the English and Chinese text
    for a given date (defaults to today).
    """
    def get_iciba_daily(self, str_date=""):

        # Variable

        # Current system date
        str_date_current = time.strftime("%Y-%m-%d", time.localtime())

        # Base date
        date_base = None
        date_base_formatted = None

        if str_date == "":
            date_base = time.strptime(str_date_current, "%Y-%m-%d")
            date_base_formatted = str_date_current
        else:
            date_base = time.strptime(str_date, "%Y-%m-%d")
            date_base_formatted = str_date

        # Date components used to build the download path

        str_date_current_year = time.strftime("%Y", date_base)
        str_date_current_month = time.strftime("%Y-%m", date_base)

        # Paths
        path_download_current_base = self.path_download_base + "/iciba.com"
        path_download_current = path_download_current_base + "/" + str_date_current_year + "/" + str_date_current_month + "/" + date_base_formatted
        # Files
        file_download_current_image = path_download_current + "/" + date_base_formatted + ".jpg"
        file_download_current_content_en = path_download_current + "/" + date_base_formatted + "_content_en.txt"
        file_download_current_content_zh = path_download_current + "/" + date_base_formatted + "_content_zh.txt"

        # Path / create
        self.path_check(str_path=path_download_current)

        # Target address
        # str_url = "http://news.iciba.com/views/dailysentence/daily.html#!/detail/title/2020-01-20"
        # str_url = "http://open.iciba.com/dsapi/"
        str_url = "http://sentence.iciba.com/?&c=dailysentence&m=getdetail&title=" + date_base_formatted

        # Response data (a separate request, unrelated to the one created at init time)
        obj_resp = requests.get(str_url)
        obj_resp_data = obj_resp.text

        # JSON
        data_html_json = json.loads(obj_resp_data)
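        # The response is expected to be a JSON object; only the three fields
        # below are used. An abridged, assumed shape:
        # {
        #     "content":  "<English sentence>",
        #     "note":     "<Chinese translation>",
        #     "picture2": "http://.../<image>.jpg",
        #     ...
        # }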

        # Data fields
        data_picture = data_html_json.get('picture2')
        data_content_en = data_html_json.get('content')
        data_content_zh = data_html_json.get('note')

        # Download the image bytes
        obj_data_picture = self.obj_url(resource_url=data_picture)

        # Write to files
        # -- Image
        self.file_write(obj_source=obj_data_picture, target_path=file_download_current_image, file_type="file")
        # -- English
        self.file_write(obj_source=data_content_en, target_path=file_download_current_content_en, file_type="text")
        # -- Chinese
        self.file_write(obj_source=data_content_zh, target_path=file_download_current_content_zh, file_type="text")

        # Display
        print("Path: [" + path_download_current + "]")
        print("Date: [" + date_base_formatted + "]")
        print("Image: [" + data_picture + "]")
        print("English: [" + data_content_en + "]")
        print("Chinese: [" + data_content_zh + "]")

    # Fetch [iciba.com/daily sentence] over a range of dates
    """
    Walk every day between date_begin and date_end (inclusive)
    and fetch the daily sentence for each one.
    """
    def get_iciba_daily_by_DateRange(self, date_begin, date_end):

        # Variable
        date_array = pd.date_range(date_begin, date_end)
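        # pd.date_range defaults to daily frequency and includes both endpoints,
        # so this yields one pandas Timestamp per calendar day in the range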

        # Loop over each day in the range
        for date_item in date_array:

            # Variable
            date_item_formatted = date_item.strftime("%Y-%m-%d")

            # Do
            self.get_iciba_daily(str_date=date_item_formatted)

            # Display
            print("-------")
            print(date_item_formatted)

# Section: main()
if __name__ == "__main__":

    # Examples
    # obj_dig_data_web = Dig_Data_Web(str_url=url_address)
    # obj_dig_data_web.find_m3u8(str_html_label="div",str_attrs_key="class",str_attrs_value="playerWrap ckplayerPlugin")
    # obj_dig_data_web.find_img(str_html_label="img", str_attrs_key="id",str_attrs_value="detail-banner-img")

    # Scrape [iciba.com / iCIBA daily sentence]
    obj_dig_data_web = Dig_Data_Web()
    obj_dig_data_web.get_iciba_daily_by_DateRange('2019/12/3','2020/1/20')

# Section: End
# Finished
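
To fetch a single date rather than a range, the same class can be called directly. A minimal sketch (the date string must be in YYYY-MM-DD form, which is the format get_iciba_daily parses):

# Single-day sketch, reusing the Dig_Data_Web class defined above
obj_dig_data_web = Dig_Data_Web()
obj_dig_data_web.get_iciba_daily(str_date="2020-01-20")

Given how the paths are built (a download folder in the parent of the working directory), each date then lands in its own directory, e.g.:

download/iciba.com/2020/2020-01/2020-01-20/
    2020-01-20.jpg
    2020-01-20_content_en.txt
    2020-01-20_content_zh.txt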

The output looks like this:

[Screenshot: run output]

And that's it, ...

I won't walk through the code line by line.

Leave a comment if you have any questions.