Python【爬虫】/ 爱词霸·每日一句 / 按日期批量爬取
程序员文章站
2022-05-18 21:25:41
...
Talk is cheap, show me the Code.
ok,... 安排.
代码被托管在了Gitee(码云):
https://gitee.com/leviathan-litan/Discovery_Data_Web
代码如下:
# coding:utf-8
# Describe
"""
Author: Adamhuan
Blog: http://www.d-prototype.com
目标:爬取【网页数据】
"""
# Import
# 数据分析
import pandas as pd
# 系统
import os
# 时间
import datetime,time
# 正则表达式
import re
# 处理JSON数据
import json
# HTTP or HTTPS
from urllib import request,response
import requests
# 【WEB】编码解析器
from bs4 import BeautifulSoup
# Variable
# Target URL placeholder; intended to be filled in and passed to
# Dig_Data_Web(str_url=url_address) as in the commented-out example in __main__.
url_address = ""
# Class
class Dig_Data_Web:
    """Scrape data from web pages.

    Constructed with a URL, the page is fetched immediately and parsed with
    BeautifulSoup for the ``find_*`` helpers.  The iciba.com "daily sentence"
    methods issue their own standalone requests and work even when the object
    was created without a URL.
    """

    # -- Objects populated by __init__ when a URL is supplied
    obj_request = None   # urllib Request for the initial page
    obj_response = None  # urllib response (closed once the body has been read)
    obj_html = None      # raw bytes of the initial page
    # -- Variables
    headers = ""         # HTTP headers (browser User-Agent spoof, set in __init__)
    string_url = ""      # target URL of the initial fetch
    # -- Paths (evaluated once, at class-definition time)
    path_script_base = os.getcwd()
    path_download_base = os.path.dirname(path_script_base) + "/download"

    def __init__(self, str_url=""):
        """Print an intro banner and, when *str_url* is non-empty, fetch and parse it."""
        # Display / intro
        print("************************")
        print("脚本:Web数据 - 爬取")
        print("------------------")
        print("当前路径:【" + self.path_script_base + "】")
        print("下载路径:【" + self.path_download_base + "】")
        print("************************")
        # Target URL
        self.string_url = str_url
        # Custom header so the request looks like a normal browser (anti-anti-crawler)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36'}
        # Fetch and parse only when a non-empty URL was given
        # (truthiness covers both the "" and None cases the original tested explicitly)
        if str_url:
            self.obj_request = request.Request(url=self.string_url, headers=self.headers)
            self.obj_response = request.urlopen(self.obj_request)
            self.obj_html = self.obj_response.read()
            # Fix: close the connection once the body is in memory (socket leak before)
            self.obj_response.close()
            # HTML parser over the fetched bytes
            self.obj_bs = BeautifulSoup(self.obj_html, "html.parser")
        else:
            print("!!! Warning !!! ---> 【URL】为空")
            print()

    def __del__(self):
        pass

    # ============================
    # File operations

    def path_check(self, str_path):
        """Create directory *str_path* (including parents) if it does not exist yet."""
        if not os.path.exists(str_path):
            os.makedirs(str_path)

    def file_write(self, obj_source, target_path, file_type):
        """Write *obj_source* to *target_path*.

        file_type -- "file" writes bytes in binary mode; "text" writes a
        UTF-8-encoded text file.

        Raises ValueError for any other *file_type* (the original crashed with
        an opaque AttributeError on None).  Files are closed via ``with`` even
        if the write fails (the original leaked the handle on error).
        """
        if file_type == "file":
            with open(target_path, 'wb') as obj_file:
                obj_file.write(obj_source)
        elif file_type == "text":
            with open(target_path, 'w', encoding='utf-8') as obj_file:
                obj_file.write(obj_source)
        else:
            raise ValueError("unsupported file_type: " + repr(file_type))

    # ============================
    # Resource operations / basics

    def obj_url(self, resource_url):
        """Download *resource_url* and return the raw response bytes."""
        # Context manager guarantees the connection is closed (leaked before)
        with request.urlopen(url=resource_url) as obj_resp:
            return obj_resp.read()

    # ============================
    # Resource operations / search rules

    def find_m3u8(self, str_html_label="", str_attrs_key="", str_attrs_value=""):
        """Find tags matching label/attribute regex on the parsed page; print each
        tag's ``data-normal`` attribute and return the BeautifulSoup result set.

        str_html_label  -- HTML tag name
        str_attrs_key   -- attribute name to match
        str_attrs_value -- regex applied to the attribute value
        """
        path_download_current_base = self.path_download_base + "/m3u8"
        self.path_check(str_path=path_download_current_base)
        # find_all is the current name of the deprecated findAll alias
        data_find_result_m3u8 = self.obj_bs.find_all(
            str_html_label, attrs={str_attrs_key: re.compile(str_attrs_value)})
        print("找到的【m3u8】:")
        for item in data_find_result_m3u8:
            print("==================")
            print(item.get('data-normal'))
        return data_find_result_m3u8

    def find_img(self, str_html_label="", str_attrs_key="", str_attrs_value=""):
        """Find tags matching label/attribute regex on the parsed page; print each
        tag's ``file`` attribute and return the BeautifulSoup result set.

        str_html_label  -- HTML tag name
        str_attrs_key   -- attribute name to match
        str_attrs_value -- regex applied to the attribute value
        """
        path_download_current_base = self.path_download_base + "/img"
        self.path_check(str_path=path_download_current_base)
        data_find_result_img = self.obj_bs.find_all(
            str_html_label, attrs={str_attrs_key: re.compile(str_attrs_value)})
        print("找到的【图片】:")
        for item in data_find_result_img:
            print("==================")
            print(item.get('file'))
        return data_find_result_img

    # ============================
    # iciba.com "daily sentence" (standalone requests, independent of __init__)

    def get_iciba_daily(self, str_date=""):
        """Fetch the iciba.com daily sentence for *str_date* ("YYYY-MM-DD";
        defaults to today) and save its picture plus the English/Chinese text
        under download/iciba.com/YYYY/YYYY-MM/YYYY-MM-DD/.

        Raises ValueError when the endpoint returns incomplete data for the date.
        """
        # Today's date, used as the default
        str_date_current = time.strftime("%Y-%m-%d", time.localtime())
        date_base_formatted = str_date if str_date else str_date_current
        date_base = time.strptime(date_base_formatted, "%Y-%m-%d")
        # Components of the download path hierarchy
        str_date_current_year = time.strftime("%Y", date_base)
        str_date_current_month = time.strftime("%Y-%m", date_base)
        path_download_current_base = self.path_download_base + "/iciba.com"
        path_download_current = (path_download_current_base + "/" + str_date_current_year
                                 + "/" + str_date_current_month + "/" + date_base_formatted)
        # Output files
        file_download_current_image = path_download_current + "/" + date_base_formatted + ".jpg"
        file_download_current_content_en = path_download_current + "/" + date_base_formatted + "_content_en.txt"
        file_download_current_content_zh = path_download_current + "/" + date_base_formatted + "_content_zh.txt"
        self.path_check(str_path=path_download_current)
        # Undocumented JSON endpoint, keyed by the date string
        str_url = "http://sentence.iciba.com/?&c=dailysentence&m=getdetail&title=" + date_base_formatted
        obj_resp = requests.get(str_url)
        # json.loads already yields a dict; the original re-wrapped it in dict() three times
        data_html_json = json.loads(obj_resp.text)
        data_picture = data_html_json.get('picture2')
        data_content_en = data_html_json.get('content')
        data_content_zh = data_html_json.get('note')
        # Fail early with a clear message instead of TypeError on string concat below
        if data_picture is None or data_content_en is None or data_content_zh is None:
            raise ValueError("iciba.com returned incomplete data for " + date_base_formatted)
        # Download the picture bytes and write all three files
        obj_data_picture = self.obj_url(resource_url=data_picture)
        self.file_write(obj_source=obj_data_picture, target_path=file_download_current_image, file_type="file")
        self.file_write(obj_source=data_content_en, target_path=file_download_current_content_en, file_type="text")
        self.file_write(obj_source=data_content_zh, target_path=file_download_current_content_zh, file_type="text")
        # Display (fix: report the requested date, not today's, on the 日期 line)
        print("路径:【" + path_download_current + "】")
        print("日期:【" + date_base_formatted + "】")
        print("图片:【" + data_picture + "】")
        print("英文:【" + data_content_en + "】")
        print("中文:【" + data_content_zh + "】")

    def get_iciba_daily_by_DateRange(self, date_begin, date_end):
        """Crawl the daily sentence for every day in [date_begin, date_end]
        (inclusive; any date format pandas.date_range accepts)."""
        for date_item in pd.date_range(date_begin, date_end):
            # Timestamp.strftime replaces the original str()/strptime round-trip
            date_item_formatted = date_item.strftime("%Y-%m-%d")
            self.get_iciba_daily(str_date=date_item_formatted)
            print("-------")
            print(date_item_formatted)
# Section: main()
if __name__ == "__main__":
    # Example: scrape an arbitrary page and search it for media tags
    # obj_dig_data_web = Dig_Data_Web(str_url=url_address)
    # obj_dig_data_web.find_m3u8(str_html_label="div",str_attrs_key="class",str_attrs_value="playerWrap ckplayerPlugin")
    # obj_dig_data_web.find_img(str_html_label="img", str_attrs_key="id",str_attrs_value="detail-banner-img")
    # Batch-crawl the iciba.com daily sentence over a fixed date range
    crawler = Dig_Data_Web()
    crawler.get_iciba_daily_by_DateRange('2019/12/3', '2020/1/20')
# Section: End
# Finished
执行效果如下:
就是这样,...
代码详情不解释了。
有问题留言。
上一篇: python爬虫之通过pyquery爬取大众点评评论信息
下一篇: Flash水坑钓鱼