14.多线程爬虫, 及抖音视频爬取小结
程序员文章站
2022-03-02 13:26:24
...
1.理解Queue队列中join()与task_done()的关系
2. 多线程爬虫(糗事百科)
1. 实现
# coding=utf-8
import requests
from lxml import etree
import threading
from queue import Queue # 引入队列
class QiubaiSpdier:
    """Multi-threaded crawler for qiushibaike.com list pages.

    Work flows through three queues:
    url_queue (pages to fetch) -> html_queue (raw HTML) ->
    content_queue (lists of parsed item dicts).
    Consumer threads loop forever as daemons; run() joins the queues so
    the main thread exits once every enqueued task is marked done.
    """

    def __init__(self):
        self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/63.0.3239.84 Safari/537.36"}
        self.url_queue = Queue()      # URLs waiting to be fetched
        self.html_queue = Queue()     # fetched page sources
        self.content_queue = Queue()  # parsed results, one list per page

    def get_url_list(self):
        """Enqueue the list-page URLs (pages 1..3) onto url_queue."""
        for i in range(1, 4):
            self.url_queue.put(self.url_temp.format(i))

    def parse_url(self):
        """Fetch pages forever: pop a URL, push its decoded HTML."""
        while True:
            url = self.url_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()  # lets url_queue.join() unblock

    def get_content_list(self):
        """Parse HTML forever: extract one item dict per joke <div>."""
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")  # one div per joke
            content_list = []
            for div in div_list:
                item = {}
                item["content"] = div.xpath(".//div[@class='content']/span/text()")
                item["content"] = [i.replace("\n", "") for i in item["content"]]
                # gender is encoded in the class name, e.g. "... womenIcon"
                item["author_gender"] = div.xpath(".//div[contains(@class,'articleGender')]/@class")
                item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(
                    item["author_gender"]) > 0 else None
                # NOTE: "auhtor_age" key typo kept for backward compatibility
                item["auhtor_age"] = div.xpath(".//div[contains(@class,'articleGender')]/text()")
                item["auhtor_age"] = item["auhtor_age"][0] if len(item["auhtor_age"]) > 0 else None
                # image srcs are protocol-relative, so prefix "https:"
                item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item["content_img"] = "https:" + item["content_img"][0] if len(item["content_img"]) > 0 else None
                item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
                item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"]) > 0 else None
                item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
                item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        """Consume parsed results forever; 'saving' is just printing."""
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                print(i)
            self.content_queue.task_done()

    def run(self):
        """Start all worker threads and wait for the queues to drain."""
        thread_list = []
        # 1. produce the URL list
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        # 2. fetch pages with 20 concurrent workers
        for i in range(20):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        # 3. extract data with 2 workers
        for i in range(2):
            t_html = threading.Thread(target=self.get_content_list)
            thread_list.append(t_html)
        # 4. save results
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            # Daemon threads die with the main thread; Thread.setDaemon()
            # is deprecated (Python 3.10+), assign the attribute instead.
            t.daemon = True
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block until every task in q got task_done()
        print("主线程结束")


if __name__ == '__main__':
    qiubai = QiubaiSpdier()
    qiubai.run()
2.1. 注意事项
- 主要掌握多线程配合Queue的使用,内容没有太大价值,并且网页结构发生了变化
3. 使用python爬虫,批量爬取抖音app视频(requests+Fiddler+appium)
00. 抖音爬虫教程链接: https://www.cnblogs.com/stevenshushu/p/9635097.html
01. windows虚拟环境搭建链接:https://blog.csdn.net/qq_33404767/article/details/86479820
02. Fiddler的安装和使用 https://blog.csdn.net/ychgyyn/article/details/82154433
- Fiddler乱码解决方法: https://blog.csdn.net/quiet_girl/article/details/50577828
03. appium桌面版1.6.5+python3.6
- appium1.6.3也通过了,不要下太高的版本,会找不到adb https://blog.csdn.net/qq_33236708/article/details/78061787
- Appium-Desktop 下载地址:https://testerhome.com/topics/680
04. 定位元素链接:
- https://www.jianshu.com/p/6d71624cb5bb
- https://www.cnblogs.com/bendouyao/p/9346379.html
- 自动化控制计算器小案例
# coding=utf-8
# Appium demo: drives the stock Android calculator app and keys in
# "159 + 23 =" by clicking digit/operator buttons located by resource id.
from appium import webdriver
import time

desired_caps = {}
desired_caps['platformName'] = 'Android'
desired_caps['platformVersion'] = '5.0'
desired_caps['deviceName'] = 'T7G0215A14000138'  # adb device serial
desired_caps['appPackage'] = 'com.android.calculator2'
desired_caps['appActivity'] = '.Calculator'
desired_caps["unicodeKeyboard"] = "True"  # Appium-provided IME; lets tests send Unicode (e.g. Chinese) text
desired_caps["resetKeyboard"] = "True"  # restore the device's original IME when the session ends
desired_caps["noReset"] = "False"  # "False" = app state IS reset each session. NOTE(review): string, not bool — confirm Appium accepts it
driver = webdriver.Remote('http://localhost:4723/wd/hub', desired_caps)
driver.find_element_by_id("com.android.calculator2:id/digit1").click()
driver.find_element_by_id("com.android.calculator2:id/digit5").click()
driver.find_element_by_id("com.android.calculator2:id/digit9").click()
driver.find_element_by_id("com.android.calculator2:id/plus").click()
driver.find_element_by_id("com.android.calculator2:id/digit2").click()
driver.find_element_by_id("com.android.calculator2:id/digit3").click()
driver.find_element_by_id("com.android.calculator2:id/equal").click()
time.sleep(5)  # leave the result on screen briefly before quitting
driver.quit()
- 定位QQ案例的小案例
# coding=utf-8
# Appium demo: launches the QQ app, dismisses the startup dialog, and
# performs a scripted login via element ids / accessibility ids / XPath.
from appium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

desired_caps = {}
desired_caps['platformName'] = 'Android'
desired_caps['platformVersion'] = '5.0'
desired_caps['deviceName'] = 'T7G0215A14000138'  # adb device serial
desired_caps['appPackage'] = 'com.tencent.mobileqq'
desired_caps['appActivity'] = '.activity.SplashActivity'
desired_caps["unicodeKeyboard"] = "True"  # Appium-provided IME; lets tests send Unicode (e.g. Chinese) text
desired_caps["resetKeyboard"] = "True"  # restore the device's original IME when the session ends
desired_caps["noReset"] = "False"  # "False" = app state IS reset each session. NOTE(review): string, not bool — confirm Appium accepts it
driver = webdriver.Remote('http://localhost:4723/wd/hub', desired_caps)
WebDriverWait(driver, 60).until(
    # this can only wait for the element itself, not for one of its attributes
    EC.presence_of_element_located((By.ID, "com.tencent.mobileqq:id/dialogRightBtn"))
)
driver.find_element_by_id("com.tencent.mobileqq:id/dialogRightBtn").click()
time.sleep(1)
driver.find_element_by_id("com.tencent.mobileqq:id/btn_login").click()
time.sleep(1)
element = driver.find_element_by_accessibility_id('请输入QQ号码或手机或邮箱')
element.clear()
element.send_keys("594042358")
# driver.find_element_by_xpath('//android.widget.EditText[@content-desc="请输入QQ号码或手机或邮箱"]').send_keys("594042358")
driver.find_element_by_xpath('//android.widget.EditText[@content-desc="密码 安全"]').send_keys("fanjianhai")
time.sleep(1)
driver.find_element_by_xpath('//android.widget.ImageView[@content-desc="登 录"]').click()
time.sleep(8)
driver.quit()
# Reference copy of the same desired capabilities as JSON (e.g. for Appium Desktop):
"""
{
"platformName": "Android",
"platformVersion": "5.0",
"deviceName": "T7G0215A14000138",
"appPackage": "com.tencent.mobileqq",
"appActivity": ".activity.SplashActivity"
}
"""
05. 抖音小视频综合案例爬取
- fiddler脚本
// begin: save-captured-URLs-to-local-file addition
// NOTE(review): Fiddler CustomRules (JScript.NET) snippet — presumably
// pasted into a session handler such as OnBeforeResponse; confirm placement.
try{
    // Douyin video CDN host filter: ||"v1-dy.ixigua.com"||"v3-dy.ixigua.com"||"v6-dy.ixigua.com"||"v9-dy.ixigua.com"||
    if (oSession.fullUrl.Contains("ixigua.com")){
        var d = new Date();
        var fso;
        var file;
        var hours = d.getHours();
        var hoursValue = parseInt(hours)
        // zero-pad the hour to two digits
        if(hoursValue>= 0 && hoursValue <= 9)
        {
            hoursValue = "0" + hoursValue
        }
        var minutes = d.getMinutes();
        var minutesValue = parseInt(minutes)
        // zero-pad the minute to two digits
        if(minutesValue>= 0 && minutesValue <= 9)
        {
            minutesValue = "0" + minutesValue
        }
        fso = new ActiveXObject("Scripting.FileSystemObject");
        // output path (one file per weekday/hour/minute); customizable
        file = fso.OpenTextFile("D:\\douyin\\url\\video_url_"+d.getDay()+"_"+hoursValue + "_" + minutesValue + ".txt" ,8 ,true);
        //file.writeLine("Request-url:" + oSession.url);
        file.writeLine("http://"+oSession.url)
        //file.writeLine("Request-host:" + oSession.host);
        //file.writeLine("Request-header:" + "\n" + oSession.oRequest.headers);
        //file.writeLine("Request-body:" + oSession.GetRequestBodyAsString());
        //file.writeLine("\n");
        file.close();
    }
}catch(e){
    // append any failure to an error log instead of breaking Fiddler
    var fsoErr = new ActiveXObject("Scripting.FileSystemObject");
    var file = fsoErr.OpenTextFile("D:\\douyin\\error_log.txt" ,8 ,true);
    file.writeLine(e.description);
}
// end: save-to-local addition
douyin_appium.py
from appium import webdriver
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class Action:
    """Drives the Douyin app on a device: dismisses the permission
    dialog, wakes the feed with a tap, then swipes up forever so the
    Fiddler script can capture the video URLs in the background."""

    def __init__(self):
        # Desired Capabilities describing the target device and app.
        self.desired_caps = {
            "platformName": "Android",
            "platformVersion": "5.0",
            "deviceName": "T7G0215A14000138",
            "appPackage": "com.ss.android.ugc.aweme",
            "appActivity": ".splash.SplashActivity"
        }
        # Appium server endpoint; opening the session launches the app.
        self.server = 'http://localhost:4723/wd/hub'
        self.driver = webdriver.Remote(self.server, self.desired_caps)
        # Swipe gesture geometry: start point and vertical travel.
        self.start_x = 500
        self.start_y = 1500
        self.distance = 1300

    def allow_permission(self):
        """Click the startup permission dialog's button, waiting up to
        10 s for it; if it never appears, log and move on."""
        try:
            locator = (By.ID, "com.ss.android.ugc.aweme:id/y0")
            # presence_of_element_located can only track the element
            # itself, not a specific attribute of it
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(locator))
            self.driver.find_element_by_id("com.ss.android.ugc.aweme:id/y0").click()
        except Exception as err:
            print(err)

    def comments(self):
        """Tap the screen once after startup so the feed is active."""
        sleep(3)
        self.driver.tap([(500, 1200)], 500)

    def scroll(self):
        """Swipe up to the next video every 3 seconds, forever."""
        while True:
            try:
                end_y = self.start_y - self.distance
                self.driver.swipe(self.start_x, self.start_y,
                                  self.start_x, end_y)
                sleep(3)
            except Exception as err:
                print(err)

    def run(self):
        """Permission dialog -> wake-up tap -> endless scrolling."""
        self.allow_permission()
        self.comments()
        self.scroll()


if __name__ == '__main__':
    Action().run()
"""
{
"platformName": "Android",
"platformVersion": "5.0",
"deviceName": "T7G0215A14000138",
"appPackage": "com.ss.android.ugc.aweme",
"appActivity": ".splash.SplashActivity"
}
"""
douyin_spider.py
# _*_ coding: utf-8 _*_
import time
from urllib.request import urlretrieve
from urllib.error import ContentTooShortError
import os
import re
filter_pool = []  # URLs already returned by earlier calls (cross-file dedup)


def distinct_data(pathtxt):
    """Read captured URLs from *pathtxt* and return the new, unique ones.

    Deduplicates twice: within the file by the video id embedded in the
    URL path (``.../m/<id>/?rc=...``, keeping the last occurrence), and
    across calls via the module-level ``filter_pool``.

    :param pathtxt: path of a text file with one URL per line
    :return: list of URLs not seen in any previous call
    """
    global filter_pool
    # Strip the trailing newline/whitespace from every captured line.
    with open(pathtxt, encoding="utf-8") as f:
        datalist_blank = [line.strip() for line in f]
    # Keep one URL per video id; a later duplicate overwrites the earlier.
    data_dict = {}
    for data in datalist_blank:
        # Raw string: the original "\/" escapes are invalid escape
        # sequences in a plain string (SyntaxWarning on modern Python);
        # "/" needs no escaping in a regex anyway.
        ret = re.match(r".+/m/(.+)/\?rc=", data)
        if not ret:
            continue
        data_dict[ret.group(1)] = data
    # Filter against the cross-call pool and remember what we hand out.
    data_new = []
    for url in data_dict.values():
        if url not in filter_pool:
            data_new.append(url)
            filter_pool.append(url)
    return data_new
def download(data_list, url_dir):
    """Download every video URL in *data_list*, then delete the URL file.

    :param data_list: deduplicated video URLs to fetch
    :param url_dir: file name of the consumed URL list, e.g.
        ``video_url_1_10_30.txt`` — reused to derive the output directory
    """
    # e.g. "video_url_1_10_30.txt" -> "D:/douyin/video/video_1_10_30"
    video_dir = "D:/douyin/video/{}".format(url_dir[:-4].replace("video_url", "video"))
    # makedirs(exist_ok=True): plain os.mkdir raised FileNotFoundError
    # when the parent directory did not exist yet.
    os.makedirs(video_dir, exist_ok=True)
    for index, data in enumerate(data_list):
        video_path = video_dir + "/{}.mp4".format(index)
        try:
            # NOTE(review): urlretrieve is legacy API; kept for parity
            urlretrieve(data, video_path)
            time.sleep(3)  # throttle between downloads
            print("{}--下载完成".format(video_path))
        except ContentTooShortError:
            # truncated download: skip this video, keep going
            pass
    # Remove the consumed URL file. Relies on the module-level
    # URL_DIRECTOR_ROOT_PATH set by the __main__ driver.
    os.remove(URL_DIRECTOR_ROOT_PATH + "\\" + url_dir)
if __name__ == '__main__':
    URL_DIRECTOR_ROOT_PATH = r"D:\douyin\url"
    # Poll the URL directory written by the Fiddler script; start
    # deduplicating/downloading once at least two files exist (the
    # newest file may still be in the middle of being written).
    # filter_pool bounds dedup memory: past 1000 entries the oldest 100
    # are dropped (the original note said 500/50; the code uses 1000/100).
    while True:
        url_dirs = os.listdir(URL_DIRECTOR_ROOT_PATH)
        try:
            if url_dirs.__len__() >= 2:
                print(url_dirs)
                # NOTE(review): os.listdir order is platform-dependent;
                # presumably the timestamped names list oldest-first — confirm
                url_dir = url_dirs.pop(0)
                print(url_dir)
                # dedup the captured URLs of this file, then download them
                data_list = distinct_data(URL_DIRECTOR_ROOT_PATH + "\\" + url_dir)
                download(data_list, url_dir)
            else:
                print("请稍等,还没来得及抓包...")
                # only one (possibly still-growing) file: wait 20 s
                time.sleep(20)
            if filter_pool.__len__() > 1000:
                filter_pool = filter_pool[100:]
        except Exception as e:
            print(e)
            time.sleep(1)
JScript脚本链接:http://doc.51windows.net/jscript5/?url=/jscript5/dir.htm
上一篇: 一加8上手:仅重180g 2020年里不可多得的轻薄真香旗舰
下一篇: python编码的意义