Demystifying Multithreading in Python Web Scraping
Multithreading in Python is often dismissed as a half-wasted feature, because the GIL keeps threads from running CPU-bound code in parallel. We still flock to it anyway, because crawlers spend most of their time waiting on the network, and there multithreading really can boost the efficiency of our code enormously. So, as a learner, I approached multithreading with an open mind. The road was a hard one and I hit plenty of problems along the way, but those problems also helped me grow a great deal as a crawler writer.
The multithreading most people learn is the classic pattern of subclassing a thread class and overriding its run method, so I won't rehash that here. But in an online course I came across a different way of writing it. This queue-based construction maps much more naturally onto the logic of a crawler and is simpler, although it also hides quite a few pitfalls.
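For reference, the conventional subclass style usually looks roughly like the sketch below. This is a minimal illustration of my own, not code from the course; the class name and URLs are made up.
import threading

class DownloadThread(threading.Thread):
    """Conventional style: subclass Thread and override run()."""
    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        # whatever work this thread should do goes here
        print("downloading", self.url)

threads = [DownloadThread(u) for u in ["url-1", "url-2"]]
for t in threads:
    t.start()
for t in threads:
    t.join()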
The new style is built around putting values into and taking values out of queues: producer threads keep the queues topped up with fresh data while multiple consumer threads process it. Before learning this approach you need a solid grasp of the Queue methods, and you also need to understand how the threads relate to one another logically and where they block.
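To make the blocking behaviour concrete, here is a tiny standalone sketch (my own, not from the course) of the Queue calls this pattern relies on: put, get, task_done and join.
from queue import Queue
import threading

q = Queue()

def worker():
    while True:
        item = q.get()       # blocks until something is available
        print("processing", item)
        q.task_done()        # mark this item as fully handled

threading.Thread(target=worker, daemon=True).start()

for i in range(5):
    q.put(i)                 # producer side: hand work to the consumer

q.join()                     # blocks until every put() has a matching task_done()
print("all queued work finished")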
Next, here is my code, together with the pitfalls I ran into.
# -*- coding: utf-8 -*-
# Created : 2019/4/26 19:17
# author :GL
import requests
import os
from lxml import etree
from queue import Queue
import threading
import time

# The cookie string is not shown in this post; paste your own browser cookie here.
cookie = ''


class ShoeSpider:
    def __init__(self, name):
        self.name = str(name)
        self.num = 1
        self.base_json_url = 'https://www.shopstyle.com/api/v2/products?fts=' + self.name + '&includeLooks=true&includeProducts=true&includeSavedQueryId=true&limit=40&locales=all&maxNumFilters=1000&numLooks=20&offset={}&pid=shopstyle&prevCat=women&productScore=LessPopularityEPC&url=%2Fbrowse&useElasticsearch=true&view=angular2'
        self.base_url = 'https://www.shopstyle.com/browse/flats?fts=Slingback+Flats&r=therealreal-us'
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "cookie": cookie,
            "pragma": "no-cache",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
        }
        self.info_queue = Queue()   # raw JSON responses waiting to be parsed
        self.url_queue = Queue()    # lists of image URLs waiting to be downloaded

    def parse_json(self):
        # Producer: request every result page and put the JSON into info_queue.
        response = requests.get(self.base_json_url.format(0), headers=self.headers).json()
        total_page = response["metadata"]["total"]
        for now_page in range(0, int(total_page), 80):
            response = requests.get(self.base_json_url.format(now_page), headers=self.headers).json()
            self.info_queue.put(response)

    def parse_detail_json(self):
        # Consumer of info_queue, producer of url_queue.
        while True:
            response = self.info_queue.get()
            results = response["products"]
            result_list = []
            for result in results:
                goods_name = result['name']
                img_src = result['image']['sizes']['Best']["url"]
                name = self.name.split(" ")
                for i in name:
                    if i in goods_name.title():
                        print(img_src)
                        result_list.append(img_src)
                        break
            self.url_queue.put(result_list)
            self.info_queue.task_done()

    def write_to_file(self):
        # Consumer of url_queue: download each image and write it to disk.
        while True:
            url_list = self.url_queue.get()
            for url in url_list:
                content = requests.get(url).content
                path = './images/women/shoes/Flat/Ballet/裹踝平底鞋/' + self.name + '/'
                if not os.path.exists(path):
                    os.makedirs(path)
                filename = url.split("/")[-1]
                filename = path + filename + ".jpg"
                with open(filename, "wb") as f:
                    f.write(content)
                    print("Image {} written successfully!".format(self.num))
                    self.num += 1
            self.url_queue.task_done()

    def run(self):
        thread_list = []
        t_info = threading.Thread(target=self.parse_json)
        thread_list.append(t_info)
        for i in range(10):
            t_url = threading.Thread(target=self.parse_detail_json)
            thread_list.append(t_url)
        for i in range(10):
            t_save = threading.Thread(target=self.write_to_file)
            thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True)   # daemon threads die together with the main thread
            t.start()
        time.sleep(3)           # give parse_json time to put the first response into info_queue
        for q in [self.info_queue, self.url_queue]:
            q.join()            # block until every task in the queue has been marked done
        print("Done!")


if __name__ == '__main__':
    # name = input("Enter the product name to search for: ")
    name = "Suede Loafers"
    name = name.title()
    print("Your search keyword is:", name)
    shoe = ShoeSpider(name)
    shoe.run()
When I first learned this multithreading approach from the course video, the teacher's first queue was filled with values that were constructed locally, so his code worked as soon as I copied it over. But when I folded the same pattern into my own spider, my program would finish immediately. For a long time I blamed setDaemon, because if I commented that call out the program would run, only it could never stop at the end. I puzzled over it for ages without working out why. Then, while I was debugging another program, a request timed out, and it suddenly clicked: the first values in my queue come from a network request. It may only take a few seconds, but we cannot judge a computer by our own sense of time; for the program that wait can be an eternity. By the time the main thread reached queue.join(), the queues might not have received a single value yet, so join() returned at once on the empty queues and the whole program exited before any real work had been done.
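The time.sleep(3) in my run() papers over this race. A sturdier fix is to join the producer thread before joining the queues, so queue.join() can never be called while the queue is still empty. Here is a standalone sketch of that idea (the names are illustrative, not taken from my spider); applied to ShoeSpider it would mean calling t_info.join() before the q.join() loop instead of sleeping a fixed three seconds.
import threading
import time
from queue import Queue

work_queue = Queue()

def slow_producer():
    time.sleep(2)                # pretend this is a slow network request
    for i in range(5):
        work_queue.put(i)

def consumer():
    while True:
        item = work_queue.get()
        print("handled", item)
        work_queue.task_done()

producer = threading.Thread(target=slow_producer)
producer.start()

for _ in range(3):
    threading.Thread(target=consumer, daemon=True).start()

producer.join()                  # wait until everything has really been put into the queue
work_queue.join()                # now join() cannot run against a still-empty queue
print("done")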
Below is the code I originally learned from, posted for comparison, so that everyone can take the lesson on board and not fall into the same pit.
# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author :GL
from queue import Queue
import requests
from lxml import etree
import json
import threading


class QiushiSpider:
    def __init__(self):
        self.start_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.qiushibaike.com",
            "Referer": "https://www.qiushibaike.com/hot/",
        }
        self.url_queue = Queue()      # page URLs to be requested
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # parsed items waiting to be saved

    def get_url_list(self):
        # return [self.start_url.format(i) for i in range(1, 14)]
        for i in range(1, 14):
            self.url_queue.put(self.start_url.format(i))

    def parse_url(self):
        while True:
            url = self.url_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        while True:
            html_info = self.html_queue.get()
            html = etree.HTML(html_info)
            content_list = []
            div_list = html.xpath("//div[@id='content-left']/div")
            for div in div_list:
                item = {}
                item['name'] = div.xpath(".//div[@class='author clearfix'][1]/a[2]/h2/text()")
                item['name'] = [item['name'][i].replace("\n", "") for i in range(len(item['name']))]
                item['info'] = div.xpath(".//div[@class='content']/span/text()")
                item['info'] = [item['info'][i].replace("\n", "") for i in range(len(item['info']))]
                item['reader'] = div.xpath(".//div[@class='stats']/span[1]/i/text()")
                item['img'] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item['img'] = "https:" + item['img'][0] if len(item['img']) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                with open('qiushi_1.json', 'a', encoding="utf-8") as f:
                    f.write(json.dumps(i, ensure_ascii=False, indent=4))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        # url_list = self.get_url_list()
        thread_list = []
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        for i in range(3):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        for i in range(2):
            t_html = threading.Thread(target=self.get_content_list)
            thread_list.append(t_html)
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True)  # daemon threads: they are killed as soon as the main thread exits
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()           # block the main thread until every task in this queue is marked done
        print("Done!")
        # for url in url_list:
        #     html_info = self.parse_url(url)
        #     content_list = self.get_content_list(html_info)
        #     self.save_content_list(content_list)


if __name__ == '__main__':
    qiushi = QiushiSpider()
    qiushi.run()
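Both spiders above rely on daemon threads plus queue.join() to shut down. Another common pattern, shown here only as a sketch of my own (not something from the course), is the "poison pill": push one sentinel value per worker so each while True loop can break out on its own and no daemon threads are needed.
import threading
from queue import Queue

SENTINEL = object()
q = Queue()

def worker():
    while True:
        item = q.get()
        if item is SENTINEL:     # poison pill: this worker should stop
            q.task_done()
            break
        print("working on", item)
        q.task_done()

workers = [threading.Thread(target=worker) for _ in range(3)]
for w in workers:
    w.start()

for i in range(10):
    q.put(i)
for _ in workers:
    q.put(SENTINEL)              # one sentinel per worker thread

q.join()
for w in workers:
    w.join()
print("all workers exited cleanly")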
If you have any questions, I'm happy to discuss!
QQ: 986361369