多线程爬取狗妈表情包
程序员文章站
2022-06-15 10:09:57
通过多线程爬取狗妈表情包import requestsimport jsonfrom jsonpath import jsonpathimport threadingimport queueimport time#创建下载线程class downlodethread(threading.Thread): def __init__(self,name,urlpage): super().__init__(name=name) self.url....
通过多线程爬取狗妈表情包
import requests
import json
from jsonpath import jsonpath
import threading
import queue
import time
#创建下载线程
class downlodethread(threading.Thread):
    """Worker thread that downloads every image URL taken from a shared queue.

    Several instances may drain the same queue concurrently; each one exits
    as soon as the queue has no more URLs to hand out.

    Args:
        name: Thread name, also used in the progress messages.
        urlpage: ``queue.Queue`` holding image URLs to download.
        save_dir: Directory prefix the files are written under. It is
            concatenated directly with the file name, so it must end with a
            path separator. The directory is expected to exist already.
    """

    # Browser-like User-Agent sent with every image request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    def __init__(self, name, urlpage, save_dir=r'E:\\狗妈表情包\\'):
        super().__init__(name=name)
        self.urlpage = urlpage
        self.save_dir = save_dir

    def run(self):
        """Download URLs from the queue until it is exhausted."""
        print(f"{self.name}正在运行")
        while True:
            # Checking empty() and then calling get() is racy when several
            # workers share the queue: another thread can take the last item
            # in between. Ask for an item directly and treat Empty as "done".
            try:
                url = self.urlpage.get_nowait()
            except queue.Empty:
                break
            # A fixed slice of the URL is used as the file name.
            # NOTE(review): assumes the image id sits at characters 58-77 of
            # every thumbnail URL - confirm against real responses.
            d = url[58:78]
            reponse = requests.get(url=url, headers=self.HEADERS)
            with open(self.save_dir + str(d) + '.jpg', 'wb') as f:
                print(self.name, '正在下载')
                f.write(reponse.content)
#分析请求爬取数据
class crawl(threading.Thread):
    """Worker thread that fetches search-result pages and queues their text.

    Each worker takes page URLs from ``first_queue``, requests them, and puts
    the response body on the module-level ``second_queue`` for later parsing.

    Args:
        name: Thread name, also used in the progress messages.
        first_queue: ``queue.Queue`` holding the page URLs to fetch.
    """

    def __init__(self, name, first_queue):
        super().__init__(name=name)
        # Keep the queue that was passed in instead of silently relying on a
        # module-level global of the same name.
        self.first_queue = first_queue

    def run(self):
        """Fetch pages until the input queue is exhausted."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
                   }
        print(f"{self.name}正在运行")
        while True:
            # get_nowait() + Empty avoids the race between empty() and get()
            # when several crawl threads share the same queue.
            try:
                t = self.first_queue.get_nowait()
            except queue.Empty:
                break
            print(f"{self.name}正在运行")
            response = requests.get(url=t, headers=headers)
            # queue.Queue does its own locking, so no extra Lock is needed
            # (a Lock created inside run() is private to this thread anyway).
            second_queue.put(response.text)
        print(f"{self.name}运行结束")
def parse_response(t):
    """Extract thumbnail URLs from one JSON response and queue them.

    Args:
        t: Response body of a search request, as a JSON string.

    Every ``thumbURL`` value found under a ``data`` key is put on the
    module-level ``urlpage`` queue. A page without any match adds nothing.

    Raises:
        json.JSONDecodeError: If ``t`` is not valid JSON.
    """
    data = json.loads(t)
    x = jsonpath(data, '$..data..thumbURL')
    print('解析线程启动')
    # jsonpath() returns False (not an empty list) when nothing matches;
    # iterating over that would raise TypeError.
    if not x:
        return
    for i in x:
        urlpage.put(i)
if __name__ == '__main__':
    # Pipeline: page URLs -> raw response text -> thumbnail URLs -> files.
    first_queue = queue.Queue()   # search-result page URLs to fetch
    second_queue = queue.Queue()  # raw JSON text of fetched pages
    urlpage = queue.Queue()       # thumbnail URLs to download

    # The endpoint was found by inspecting the page's ajax requests; `pn` is
    # the result offset and each request returns 30 entries.
    for i in range(0, 480, 30):
        first_queue.put(
            "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result"
            "&queryWord=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85"
            "&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright="
            "&word=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85"
            "&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force="
            f"&pn={i}&rn=30&gsm=5a&1594005271382="
        )

    # Stage 1: fetch all pages. Start every worker first, then wait for all
    # of them, so they really run concurrently and none is left unjoined.
    crawl_threads = [crawl(f'爬取{c}', first_queue) for c in range(0, 3)]
    for worker in crawl_threads:
        worker.start()
    for worker in crawl_threads:
        worker.join()

    # Stage 2: parse every fetched page. The callable and its argument are
    # passed separately; writing target=parse_response(text) would run the
    # parser right here and hand Thread a target of None.
    parse_threads = []
    while not second_queue.empty():
        page_text = second_queue.get()
        parser = threading.Thread(target=parse_response, args=(page_text,))
        parser.start()
        parse_threads.append(parser)
    # Joining replaces the former fixed sleep: downloads start only after
    # every parser has finished filling the URL queue.
    for parser in parse_threads:
        parser.join()

    # Stage 3: download all thumbnails.
    download_threads = [downlodethread(f'下载{e}', urlpage) for e in range(0, 3)]
    for worker in download_threads:
        worker.start()
    for worker in download_threads:
        worker.join()
这是从百度图片中抓取下来的狗妈表情包。因为百度图片使用 Ajax 请求动态加载数据,所以需要先分析网页的请求,再解析返回的 JSON 数据。
本文地址:https://blog.csdn.net/snake_boy_/article/details/107666775