欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

多线程爬取狗妈表情包

程序员文章站 2022-06-15 10:09:57
通过多线程爬取狗妈表情包:使用 requests 抓取百度图片的 ajax 接口,jsonpath 解析返回的 JSON 数据,threading 与 queue 实现多线程下载,完整代码见下文。

 通过多线程爬取狗妈表情包

import requests
import json
from jsonpath import jsonpath
import threading
import queue
import time
# Downloader thread: drains image URLs from a shared queue and writes each
# image to disk as <id>.jpg.
class downlodethread(threading.Thread):
    def __init__(self, name, urlpage):
        """name: thread name; urlpage: queue.Queue of image URLs to download.

        NOTE: the original used mixed tabs/spaces here, which raises
        TabError under Python 3; indentation normalized to 4 spaces.
        """
        super().__init__(name=name)
        self.urlpage = urlpage

    def run(self):
        print(f"{self.name}正在运行")
        # Invariant request headers hoisted out of the download loop.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
            }
        # Drain the queue until empty. get(block=False) + queue.Empty closes
        # the race where another downloader grabs the last item between an
        # empty() check and the get() call.
        while True:
            try:
                url = self.urlpage.get(block=False)
            except queue.Empty:
                break
            # Image id is a fixed slice of the URL (original author's scheme);
            # fragile, but kept so output file names stay backward-compatible.
            d = url[58:78]
            response = requests.get(url=url, headers=headers)
            # Target folder must already exist; adjust the path for your machine.
            with open(r'E:\\狗妈表情包\\' + str(d) + '.jpg', 'wb') as f:
                print(self.name, '正在下载')
                f.write(response.content)
# Crawler thread: fetches the JSON result pages whose URLs sit in
# first_queue and pushes each raw response body onto the module-level
# second_queue for the parsing stage.
class crawl(threading.Thread):
    def __init__(self, name, first_queue):
        """name: thread name; first_queue: queue.Queue of API URLs to fetch.

        Bug fix: the original dropped this argument and silently relied on
        a module-level global of the same name.
        """
        super().__init__(name=name)
        self.first_queue = first_queue

    def run(self):
        print(f"{self.name}正在运行")
        # Invariant headers hoisted out of the fetch loop.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
        # get(block=False) + queue.Empty closes the empty()/get() race
        # between competing crawler threads.
        while True:
            try:
                t = self.first_queue.get(block=False)
            except queue.Empty:
                break
            print(f"{self.name}正在运行")
            response = requests.get(url=t, headers=headers)
            # queue.Queue is already thread-safe, so the Lock the original
            # created here was unnecessary — and ineffective anyway, since
            # each thread instantiated its own private lock.
            second_queue.put(response.text)
            print(f"{self.name}运行结束")
def parse_response(t):
    """Parse one JSON API response *t* (a str) and enqueue every thumbnail
    URL found at $..data..thumbURL onto the module-level urlpage queue."""
    print('解析线程启动')
    data = json.loads(t)
    x = jsonpath(data, '$..data..thumbURL')
    # The classic jsonpath module returns False — not an empty list — when
    # nothing matches; guard so a result-less page does not raise TypeError.
    if x:
        for i in x:
            urlpage.put(i)

if __name__ == '__main__':
    first_queue = queue.Queue()   # API page URLs waiting to be crawled
    second_queue = queue.Queue()  # raw JSON responses waiting to be parsed
    urlpage = queue.Queue()       # individual image URLs waiting to be downloaded
    # The page URLs come from Baidu Image's ajax endpoint (found by
    # inspecting the site's XHR traffic); pn= pages through results 30 at a time.
    for i in range(0, 480, 30):
        first_queue.put(f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn={i}&rn=30&gsm=5a&1594005271382=")
    # Start the crawler pool and wait for ALL of its threads — the original
    # joined only the last one, so responses could still be arriving while
    # the parsing stage below was already draining second_queue.
    crawl_threads = [crawl(f'爬取{c}', first_queue) for c in range(0, 3)]
    for th in crawl_threads:
        th.start()
    for th in crawl_threads:
        th.join()
    # Parse every response. Bug fix: the original wrote
    # threading.Thread(target=parse_response(t)), which CALLS the function in
    # the main thread and hands its None return value to Thread as the
    # target; the spawned threads did nothing. The correct form passes the
    # callable plus its argument separately.
    parse_threads = []
    while not second_queue.empty():
        text = second_queue.get()
        pt = threading.Thread(target=parse_response, args=(text,))
        pt.start()
        parse_threads.append(pt)
    # Joining every parser replaces the original's fixed time.sleep(2) guess.
    for pt in parse_threads:
        pt.join()
    # Start the downloader pool and, again, wait for ALL of its threads.
    download_threads = [downlodethread(f'下载{e}', urlpage) for e in range(0, 3)]
    for th in download_threads:
        th.start()
    for th in download_threads:
        th.join()

这是从百度图片中抓取下的狗妈表情包,因为百度图片用的是ajax请求,所以需要分析网页,并解析json数据。

本文地址:https://blog.csdn.net/snake_boy_/article/details/107666775