多线程爬取狗妈表情包

程序员文章站 2022-06-15 10:09:57

通过多线程爬取狗妈表情包import requestsimport jsonfrom jsonpath import jsonpathimport threadingimport queueimport time#创建下载线程class downlodethread(threading.Thread): def __init__(self,name,urlpage): super().__init__(name=name) self.url....

通过多线程爬取狗妈表情包

import requests
import json
from jsonpath import jsonpath
import threading
import queue
import time
#创建下载线程
class downlodethread(threading.Thread):
    """Worker thread that downloads every image URL taken from a shared queue.

    Args:
        name: Thread name; used as the prefix of the progress messages.
        urlpage: ``queue.Queue`` holding the image URLs to download. Several
            workers may share the same queue.
    """

    # Seconds to wait for the server before giving up on one image.
    REQUEST_TIMEOUT = 10

    def __init__(self, name, urlpage):
        super().__init__(name=name)
        self.urlpage = urlpage

    def run(self):
        """Drain ``self.urlpage`` and write each image to disk as a ``.jpg``."""
        print(f"{self.name}正在运行")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        while True:
            # Take an item and treat queue.Empty as "done". Checking empty()
            # first is racy: another worker can take the last URL between
            # the check and the get().
            try:
                url = self.urlpage.get(block=False)
            except queue.Empty:
                break
            # Characters 58..77 of the URL become the file name.
            # NOTE(review): presumably the image's serial number in Baidu
            # thumbnail URLs - confirm the offsets still match.
            d = url[58:78]
            try:
                reponse = requests.get(url=url, headers=headers,
                                       timeout=self.REQUEST_TIMEOUT)
                # Do not save an HTTP error page as if it were an image.
                reponse.raise_for_status()
            except requests.RequestException as exc:
                # One failed image must not stop the rest of the queue.
                print(self.name, '下载失败', url, exc)
                continue
            # Images are saved into a folder on drive E:; change this path
            # to your own download directory before running.
            with open(r'E:\\狗妈表情包\\' + str(d) + '.jpg', 'wb') as f:
                print(self.name, '正在下载')
                f.write(reponse.content)
#分析请求爬取数据
class crawl(threading.Thread):
    """Worker thread that requests search-result pages and queues their text.

    Args:
        name: Thread name; used as the prefix of the progress messages.
        first_queue: ``queue.Queue`` of page URLs to request.
        second_queue: Optional ``queue.Queue`` that receives each response
            body. When omitted, the module-level ``second_queue`` is used,
            which is what the two-argument call has always done.
    """

    # Seconds to wait for the server before giving up on one page.
    REQUEST_TIMEOUT = 10

    def __init__(self, name, first_queue, second_queue=None):
        super().__init__(name=name)
        self.first_queue = first_queue
        self.second_queue = second_queue

    def run(self):
        """Fetch every URL in ``self.first_queue`` and enqueue the response text."""
        print(f"{self.name}正在运行")
        if self.second_queue is None:
            # Fall back to the module-level queue for two-argument callers.
            out_queue = second_queue
        else:
            out_queue = self.second_queue
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        while True:
            # Treat queue.Empty as "done"; empty()-then-get() is racy when
            # several workers share the queue.
            try:
                t = self.first_queue.get(block=False)
            except queue.Empty:
                break
            print(f"{self.name}正在运行")
            try:
                response = requests.get(url=t, headers=headers,
                                        timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.RequestException as exc:
                # Skip this page but keep working through the queue.
                print(f"{self.name}请求失败: {exc}")
                continue
            # Hand the page text to the parser stage. Queue.put() is
            # thread-safe on its own, so no extra lock is needed.
            out_queue.put(response.text)
            print(f"{self.name}运行结束")
def parse_response(t):
    """Extract the thumbnail URLs from one response and queue them for download.

    Args:
        t: Raw JSON text of one search-result page.

    Every ``thumbURL`` value found under a ``data`` key is put on the
    module-level ``urlpage`` queue. A page that is not valid JSON, or that
    contains no thumbnail URL, is skipped.
    """
    try:
        data = json.loads(t)
    except json.JSONDecodeError as exc:
        # One malformed page should not abort the whole crawl.
        print('解析失败', exc)
        return
    x = jsonpath(data, '$..data..thumbURL')
    print('解析线程启动')
    # jsonpath() returns False, not an empty list, when nothing matches.
    if not x:
        return
    for i in x:
        urlpage.put(i)

if __name__ == '__main__':
    # Pipeline: first_queue (page URLs) -> crawl workers -> second_queue
    # (raw JSON text) -> parser threads -> urlpage (image URLs) -> download
    # workers.
    first_queue = queue.Queue()
    second_queue = queue.Queue()
    urlpage = queue.Queue()
    # The request URL was found by inspecting the page's AJAX traffic.
    # pn advances in steps of 30 to match rn=30 in the query string.
    for i in range(0, 480, 30):
        first_queue.put(f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&word=%E7%A5%9E%E4%B9%90%E4%B8%83%E5%A5%88%E8%A1%A8%E6%83%85%E5%8C%85&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn={i}&rn=30&gsm=5a&1594005271382=")

    # Start the crawl workers and wait for ALL of them; joining only the
    # last one could leave second_queue incomplete when parsing begins.
    crawl_threads = [crawl(f'爬取{c}', first_queue) for c in range(0, 3)]
    for worker in crawl_threads:
        worker.start()
    for worker in crawl_threads:
        worker.join()

    # Parse every fetched page. The callable and its argument are passed
    # separately so the parsing runs inside the new thread.
    parse_threads = []
    while not second_queue.empty():
        page_text = second_queue.get()
        parser = threading.Thread(target=parse_response, args=(page_text,))
        parser.start()
        parse_threads.append(parser)
    # Wait for the parsers so urlpage is fully populated before downloading.
    for parser in parse_threads:
        parser.join()

    # Start the download workers and wait for all of them to finish.
    download_threads = [downlodethread(f'下载{e}', urlpage) for e in range(0, 3)]
    for worker in download_threads:
        worker.start()
    for worker in download_threads:
        worker.join()

这些狗妈表情包是从百度图片中抓取的。由于百度图片通过 Ajax 请求动态加载数据，因此需要先分析网页的请求，再解析返回的 JSON 数据。

本文地址：https://blog.csdn.net/snake_boy_/article/details/107666775

相关标签：爬虫 python 多线程队列

上一篇： Python发送form-data请求及拼接form-data内容的方法

下一篇：使用Python保存网页上的图片或者保存页面为截图

多线程爬取狗妈表情包

Python爬虫入门教程 13-100 斗图啦表情包多线程爬取

Python爬虫使用requests库爬取表情包

简单 20 行 python 代码实现爬取表情包，麻麻再也不用担心我聊天图慌了！！

python之scrapy实战爬取表情包

用Python爬取百度表情包

爬取海量表情包，让你表情包仓库持续更新，成为群里的图王（附代码）

多线程爬取狗妈表情包

Python爬虫使用requests库爬取表情包

python之scrapy实战爬取表情包

简单 20 行 python 代码实现爬取表情包，麻麻再也不用担心我聊天图慌了！！