Demystifying Multithreading in Python Web Scraping
Multithreading in Python is often dismissed as a half-wasted feature, because the GIL keeps threads from running CPU-bound code in parallel. We still flock to it anyway, because crawlers spend most of their time waiting on the network, and there multithreading really can boost the efficiency of our code enormously. So, as a learner, I approached multithreading with an open mind. The road was a hard one and I hit plenty of problems along the way, but those problems also helped me grow a great deal as a crawler writer.
The multithreading most people learn is the classic pattern of subclassing a thread class and overriding its run method, so I won't rehash that here. But in an online course I came across a different way of writing it. This queue-based construction maps much more naturally onto the logic of a crawler and is simpler, although it also hides quite a few pitfalls.
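For reference, the conventional subclass style usually looks roughly like the sketch below. This is a minimal illustration of my own, not code from the course; the class name and URLs are made up.
import threading

class DownloadThread(threading.Thread):
    """Conventional style: subclass Thread and override run()."""
    def __init__(self, url):
        super().__init__()
        self.url = url

    def run(self):
        # whatever work this thread should do goes here
        print("downloading", self.url)

threads = [DownloadThread(u) for u in ["url-1", "url-2"]]
for t in threads:
    t.start()
for t in threads:
    t.join()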
The new style is built around putting values into and taking values out of queues: producer threads keep the queues topped up with fresh data while multiple consumer threads process it. Before learning this approach you need a solid grasp of the Queue methods, and you also need to understand how the threads relate to one another logically and where they block.
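To make the blocking behaviour concrete, here is a tiny standalone sketch (my own, not from the course) of the Queue calls this pattern relies on: put, get, task_done and join.
from queue import Queue
import threading

q = Queue()

def worker():
    while True:
        item = q.get()       # blocks until something is available
        print("processing", item)
        q.task_done()        # mark this item as fully handled

threading.Thread(target=worker, daemon=True).start()

for i in range(5):
    q.put(i)                 # producer side: hand work to the consumer

q.join()                     # blocks until every put() has a matching task_done()
print("all queued work finished")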
Next, here is my code, together with the pitfalls I ran into.
# -*- coding: utf-8 -*-
# Created : 2019/4/26 19:17
# author :GL
import requests
import os
from lxml import etree
from queue import Queue
import threading
import time

# The cookie string is not shown in this post; paste your own browser cookie here.
cookie = ''


class ShoeSpider:
    def __init__(self, name):
        self.name = str(name)
        self.num = 1
        self.base_json_url = 'https://www.shopstyle.com/api/v2/products?fts=' + self.name + '&includeLooks=true&includeProducts=true&includeSavedQueryId=true&limit=40&locales=all&maxNumFilters=1000&numLooks=20&offset={}&pid=shopstyle&prevCat=women&productScore=LessPopularityEPC&url=%2Fbrowse&useElasticsearch=true&view=angular2'
        self.base_url = 'https://www.shopstyle.com/browse/flats?fts=Slingback+Flats&r=therealreal-us'
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "no-cache",
            "cookie": cookie,
            "pragma": "no-cache",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36",
        }
        self.info_queue = Queue()   # raw JSON responses waiting to be parsed
        self.url_queue = Queue()    # lists of image URLs waiting to be downloaded

    def parse_json(self):
        # Producer: request every result page and put the JSON into info_queue.
        response = requests.get(self.base_json_url.format(0), headers=self.headers).json()
        total_page = response["metadata"]["total"]
        for now_page in range(0, int(total_page), 80):
            response = requests.get(self.base_json_url.format(now_page), headers=self.headers).json()
            self.info_queue.put(response)

    def parse_detail_json(self):
        # Consumer of info_queue, producer of url_queue.
        while True:
            response = self.info_queue.get()
            results = response["products"]
            result_list = []
            for result in results:
                goods_name = result['name']
                img_src = result['image']['sizes']['Best']["url"]
                name = self.name.split(" ")
                for i in name:
                    if i in goods_name.title():
                        print(img_src)
                        result_list.append(img_src)
                        break
            self.url_queue.put(result_list)
            self.info_queue.task_done()

    def write_to_file(self):
        # Consumer of url_queue: download each image and write it to disk.
        while True:
            url_list = self.url_queue.get()
            for url in url_list:
                content = requests.get(url).content
                path = './images/women/shoes/Flat/Ballet/裹踝平底鞋/' + self.name + '/'
                if not os.path.exists(path):
                    os.makedirs(path)
                filename = url.split("/")[-1]
                filename = path + filename + ".jpg"
                with open(filename, "wb") as f:
                    f.write(content)
                    print("Image {} written successfully!".format(self.num))
                    self.num += 1
            self.url_queue.task_done()

    def run(self):
        thread_list = []
        t_info = threading.Thread(target=self.parse_json)
        thread_list.append(t_info)
        for i in range(10):
            t_url = threading.Thread(target=self.parse_detail_json)
            thread_list.append(t_url)
        for i in range(10):
            t_save = threading.Thread(target=self.write_to_file)
            thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True)   # daemon threads die together with the main thread
            t.start()
        time.sleep(3)           # give parse_json time to put the first response into info_queue
        for q in [self.info_queue, self.url_queue]:
            q.join()            # block until every task in the queue has been marked done
        print("Done!")


if __name__ == '__main__':
    # name = input("Enter the product name to search for: ")
    name = "Suede Loafers"
    name = name.title()
    print("Your search keyword is:", name)
    shoe = ShoeSpider(name)
    shoe.run()
When I first learned this multithreading approach from the course video, the teacher's first queue was filled with values that were constructed locally, so his code worked as soon as I copied it over. But when I folded the same pattern into my own spider, my program would finish immediately. For a long time I blamed setDaemon, because if I commented that call out the program would run, only it could never stop at the end. I puzzled over it for ages without working out why. Then, while I was debugging another program, a request timed out, and it suddenly clicked: the first values in my queue come from a network request. It may only take a few seconds, but we cannot judge a computer by our own sense of time; for the program that wait can be an eternity. By the time the main thread reached queue.join(), the queues might not have received a single value yet, so join() returned at once on the empty queues and the whole program exited before any real work had been done.
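The time.sleep(3) in my run() papers over this race. A sturdier fix is to join the producer thread before joining the queues, so queue.join() can never be called while the queue is still empty. Here is a standalone sketch of that idea (the names are illustrative, not taken from my spider); applied to ShoeSpider it would mean calling t_info.join() before the q.join() loop instead of sleeping a fixed three seconds.
import threading
import time
from queue import Queue

work_queue = Queue()

def slow_producer():
    time.sleep(2)                # pretend this is a slow network request
    for i in range(5):
        work_queue.put(i)

def consumer():
    while True:
        item = work_queue.get()
        print("handled", item)
        work_queue.task_done()

producer = threading.Thread(target=slow_producer)
producer.start()

for _ in range(3):
    threading.Thread(target=consumer, daemon=True).start()

producer.join()                  # wait until everything has really been put into the queue
work_queue.join()                # now join() cannot run against a still-empty queue
print("done")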
Below is the code I originally learned from, posted for comparison, so that everyone can take the lesson on board and not fall into the same pit.
# -*- coding: utf-8 -*-
# Created : 2018/8/26 18:33
# author :GL
from queue import Queue
import requests
from lxml import etree
import json
import threading


class QiushiSpider:
    def __init__(self):
        self.start_url = 'https://www.qiushibaike.com/hot/page/{}/'
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "www.qiushibaike.com",
            "Referer": "https://www.qiushibaike.com/hot/",
        }
        self.url_queue = Queue()      # page URLs to be requested
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # parsed items waiting to be saved

    def get_url_list(self):
        # return [self.start_url.format(i) for i in range(1, 14)]
        for i in range(1, 14):
            self.url_queue.put(self.start_url.format(i))

    def parse_url(self):
        while True:
            url = self.url_queue.get()
            print(url)
            response = requests.get(url, headers=self.headers)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()

    def get_content_list(self):
        while True:
            html_info = self.html_queue.get()
            html = etree.HTML(html_info)
            content_list = []
            div_list = html.xpath("//div[@id='content-left']/div")
            for div in div_list:
                item = {}
                item['name'] = div.xpath(".//div[@class='author clearfix'][1]/a[2]/h2/text()")
                item['name'] = [item['name'][i].replace("\n", "") for i in range(len(item['name']))]
                item['info'] = div.xpath(".//div[@class='content']/span/text()")
                item['info'] = [item['info'][i].replace("\n", "") for i in range(len(item['info']))]
                item['reader'] = div.xpath(".//div[@class='stats']/span[1]/i/text()")
                item['img'] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item['img'] = "https:" + item['img'][0] if len(item['img']) > 0 else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()

    def save_content_list(self):
        while True:
            content_list = self.content_queue.get()
            for i in content_list:
                with open('qiushi_1.json', 'a', encoding="utf-8") as f:
                    f.write(json.dumps(i, ensure_ascii=False, indent=4))
                    f.write("\n")
            self.content_queue.task_done()

    def run(self):
        # url_list = self.get_url_list()
        thread_list = []
        t_url = threading.Thread(target=self.get_url_list)
        thread_list.append(t_url)
        for i in range(3):
            t_parse = threading.Thread(target=self.parse_url)
            thread_list.append(t_parse)
        for i in range(2):
            t_html = threading.Thread(target=self.get_content_list)
            thread_list.append(t_html)
        t_save = threading.Thread(target=self.save_content_list)
        thread_list.append(t_save)
        for t in thread_list:
            t.setDaemon(True)  # daemon threads: they are killed as soon as the main thread exits
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()           # block the main thread until every task in this queue is marked done
        print("Done!")
        # for url in url_list:
        #     html_info = self.parse_url(url)
        #     content_list = self.get_content_list(html_info)
        #     self.save_content_list(content_list)


if __name__ == '__main__':
    qiushi = QiushiSpider()
    qiushi.run()
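Both spiders above rely on daemon threads plus queue.join() to shut down. Another common pattern, shown here only as a sketch of my own (not something from the course), is the "poison pill": push one sentinel value per worker so each while True loop can break out on its own and no daemon threads are needed.
import threading
from queue import Queue

SENTINEL = object()
q = Queue()

def worker():
    while True:
        item = q.get()
        if item is SENTINEL:     # poison pill: this worker should stop
            q.task_done()
            break
        print("working on", item)
        q.task_done()

workers = [threading.Thread(target=worker) for _ in range(3)]
for w in workers:
    w.start()

for i in range(10):
    q.put(i)
for _ in workers:
    q.put(SENTINEL)              # one sentinel per worker thread

q.join()
for w in workers:
    w.join()
print("all workers exited cleanly")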
If you have any questions, I'm happy to discuss!
QQ: 986361369