
Python Multithreading, Multiprocessing, and Coroutines, plus a Redis-Based Distributed Crawler: A Speed Comparison


Before learning the Scrapy framework, it helps to understand how these different ways of speeding up code relate to each other.

Each of the three approaches uses 5 workers (threads, processes, or coroutines) to request 50 web pages, and the speeds are compared.

Multithreading first:

The approach is a queue plus multiple threads, which is also a common foundation of distributed crawlers.

This example uses the threading module; there are other options such as _thread (a ThreadPoolExecutor variant is sketched after the code below).

import time
import requests
import threading
import queue as Qe

threads = []
link_list = []
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']

# Each line of alexa.txt is tab-separated; the second field is the URL.
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

class MyThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print('Starting ' + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Qe.Empty:
                # The queue stayed empty for 2 seconds: no work left, exit.
                break
        print('Exiting ' + self.name)

def crawler(threadName, q):
    url = q.get(timeout=2)
    try:
        r = requests.get(url, timeout=2)
        print(q.qsize(), threadName, r.status_code, url)
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error:', e)

workQueue = Qe.Queue(50)

# Create the worker threads
for tName in threadList:
    thread = MyThread(tName, workQueue)
    thread.start()
    threads.append(thread)

# Fill the queue with the URLs
for url in link_list:
    workQueue.put(url)

# Wait for all threads to finish
for t in threads:
    t.join()

end = time.time()
print('Total time:', end-start)
print('Exiting Main Thread')
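
For comparison, the same five-worker test can also be written with the standard-library concurrent.futures module (one of the alternatives to threading mentioned above). This is only a minimal sketch, assuming link_list has already been loaded from alexa.txt as in the code above:

import time
import requests
from concurrent.futures import ThreadPoolExecutor

def fetch(url):
    # Fetch one URL and report its status code (or the error).
    try:
        r = requests.get(url, timeout=2)
        return url, r.status_code
    except Exception as e:
        return url, repr(e)

start = time.time()
# 5 worker threads share the 50 URLs; map yields results in input order.
with ThreadPoolExecutor(max_workers=5) as executor:
    for url, status in executor.map(fetch, link_list):
        print(status, url)
print('Total time:', time.time() - start)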


Multiprocessing:

This version uses non-blocking calls with the Pool class; there are other options, such as Process, that you can explore on your own (a Process-based variant is sketched after the code below).

import time
import requests
from multiprocessing import Pool, Manager

link_list = []

with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(q, index):
    Process_id = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, q.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, q.qsize(), url, 'Error:', e)


if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)

    # Fill the queue with the URLs
    for url in link_list:
        workQueue.put(url)

    po = Pool(processes=5)  # a pool of 5 worker processes
    for i in range(5):
        po.apply_async(crawler, args=(workQueue, i))

    print('Started processes')
    po.close()
    po.join()

    end = time.time()
    print('Total time:', end-start)
    print('Main process Ended')
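
The Process class mentioned above can also be used directly instead of Pool. A minimal sketch with five explicit worker processes sharing the same managed queue, assuming crawler and link_list are defined exactly as above:

from multiprocessing import Process, Manager

if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)
    for url in link_list:
        workQueue.put(url)

    # Start 5 worker processes and wait for all of them to drain the queue.
    processes = []
    for i in range(5):
        p = Process(target=crawler, args=(workQueue, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()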


Coroutines:

This example uses the widely used gevent library.

import time
# Apply gevent's monkey patching before importing requests, so that the
# blocking I/O it performs (sockets) is turned into cooperative, asynchronous calls.
from gevent import monkey
monkey.patch_all()

import gevent
import requests
from gevent.queue import Queue, Empty

jobs = []
link_list = []

with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(index):
    Process_id = 'Process-' + str(index)
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, workQueue.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, workQueue.qsize(), url, 'Error:', e)

def boss():
    for url in link_list:
        workQueue.put_nowait(url)


if __name__ == '__main__':
    workQueue = Queue(50)

    gevent.spawn(boss).join()
    for i in range(5):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)

    end = time.time()
    print('Total time:', end-start)
    print('Main Ended')


Run times measured for the code above:

Multithreading: 11.943 s

Multiprocessing: 9.652 s

Coroutines: 5.673 s

Coroutines clearly come out on top. Better still, the three techniques can be combined with one another to speed a crawler up even further (these figures only reflect this one test by the author and shouldn't be taken as a general conclusion); one possible combination is sketched below.
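
As an illustration of combining approaches, here is a minimal sketch (not from the original article) that splits the URLs across 5 processes, each of which runs its own small thread pool for the I/O-bound requests; it assumes link_list is loaded from alexa.txt as in the examples above:

import requests
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool

def fetch(url):
    try:
        return url, requests.get(url, timeout=2).status_code
    except Exception as e:
        return url, repr(e)

def worker(chunk):
    # Each process handles its own chunk of URLs with 5 threads.
    with ThreadPoolExecutor(max_workers=5) as ex:
        return list(ex.map(fetch, chunk))

if __name__ == '__main__':
    chunks = [link_list[i::5] for i in range(5)]  # spread the URLs over 5 processes
    with Pool(processes=5) as pool:
        for result in pool.map(worker, chunks):
            for url, status in result:
                print(status, url)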

Next is the code for a distributed crawler, based on the Redis in-memory database, that downloads the images from the 50 pages.

1. master (central coordinator)

import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

def push_redis_list():
    r = Redis(host='127.0.0.1', port=6379)

    link_list = []
    # Each line of alexa.txt is tab-separated; the second field is the URL.
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)

    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img.get('src', '')  # skip <img> tags without a src attribute
                if img_url != '':
                    print('Image URL pushed to queue:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
    print('Number of image URLs now in the queue:', r.llen('img_url'))

def get_img():
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        url = r.lpop('img_url')
        if url is None:
            # The list is empty: wait a while, then stop this worker.
            time.sleep(10)
            break
        url = url.decode('utf-8')
        if url[:2] == '//':
            # Protocol-relative URL: prepend a scheme.
            url = 'http:' + url
        try:
            response = requests.get(url, headers=headers, timeout=2)
            name = int(time.time())
            with open('E:\\截图库\\' + str(name) + url[-4:], 'wb') as f:
                f.write(response.content)
            print('Downloaded image', url)
        except Exception as e:
            print('Error while downloading image:', e)
        time.sleep(3)

if __name__ == '__main__':
    this_machine = 'master'  # the master only pushes image URLs into Redis
    print('Starting distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()

2. slave (crawler worker)

The slave script is the same file as the master above; the only difference is the __main__ block, where this_machine is set to 'slave' so that the machine pops URLs from Redis and downloads the images:

if __name__ == '__main__':
    this_machine = 'slave'  # this machine only downloads images
    print('Starting distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()

There is only one master, but there can be many slaves; they run independently without interfering with one another, which greatly speeds up the crawler.
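
Since each slave just keeps popping URLs from the img_url list, a blocking pop is a natural refinement of the sleep-and-retry loop above. A minimal sketch of such a slave loop using redis-py's blpop (same Redis instance and list name as above; the save path here is only an example):

import time
import requests
from redis import Redis

r = Redis(host='127.0.0.1', port=6379)

while True:
    # blpop blocks for up to 10 seconds waiting for a new URL instead of polling with lpop.
    item = r.blpop('img_url', timeout=10)
    if item is None:
        print('Queue empty, slave exiting')
        break
    url = item[1].decode('utf-8')
    if url.startswith('//'):
        url = 'http:' + url
    try:
        response = requests.get(url, timeout=2)
        name = int(time.time())
        with open(str(name) + url[-4:], 'wb') as f:  # saves into the current directory
            f.write(response.content)
        print('Saved image', url)
    except Exception as e:
        print('Error while downloading image', url, e)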