Python multithreading, multiprocessing and coroutine code, plus a Redis-based distributed crawler: a learning speed test
Before learning the Scrapy framework, it helps to understand how these different ways of speeding up code relate to one another.
Each of the three approaches uses 5 workers (threads, processes, or coroutines) to request 50 web pages, so that their speeds can be compared.
First, multithreading:
It uses a queue plus multiple threads, the pattern that also commonly underlies distributed crawlers.
The threading module is used here; there are other options such as the lower-level _thread module.
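All three versions read their 50 target URLs from a local file named alexa.txt, taking the second tab-separated field of every line. The exact file is not shown in the article; a file roughly in this shape is assumed (a tab separates rank and URL, one pair per line, 50 lines in total):

1	http://www.baidu.com
2	http://www.qq.com
3	http://www.taobao.com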
import time
import requests
import threading
import queue as Qe

threads = []
link_list = []
threadList = ['Thread-1', 'Thread-2', 'Thread-3', 'Thread-4', 'Thread-5']

# Read the target URLs: each line of alexa.txt holds a rank, a tab, then the URL
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

class MyThread(threading.Thread):
    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name
        self.q = q

    def run(self):
        print('Starting ' + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            except Qe.Empty:
                # q.get() timed out: the queue is empty, so this worker exits
                break
        print('Exiting ' + self.name)

def crawler(threadName, q):
    url = q.get(timeout=2)
    try:
        r = requests.get(url, timeout=2)
        print(q.qsize(), threadName, r.status_code, url)
    except Exception as e:
        print(q.qsize(), threadName, url, 'Error:', e)

workQueue = Qe.Queue(50)
# Create the worker threads
for tName in threadList:
    thread = MyThread(tName, workQueue)
    thread.start()
    threads.append(thread)
# Fill the queue
for url in link_list:
    workQueue.put(url)
# Wait for all threads to finish
for t in threads:
    t.join()

end = time.time()
print('Total time:', end - start)
print('Exiting Main Thread')
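As a point of comparison (not part of the original test), the same 5-worker pattern can be written with the standard library's concurrent.futures module. This is only a rough sketch and assumes the link_list built from alexa.txt above:

from concurrent.futures import ThreadPoolExecutor
import requests

def fetch(url):
    # Request a single URL and report either the status code or the error
    try:
        r = requests.get(url, timeout=2)
        return url, r.status_code
    except Exception as e:
        return url, 'Error: %s' % e

with ThreadPoolExecutor(max_workers=5) as executor:   # 5 worker threads
    for url, result in executor.map(fetch, link_list):
        print(url, result)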
Multiprocessing:
This version uses non-blocking calls via a Pool; multiprocessing also provides the Process class, which you can study on your own (a minimal Process sketch follows after the Pool example).
import time
import requests
from multiprocessing import Pool, Manager

link_list = []
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(q, index):
    Process_id = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, q.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, q.qsize(), url, 'Error:', e)

if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)
    # Fill the queue
    for url in link_list:
        workQueue.put(url)
    po = Pool(processes=5)  # a pool of 5 worker processes
    for i in range(5):
        po.apply_async(crawler, args=(workQueue, i))
    print('Started processes')
    po.close()
    po.join()
    end = time.time()
    print('Total time:', end - start)
    print('Main process Ended')
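As mentioned above, multiprocessing also offers the lower-level Process class. A minimal sketch of the same 5-worker idea with explicit Process objects, assuming the crawler function and link_list from the Pool example:

from multiprocessing import Process, Manager

if __name__ == '__main__':
    manager = Manager()
    workQueue = manager.Queue(50)
    for url in link_list:
        workQueue.put(url)
    workers = []
    for i in range(5):
        p = Process(target=crawler, args=(workQueue, i))  # same crawler() as above
        p.start()
        workers.append(p)
    for p in workers:
        p.join()   # wait until every worker has drained the queue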
Coroutines:
The coroutine version uses the widely used gevent library.
import time
import gevent
from gevent import monkey

# Patch the standard library so blocking I/O (sockets, etc.) becomes cooperative;
# this is best done before the I/O libraries below are imported
monkey.patch_all()

import requests
from gevent.queue import Queue, Empty

jobs = []
link_list = []
with open('alexa.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = eachone.split('\t')[1].replace('\n', '')
        link_list.append(link)

start = time.time()

def crawler(index):
    Process_id = 'Process-' + str(index)
    while not workQueue.empty():
        url = workQueue.get(timeout=2)
        try:
            r = requests.get(url, timeout=2)
            print(Process_id, workQueue.qsize(), r.status_code, url)
        except Exception as e:
            print(Process_id, workQueue.qsize(), url, 'Error:', e)

def boss():
    for url in link_list:
        workQueue.put_nowait(url)

if __name__ == '__main__':
    workQueue = Queue(50)
    gevent.spawn(boss).join()   # fill the queue before the workers start
    for i in range(5):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)
    end = time.time()
    print('Total time:', end - start)
    print('Main Ended')
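gevent also provides gevent.pool.Pool, which caps the number of concurrent greenlets without hand-managing a queue. A minimal sketch (not the code timed above), again assuming link_list has been loaded:

from gevent import monkey
monkey.patch_all()          # patch blocking I/O before importing requests
import requests
from gevent.pool import Pool

def fetch(url):
    try:
        r = requests.get(url, timeout=2)
        print(r.status_code, url)
    except Exception as e:
        print(url, 'Error:', e)

pool = Pool(5)              # at most 5 greenlets run at the same time
pool.map(fetch, link_list)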
Running the three versions above and comparing the times gives:
Multithreading: 11.943 s
Multiprocessing: 9.652 s
Coroutines: 5.673 s
Coroutines come out on top here. Even better, the three techniques can be combined and used together to make a crawler faster still (these figures only reflect the author's single test run, not a general conclusion).
Next, the code for a distributed crawler that uses the Redis in-memory database to download the images from the 50 pages.
1. master (central coordinator)
import re
import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

def push_redis_list():
    r = Redis(host='127.0.0.1', port=6379)
    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img.get('src', '')   # skip <img> tags without a src
                if img_url != '':
                    print('Image URL added:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
    print('Current number of image links:', r.llen('img_url'))

def get_img():
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')       # returns None once the list is empty
            url = url.decode('ascii')
            if url[:2] == '//':           # complete protocol-relative links
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                # assumes a 4-character extension such as .jpg or .png
                f = open('E:\\截图库\\' + str(name) + url[-4:], 'wb')
                f.write(response.content)
                f.close()
                print('Image downloaded:', url)
            except Exception as e:
                print('Problem while downloading the image:', e)
                time.sleep(3)
        except Exception as e:
            # lpop returned None (queue empty) or decoding failed: wait, then stop
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'master'
    print('Starting the distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()
2. slave (crawler worker)
import re
import time
import requests
from redis import Redis
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/52.0.2743.116 Safari/537.36'}

def push_redis_list():
    r = Redis(host='127.0.0.1', port=6379)
    link_list = []
    with open('alexa.txt', 'r') as file:
        file_list = file.readlines()
        for eachone in file_list:
            link = eachone.split('\t')[1].replace('\n', '')
            link_list.append(link)
    for url in link_list:
        try:
            response = requests.get(url, headers=headers, timeout=2)
            soup = BeautifulSoup(response.text, 'html.parser')
            img_list = soup.find_all('img')
            for img in img_list:
                img_url = img.get('src', '')   # skip <img> tags without a src
                if img_url != '':
                    print('Image URL added:', img_url)
                    r.lpush('img_url', img_url)
        except Exception as e:
            print(url, 'Error:', e)
    print('Current number of image links:', r.llen('img_url'))

def get_img():
    r = Redis(host='127.0.0.1', port=6379)
    while True:
        try:
            url = r.lpop('img_url')       # returns None once the list is empty
            url = url.decode('ascii')
            if url[:2] == '//':           # complete protocol-relative links
                url = 'http:' + url
            try:
                response = requests.get(url, headers=headers, timeout=2)
                name = int(time.time())
                # assumes a 4-character extension such as .jpg or .png
                f = open('E:\\截图库\\' + str(name) + url[-4:], 'wb')
                f.write(response.content)
                f.close()
                print('Image downloaded:', url)
            except Exception as e:
                print('Problem while downloading the image:', e)
                time.sleep(3)
        except Exception as e:
            # lpop returned None (queue empty) or decoding failed: wait, then stop
            print('Error:', e)
            time.sleep(10)
            break

if __name__ == '__main__':
    this_machine = 'slave'
    print('Starting the distributed crawler')
    if this_machine == 'master':
        push_redis_list()
    else:
        get_img()
There is only one master, while there can be many slaves; they run independently without interfering with one another, which greatly speeds up the crawler.
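Note that both scripts as listed connect to Redis at 127.0.0.1, which only works while master and slave share one machine; in a real deployment each slave would point at the Redis instance on the master's address. The shared 'img_url' list is the whole coordination mechanism, and it can be inspected from any machine with a few redis-py calls. A small sketch (the host is a placeholder):

from redis import Redis

# Placeholder host: use the master machine's real IP when the slaves run elsewhere
r = Redis(host='127.0.0.1', port=6379)

print('queued image links:', r.llen('img_url'))
print('non-blocking pop:', r.lpop('img_url'))          # returns None when empty
# A blocking pop is an alternative to the sleep/poll loop in get_img():
print('blocking pop:', r.brpop('img_url', timeout=5))  # (key, value) or None after 5 s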