Python - Distributed Crawling of Baidu Tieba
Environment Configuration:
Scrapy
- settings.py
- middlewares.py
- tieba.py
Selenium
Redis
MongoDB
Linux
step 1: scrapy startproject name
- If a spider project written on Windows is copied wholesale into Linux, Scrapy cannot tell which project the settings.py belongs to.
- So create the Scrapy project on Linux first.
- Then overwrite the corresponding files of the Linux project with the spider files written on Windows.
step 2: settings.py
# Use the scrapy_redis duplicate filter for URL deduplication
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the request queue in Redis so the crawl can resume after a pause
SCHEDULER_PERSIST = True
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
# Enable the Redis item pipeline
ITEM_PIPELINES = {'scrapy_redis.pipelines.RedisPipeline': 400}
# Log level
LOG_LEVEL = 'DEBUG'
# Redis server IP
REDIS_HOST = "192.168.2.208"
# Redis server port
REDIS_PORT = 6379
# Redis database number
REDIS_DB = 5
# Redis connection parameters
REDIS_PARAMS = {
    'socket_timeout': 30,
    'socket_connect_timeout': 30,
    'retry_on_timeout': True,
    'encoding': 'utf-8',
    'db': REDIS_DB,
}
# Enable the downloader middleware with a fairly high priority
DOWNLOADER_MIDDLEWARES = {
    # Note: replace "baidutieba" with your own project name
    'baidutieba.middlewares.SeleniumMiddleware': 200,
}
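Before going further, it is worth confirming that every node can actually reach this Redis instance. A minimal check, assuming the host/port/db values above and that the redis-py package is installed:
import redis
# Same values as REDIS_HOST / REDIS_PORT / REDIS_DB above
client = redis.Redis(host="192.168.2.208", port=6379, db=5, socket_connect_timeout=30)
print(client.ping())  # True means this node can reach the Redis server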
step 3: middlewares.py
- Intercept the native requests and fetch the pages with Selenium instead.
from scrapy.http import HtmlResponse
class SeleniumMiddleware():
    def process_request(self, request, spider):
        url = request.url
        # Load the page in the spider's Chrome instance
        spider.chrome.get(url)
        html = spider.chrome.page_source
        # Returning an HtmlResponse skips the default downloader entirely
        return HtmlResponse(url=url, request=request, body=html, encoding="utf-8")
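Tieba renders part of its content with JavaScript, so page_source can be captured before the thread list has appeared. A variant that waits for the title links (the same j_th_tit class the spider queries) is sketched below; the 10-second timeout and the middleware name are assumptions, not part of the original project:
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class SeleniumWaitMiddleware():
    def process_request(self, request, spider):
        spider.chrome.get(request.url)
        try:
            # Wait until at least one thread-title link has rendered
            WebDriverWait(spider.chrome, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "j_th_tit")))
        except TimeoutException:
            # Detail pages have no j_th_tit links; fall back to whatever rendered
            pass
        return HtmlResponse(url=request.url, request=request,
                            body=spider.chrome.page_source, encoding="utf-8")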
step 4: building the spider
scrapy genspider name domain
import scrapy
from scrapy_redis.spiders import RedisSpider
from selenium import webdriver
from scrapy import signals
import re
# Inherit from RedisSpider instead of scrapy.Spider
class TiebaSpider(RedisSpider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com']
    # Disable the native start_urls
    # start_urls = [f'http://tieba.baidu.com/f?kw={keyword}']
    # Read start URLs from this Redis key instead
    redis_key = "tieba:start_urls"
    base_url = "http://tieba.baidu.com/f?kw=%E9%A3%9E%E5%BA%A6&ie=utf-8&pn={}"
    index = 2
    def parse(self, response):
        # The thread-title link class appears with and without a trailing space
        a_class1 = response.xpath("//a[@class='j_th_tit']/@href")
        a_class2 = response.xpath("//a[@class='j_th_tit ']/@href")
        a_class = a_class1.extract() if len(a_class1) > len(a_class2) else a_class2.extract()
        urls = ["http://tieba.baidu.com" + url for url in a_class]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_info)
        # Queue the next listing page
        if self.index < 50:
            yield scrapy.Request(self.base_url.format(self.index * 50), callback=self.parse)
            self.index += 2
    def parse_info(self, response):
        title = response.xpath("//div[@class='core_title core_title_theme_bright']/h1/text()").extract()
        # The post-content class likewise occurs with and without a trailing space
        div1 = response.xpath("//div[@class='d_post_content j_d_post_content clearfix']/text()")
        div2 = response.xpath("//div[@class='d_post_content j_d_post_content clearfix ']/text()")
        reply = div1.extract() if len(div1) > len(div2) else div2.extract()
        replys = ",".join(reply)
        replys = re.sub(r"\s+", "", replys)
        yield {
            "title": title,
            "replys": replys
        }
- Intercept the spider_idle signal (called when Redis has no more URLs left to crawl)
from scrapy import exceptions
    def spider_idle(self):
        """Supplement Redis with new start URLs."""
        # Add your replenishing logic here
        self.schedule_next_requests()
        # Raise DontCloseSpider to tell Scrapy not to close the spider
        raise exceptions.DontCloseSpider
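What goes at the "Add your replenishing logic here" marker depends on the crawl. One possible sketch, assuming an idle node should simply re-seed the next listing page through the Redis connection that RedisSpider exposes as self.server:
    def spider_idle(self):
        """Supplement Redis with new start URLs."""
        # Illustrative only: push the next listing page back into the start-URL key
        if self.index < 50:
            self.server.lpush(self.redis_key, self.base_url.format(self.index * 50))
            self.index += 2
        self.schedule_next_requests()
        raise exceptions.DontCloseSpider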
- Configure Selenium
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(TiebaSpider, cls).from_crawler(crawler, *args, **kwargs)
        options = webdriver.ChromeOptions()
        # Headless mode
        options.add_argument('--headless')
        # Allow running as root (disable the sandbox)
        options.add_argument('--no-sandbox')
        # Disable GPU acceleration
        options.add_argument('--disable-gpu')
        # Use /tmp instead of /dev/shm to avoid crashes from limited shared memory
        options.add_argument('--disable-dev-shm-usage')
        # Do not load images
        options.add_argument('blink-settings=imagesEnabled=false')
        spider.chrome = webdriver.Chrome(options=options)
        # Connect the spider_closed signal so Chrome can be shut down cleanly
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider
    # Called when the spider is closed
    def spider_closed(self, spider):
        spider.chrome.quit()
step 5: replacing the files
- Create the Scrapy spider project on Linux.
- Overwrite the corresponding files of the Linux project with the spider files written on Windows.
step 6: cloning the virtual machine
- Right-click the VM > Manage > Clone
step 7: starting the distributed crawl
- scrapy runspider tieba.py (run this on every cloned node)
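The spiders will sit idle until a start URL is pushed under the redis_key. A minimal way to seed the queue from any machine that can reach the Redis server, using the host/port/db from settings.py and the spider's base_url with pn=0 filled in for the first page:
import redis
# Connect to the Redis instance configured in settings.py
client = redis.Redis(host="192.168.2.208", port=6379, db=5)
# Push the first listing page; whichever node is idle will pick it up
client.lpush("tieba:start_urls", "http://tieba.baidu.com/f?kw=%E9%A3%9E%E5%BA%A6&ie=utf-8&pn=0")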
step 8: moving the data from Redis into MongoDB
import redis
import json
import pymongo
# Connect to the Redis server (ip, port, db); the db must match REDIS_DB in settings.py
redis_client = redis.Redis("localhost", 6379, 5)
# Connect to the MongoDB server (ip, port)
mongo_client = pymongo.MongoClient("localhost", 27017)
data = mongo_client.tieba.data
while True:
    # blpop blocks until an item is available in Redis
    key, items = redis_client.blpop(["tieba:items"])
    data.insert_one(json.loads(items))
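To confirm the transfer worked, a quick check on the target collection (same connection values as above; count_documents requires pymongo 3.7+) can be run separately:
import pymongo
# Count the documents written by the loop above
mongo_client = pymongo.MongoClient("localhost", 27017)
print(mongo_client.tieba.data.count_documents({}))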