Using Redis in the Scrapy framework
scrapy startproject <project name>
cd <project name>
scrapy genspider <spider name> <domain to crawl>
scrapy crawl <spider name>
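For the DuShu project built in the rest of this article (project demo01, spider DuShu, both names taken from the code below), those commands would be:
scrapy startproject demo01
cd demo01
scrapy genspider DuShu dushu.com
scrapy crawl DuShu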
⒈ Configure settings
Append the following at the end of settings.py:
# Use the Redis scheduler to store the request queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Make sure every spider deduplicates requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Schedule requests with a priority queue (the default)
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Redis server address (if Redis is not local, the pipeline below must be changed to match)
REDIS_HOST = '127.0.0.1'
# Redis port number; it must match the port the Redis server actually listens on
REDIS_PORT = 6379
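Before crawling, it is worth confirming that the Redis server configured above is actually reachable. A minimal check with redis-py (the same client the pipeline below uses, with the host/port from settings):

from redis import StrictRedis

# Smoke test: ping the Redis server configured in settings.py
r = StrictRedis(host='127.0.0.1', port=6379, db=0)
print(r.ping())  # prints True if the server answers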
Uncomment the following (the row numbers refer to lines in the generated settings.py):
row(12) BOT_NAME = 'demo01'
row(14) SPIDER_MODULES = ['demo01.spiders']
row(15) NEWSPIDER_MODULE = 'demo01.spiders'
row(22) ROBOTSTXT_OBEY = False  # whether to obey the robots.txt protocol
# enable the downloader middleware
row(55) DOWNLOADER_MIDDLEWARES = {
    'demo01.middlewares.Demo01DownloaderMiddleware': 543,
}
# enable the item pipeline
row(67) ITEM_PIPELINES = {
    'demo01.pipelines.Demo01Pipeline': 300,
}
If the crawl fails, you may also need to uncomment and edit this block in settings.py:
row(42) DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
}
pip install scrapy-redis
"""
scrapy-redis库是队列形式
如果运行爬虫文件,slaver获取一个url,数据库就会删除一个url
如果想在数据库中查看url,一定要先关闭爬虫文件
"""
⒉ Declare the fields to scrape in items.py
import scrapy

class DuShuItem(scrapy.Item):
    bookName = scrapy.Field()            # title
    bookAuthor = scrapy.Field()          # author
    bookIntroduction = scrapy.Field()    # introduction
    bookImg = scrapy.Field()             # cover image
    bookPrice = scrapy.Field()           # price
    bookPublishing = scrapy.Field()      # publisher
    bookPromotionPrice = scrapy.Field()  # promotional price
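Scrapy items behave like dicts, which is how both the spider and the pipeline below use them. A quick illustration (the sample values are made up):

item = DuShuItem()
item['bookName'] = '示例书名'  # made-up sample value
item['bookAuthor'] = '某作者'  # made-up sample value
print(dict(item))  # {'bookName': '示例书名', 'bookAuthor': '某作者'}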
⒊ In the spider file under the project directory, parse the data
import scrapy
from ..items import DuShuItem

class DuShuSpider(scrapy.Spider):
    name = 'DuShu'
    allowed_domains = ['dushu.com']
    start_urls = []
    for i in range(1, 10, 1):
        # The url pattern here can be found by comparing the urls of several list pages
        url = 'https://www.dushu.com/book/1175_{}.html'.format(i)
        # Add the url of every page to crawl to the set of start urls
        start_urls.append(url)

    @staticmethod
    def detection(content):
        """
        Guard against a failed extraction stopping the program.
        :return: the checked data
        """
        if not content:
            content = '暂无数据'  # placeholder meaning "no data"
        else:
            content = content[0]
        return content
    def parse(self, response):
        count = 0
        # In Scrapy, response has xpath support built in
        # Parse the page and pull out the matching data
        div_list = response.xpath(r'//div[@class="bookslist"]/ul/li/div[@class="book-info"]')
        for div in div_list:
            bookName = div.xpath(r'./h3/a/text()').extract()
            bookName = self.detection(bookName)
            bookAuthor = div.xpath(r'./p[1]/a/text()').extract()
            bookAuthor = self.detection(bookAuthor)
            bookImg = div.xpath(r'./div[@class="img152 float-left margin-right"]/a/img/@data-original').extract()
            bookImg = self.detection(bookImg)
            bookIntroduction = div.xpath(r'./p[2]/text()').extract()
            bookIntroduction = self.detection(bookIntroduction)
            # Create the item instance
            item = DuShuItem()
            item['bookName'] = bookName
            item['bookAuthor'] = bookAuthor
            item['bookImg'] = bookImg
            item['bookIntroduction'] = bookIntroduction
            # The remaining fields are not on the list page, so follow the link to the detail page
            new_url = div.xpath(r'./div[1]/a/@href').extract()
            new_url = self.detection(new_url)
            detail_url = 'https://www.dushu.com' + new_url
            count += 1
            # Request the detail page, passing the item along to the next callback
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'item': item, 'count': count})
    # Detail page (second-level page)
    def detail_parse(self, response):
        item = response.meta['item']
        count = response.meta['count']
        # If the crawl is interrupted or the data is incomplete, count shows
        # exactly how many items were parsed successfully
        print('count', count)
        # Start parsing
        div = response.xpath(r'.//div[@class="book-details"]')
        bookPrice = div.xpath(r'./div[@id="ctl00_c1_bookleft"]/p[@class="price"]/span[@class="num"]/text()').extract()
        bookPrice = self.detection(bookPrice)
        bookPublishing = div.xpath(r'./div[@id="ctl00_c1_bookleft"]/table/tbody/tr[2]/td[2]/a/text()').extract()
        bookPublishing = self.detection(bookPublishing)
        bookPromotionPrice = response.xpath(r'//div[@class="book-details-right bg-mix"]/ul/li/a/text()').extract()
        bookPromotionPrice = self.detection(bookPromotionPrice)
        item['bookPrice'] = bookPrice
        item['bookPublishing'] = bookPublishing
        item['bookPromotionPrice'] = bookPromotionPrice
        # There is no third-level page, so hand the complete item to the pipeline
        yield item
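Since detection() is a static helper, its behaviour can be checked in isolation; the inputs below are made up:

print(DuShuSpider.detection(['读书网']))  # a one-element list from extract() unwraps to '读书网'
print(DuShuSpider.detection([]))          # an empty extract() result becomes '暂无数据'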
⒋ Store the data in the pipeline
from redis import StrictRedis

class DuShuPipeline(object):
    def __init__(self):
        # Connect to the database
        self.redis = StrictRedis(
            host='127.0.0.1',
            port=6379,  # must match REDIS_PORT in settings
            db=0        # the Redis database to use
        )

    def process_item(self, item, spider):
        # Store each field in its own Redis list
        self.redis.lpush('bookName', item['bookName'])
        self.redis.lpush('bookAuthor', item['bookAuthor'])
        self.redis.lpush('bookIntroduction', item['bookIntroduction'])
        self.redis.lpush('bookImg', item['bookImg'])
        self.redis.lpush('bookPrice', item['bookPrice'])
        self.redis.lpush('bookPublishing', item['bookPublishing'])
        self.redis.lpush('bookPromotionPrice', item['bookPromotionPrice'])
        return item
"""
redis数据库不需要手动关闭
"""