
Using Redis with the Scrapy framework

scrapy startproject <project_name>
cd <project_name>
scrapy genspider <spider_name> <domain_to_crawl>
scrapy crawl <spider_name>
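
For the demo01 project and the DuShu spider built in the rest of this article, those commands would look roughly like this:
	scrapy startproject demo01
	cd demo01
	scrapy genspider DuShu dushu.com
	scrapy crawl DuShu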

⒈ Configure settings
	Add the following at the end of settings.py:
		# Use Redis as the scheduler's request-queue storage
		SCHEDULER = "scrapy_redis.scheduler.Scheduler"
		# Make sure all spiders deduplicate requests through Redis
		DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
		# Schedule requests with the priority queue (the default)
		SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
		# Redis server address (IP); if it is not local, the pipeline below must be updated as well
		REDIS_HOST = '127.0.0.1'
		# Redis port number
		REDIS_PORT = 6379	# must match the port the Redis server is listening on
	Uncomment:
		row(12)		BOT_NAME = 'demo01'
		row(14)		SPIDER_MODULES = ['demo01.spiders']
		row(15)		NEWSPIDER_MODULE = 'demo01.spiders'
		row(22)		ROBOTSTXT_OBEY = False	# whether to obey robots.txt (change from the default True)
		# Enable the downloader middleware
		row(55)		DOWNLOADER_MIDDLEWARES = {
						'demo01.middlewares.Demo01DownloaderMiddleware': 543,
					}
		# Enable the item pipeline
		row(67)		ITEM_PIPELINES = {
						'demo01.pipelines.Demo01Pipeline': 300,
					}
	If the crawl fails, you may also need to uncomment and adjust the default request headers:
		row(42)		DEFAULT_REQUEST_HEADERS = {
						'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
						'Accept-Language': 'en',
						'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
					}
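
	Before starting the crawl it is worth checking that Redis is actually reachable with the host and port configured above. A minimal check from Python (assuming the redis package is installed):
		from redis import StrictRedis
		# raises redis.exceptions.ConnectionError if the server cannot be reached
		StrictRedis(host='127.0.0.1', port=6379, db=0).ping()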


pip install scrapy-redis
"""
scrapy-redis keeps the pending requests in a Redis queue.
While the spider is running, every URL a slave takes from the queue is removed from the database,
so if you want to inspect the URLs stored in Redis, stop the spider first.
"""

⒉ Declare the fields to be scraped in items.py
	import scrapy

	class DuShuItem(scrapy.Item):
		bookName = scrapy.Field()               # book title
		bookAuthor = scrapy.Field()             # author
		bookIntroduction = scrapy.Field()       # introduction
		bookImg = scrapy.Field()                # cover image
		bookPrice = scrapy.Field()              # price
		bookPublishing = scrapy.Field()         # publisher
		bookPromotionPrice = scrapy.Field()     # promotional price

⒊ In the project's spiders directory, open the spider file and parse the data
	import scrapy
	from ..items import DuShuItem

	class DuShuSpider(scrapy.Spider):
		name = 'DuShu'
		allowed_domains = ['dushu.com']
		start_urls = []
		for i in range(1, 10, 1):
			# The URL pattern can be worked out by comparing the URLs of a few list pages
			url = 'https://www.dushu.com/book/1175_{}.html'.format(i)
			# Add every list page that should be crawled to the start URLs
			start_urls.append(url)

		def detection(self, content):
			"""
			Guard against failed extractions so the spider does not stop with an error.
			:return: the checked data
			"""
			if not content:
				content = '暂无数据'	# placeholder meaning "no data available"
			else:
				content = content[0]
			return content
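
		# Illustrative behaviour of detection() (hypothetical values, not taken from the site):
		#   self.detection([])          -> '暂无数据'   extract() found nothing, return the placeholder
		#   self.detection(['余华'])     -> '余华'       unwrap the single-element list that extract() returns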
		
		def parse(self, response):
			count = 0
			# In Scrapy, the response object has xpath() built in
			# Parse the list page and extract the fields
			div_list = response.xpath(r'//div[@class="bookslist"]/ul/li/div[@class="book-info"]')
			for div in div_list:
				bookName = div.xpath(r'./h3/a/text()').extract()
				bookName = self.detection(bookName)
				bookAuthor = div.xpath(r'./p[1]/a/text()').extract()
				bookAuthor = self.detection(bookAuthor)
				bookImg = div.xpath(r'./div[@class="img152 float-left margin-right"]/a/img/@data-original').extract()
				bookImg = self.detection(bookImg)
				bookIntroduction = div.xpath(r'./p[2]/text()').extract()
				bookIntroduction = self.detection(bookIntroduction)
				# Create an item instance
				item = DuShuItem()
				item['bookName'] = bookName
				item['bookAuthor'] = bookAuthor
				item['bookImg'] = bookImg
				item['bookIntroduction'] = bookIntroduction
				# The remaining fields are not on the list page, so follow the link to the detail page
				new_url = div.xpath(r'./div[1]/a/@href').extract()
				new_url = self.detection(new_url)
				detail_url = 'https://www.dushu.com' + new_url
				count += 1
				# Request the detail page and pass the partially filled item on to the next callback
				yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'item': item, 'count': count})
				
		# Detail page (second-level page)
		def detail_parse(self, response):
			item = response.meta['item']
			count = response.meta['count']
			# If the run is interrupted or data is missing, count shows how many records were parsed successfully
			print('count', count)
			# Parse the detail page
			div = response.xpath(r'.//div[@class="book-details"]')
			bookPrice = div.xpath(r'./div[@id="ctl00_c1_bookleft"]/p[@class="price"]/span[@class="num"]/text()').extract()
			bookPrice = self.detection(bookPrice)
			bookPublishing = div.xpath(r'./div[@id="ctl00_c1_bookleft"]/table/tbody/tr[2]/td[2]/a/text()').extract()
			bookPublishing = self.detection(bookPublishing)
			bookPromotionPrice = response.xpath(r'//div[@class="book-details-right bg-mix"]/ul/li/a/text()').extract()
			bookPromotionPrice = self.detection(bookPromotionPrice)
			item['bookPrice'] = bookPrice
			item['bookPublishing'] = bookPublishing
			item['bookPromotionPrice'] = bookPromotionPrice
			# There is no third-level page, so hand the completed item to the pipeline
			yield item
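
	The spider above still hard-codes its start_urls. scrapy-redis also provides a RedisSpider base class that reads its start URLs from a Redis list instead, which is how the URL queue described earlier is usually fed in a distributed setup. A minimal sketch of that variant (not what this project uses; the key name below is only an example):
		from scrapy_redis.spiders import RedisSpider

		class DuShuRedisSpider(RedisSpider):
			name = 'DuShuRedis'
			allowed_domains = ['dushu.com']
			# Start URLs are read from this Redis list instead of start_urls,
			# e.g. pushed with: lpush DuShuRedis:start_urls https://www.dushu.com/book/1175_1.html
			redis_key = 'DuShuRedis:start_urls'

			def parse(self, response):
				pass	# same parsing logic as in DuShuSpider above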

⒋ Store the data in the pipeline
	# Remember to register this class in ITEM_PIPELINES, e.g. 'demo01.pipelines.DuShuPipeline': 300
	from redis import StrictRedis

	class DuShuPipeline(object):
		def __init__(self):
			# Connect to Redis
			self.redis = StrictRedis(
						host='127.0.0.1',
						port=6379,	# must match REDIS_PORT in settings
						db=0		# which Redis database to write to
					)

		def process_item(self, item, spider):
			# Push every field onto its own Redis list
			self.redis.lpush('bookName', item['bookName'])
			self.redis.lpush('bookAuthor', item['bookAuthor'])
			self.redis.lpush('bookIntroduction', item['bookIntroduction'])
			self.redis.lpush('bookImg', item['bookImg'])
			self.redis.lpush('bookPrice', item['bookPrice'])
			self.redis.lpush('bookPublishing', item['bookPublishing'])
			self.redis.lpush('bookPromotionPrice', item['bookPromotionPrice'])
			return item
		"""
		The Redis connection does not need to be closed manually.
		"""