scrapy-redis: Completely Rewriting the start_requests Function
# Below is the scrapy-redis code that reads start URLs from a redis queue
# (excerpted from scrapy_redis/spiders.py, with the module's imports restored).
from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from scrapy_redis import connection, defaults
from scrapy_redis.utils import bytes_to_str
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None

    # Redis client placeholder.
    server = None

    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        if self.server is not None:
            return

        if crawler is None:
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")

        settings = crawler.settings

        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )

        self.redis_key = self.redis_key % {'name': self.name}

        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),
            )

        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")

        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)

        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)

        self.server = connection.from_settings(crawler.settings)
        # When the spider runs out of requests, the idle signal triggers
        # another batch read from the redis queue.
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
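    # Not shown in the article's excerpt: the spider_idle handler that the
    # signal above connects to. In the scrapy-redis source it looks roughly
    # like the sketch below -- it refills the scheduler from redis and keeps
    # the spider alive (paraphrased from the library; check your installed
    # version, as the engine.crawl signature varies across Scrapy releases):
    def schedule_next_requests(self):
        """Schedules a batch of requests read from redis, if any."""
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        self.schedule_next_requests()
        raise DontCloseSpider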
    def next_requests(self):
        """Returns requests to be scheduled, or none."""
        # REDIS_START_URLS_AS_SET controls whether the key is read as a
        # redis set (spop) or as a list (lpop).
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        found = 0
        # redis_batch_size is the number of requests fetched per batch.
        while found < self.redis_batch_size:
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)  # returns a Request object
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)

        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)
    def make_request_from_data(self, data):
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)
class RedisSpider(RedisMixin, Spider):
    # RedisSpider inherits from RedisMixin and scrapy's Spider class.

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        obj = super(RedisSpider, cls).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj
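The setup_redis method above reads all of its configuration from the project settings. For reference, a minimal settings.py for scrapy-redis might look like the following sketch (the redis URL and batch size are illustrative assumptions, not values from the original article):

# settings.py -- minimal scrapy-redis configuration (illustrative values)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # schedule requests through redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # deduplicate fingerprints in redis
REDIS_URL = 'redis://localhost:6379'                        # assumed local redis instance
REDIS_START_URLS_AS_SET = False                             # read start urls with lpop (list semantics)
REDIS_START_URLS_BATCH_SIZE = 16                            # defaults to CONCURRENT_REQUESTS if unset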
A test project that rewrites start_requests is available at https://github.com/pythonlw/scrapy_redis_spiders/. Alternatively, you can override only the make_requests_from_url function.
import json
import redis
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
# from scrapy.utils.project import get_project_settings  # read settings attributes
# CONCURRENT_REQUESTS = get_project_settings().get('CONCURRENT_REQUESTS')

# Rewrite the start_requests function.
class RedisSpiderSpider(RedisSpider):
    name = 'redis_spider'
    allowed_domains = ['www.baidu.com']
    start_urls = []
    redis_key = 'redis_spider_key'
    redis_server = redis.StrictRedis()

    def start_requests(self):
        # Read urls from the redis queue.
        CONCURRENT_REQUESTS = 5
        found = 0
        while found < CONCURRENT_REQUESTS:
            obj = self.redis_server.rpop(self.redis_key)
            if obj is None:
                # Queue empty; stop instead of busy-waiting forever.
                break
            obj = obj.decode()
            print('obj:', obj)
            obj1 = {'url': json.loads(obj)['url']}
            req = self.make_requests_from_url(json.dumps(obj1))
            if req:
                yield req
                found += 1
                print('found:', found)

        # Run once only:
        # obj1 = {'url': 'http://www.baidu.com'}
        # print('obj1:', obj1)
        # req = self.make_requests_from_url(json.dumps(obj1))
        # if req:
        #     yield req
    def make_requests_from_url(self, url):
        # Here `url` is a JSON string popped from redis, e.g. '{"url": "..."}'.
        url = json.loads(url)
        item0 = {'url': url['url']}
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/84.0.4115.0 Safari/537.36',
        }
        return Request(url=url['url'], headers=headers,
                       meta={'item0': item0, 'headers': headers}, dont_filter=True)
    def parse(self, response):
        print('response:', response.url)
        print(response.status)
        print(dir(response))
        print('headers:', response.request.headers)
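Every seed pushed into redis for this spider must be a JSON string containing a url field, since start_requests above pops raw strings and json.loads them. A minimal seeding sketch (the key matches the spider's redis_key; the URL is just an example):

# push_seeds.py -- push JSON-encoded seed urls into the spider's key
import json
import redis

r = redis.StrictRedis()  # assumes redis on localhost:6379
r.lpush('redis_spider_key', json.dumps({'url': 'http://www.baidu.com'}))
print('queue length:', r.llen('redis_spider_key'))

Because the spider pops with rpop, seeding with lpush gives first-in, first-out ordering.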