
scrapy-redis: Completely Rewriting the start_requests Function


from scrapy import signals
from scrapy.exceptions import DontCloseSpider
from scrapy.spiders import Spider, CrawlSpider

from scrapy_redis import connection, defaults
from scrapy_redis.utils import bytes_to_str

# This is where scrapy-redis reads URLs from the redis queue.
class RedisMixin(object):
    """Mixin class to implement reading urls from a redis queue."""
    redis_key = None
    redis_batch_size = None
    redis_encoding = None
    # Redis client placeholder.
    server = None
    def start_requests(self):
        """Returns a batch of start requests from redis."""
        return self.next_requests()

    def setup_redis(self, crawler=None):
        if self.server is not None:
            return
        if crawler is None:
            crawler = getattr(self, 'crawler', None)
        if crawler is None:
            raise ValueError("crawler is required")
        settings = crawler.settings
        if self.redis_key is None:
            self.redis_key = settings.get(
                'REDIS_START_URLS_KEY', defaults.START_URLS_KEY,
            )
        self.redis_key = self.redis_key % {'name': self.name}
        if not self.redis_key.strip():
            raise ValueError("redis_key must not be empty")

        if self.redis_batch_size is None:
            self.redis_batch_size = settings.getint(
                'REDIS_START_URLS_BATCH_SIZE',
                settings.getint('CONCURRENT_REQUESTS'),)
        try:
            self.redis_batch_size = int(self.redis_batch_size)
        except (TypeError, ValueError):
            raise ValueError("redis_batch_size must be an integer")
        if self.redis_encoding is None:
            self.redis_encoding = settings.get('REDIS_ENCODING', defaults.REDIS_ENCODING)
        self.logger.info("Reading start URLs from redis key '%(redis_key)s' "
                         "(batch size: %(redis_batch_size)s, encoding: %(redis_encoding)s)",
                         self.__dict__)
        self.server = connection.from_settings(crawler.settings)
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        # REDIS_START_URLS_AS_SET determines whether the key is a set or a list.
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        found = 0
        while found < self.redis_batch_size:  # redis_batch_size caps each batch
            data = fetch_one(self.redis_key)
            if not data:
                # Queue empty.
                break
            req = self.make_request_from_data(data)  # returns a Request object
            if req:
                yield req
                found += 1
            else:
                self.logger.debug("Request not made from data: %r", data)
        if found:
            self.logger.debug("Read %s requests from '%s'", found, self.redis_key)

    def make_request_from_data(self, data):
        # Decode the raw bytes popped from redis into a URL and build a Request.
        url = bytes_to_str(data, self.redis_encoding)
        return self.make_requests_from_url(url)
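
    def schedule_next_requests(self):
        # (Not in the original excerpt; sketched from the scrapy-redis source
        # so that the spider_idle signal connected in setup_redis above
        # resolves.) Feed the next batch of requests from redis into the engine.
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        # When the spider goes idle, schedule another batch and raise
        # DontCloseSpider so the spider keeps waiting instead of shutting down.
        self.schedule_next_requests()
        raise DontCloseSpider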

class RedisSpider(RedisMixin, Spider):
    # RedisSpider inherits from RedisMixin and Scrapy's Spider class.
    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        obj = super(RedisSpider, self).from_crawler(crawler, *args, **kwargs)
        obj.setup_redis(crawler)
        return obj

A test project that completely rewrites the start_requests function is at https://github.com/pythonlw/scrapy_redis_spiders/. Alternatively, you can override only the make_requests_from_url function; a sketch of that lighter approach follows at the end of this article.
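
The spider below pops JSON payloads of the form {'url': ...} from the list redis_spider_key. To seed that queue you can push entries with the same key and payload shape; a minimal sketch, assuming a local redis on the default port:

import json
import redis

r = redis.StrictRedis()  # assumption: local redis on the default port
# lpush pairs with the spider's rpop below, so URLs are handled FIFO.
r.lpush('redis_spider_key', json.dumps({'url': 'http://www.baidu.com'}))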

import json

import redis
from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider
# from scrapy.utils.project import get_project_settings  # read settings values
# CONCURRENT_REQUESTS = get_project_settings().get('CONCURRENT_REQUESTS')

# Override the start_requests function.
class RedisSpiderSpider(RedisSpider):
    name = 'redis_spider'
    allowed_domains = ['www.baidu.com']
    start_urls = []
    redis_key = 'redis_spider_key'
    redis_server = redis.StrictRedis()
    def start_requests(self):
        # Read URLs from the redis queue.
        CONCURRENT_REQUESTS = 5
        found = 0
        while found < CONCURRENT_REQUESTS:
            obj = self.redis_server.rpop(self.redis_key)
            if not isinstance(obj, bytes):
                # rpop returns None when the queue is empty; break instead of
                # spinning forever (the original `continue` busy-loops here).
                break
            obj = obj.decode()
            print('obj:', obj)
            obj1 = {'url': json.loads(obj)['url']}
            req = self.make_requests_from_url(json.dumps(obj1))
            if req:
                yield req
                found += 1
                print('found:', found)
        # Run only once:
        # obj1 = {'url': 'http://www.baidu.com'}
        # print('obj1:', obj1)
        # req = self.make_requests_from_url(json.dumps(obj1))
        # if req:
        #     yield req

    def make_requests_from_url(self, url):
        # Here `url` is actually a JSON string; unpack it to build the Request.
        url = json.loads(url)
        item0 = {'url': url['url']}
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4115.0 Safari/537.36'}
        return Request(url=url['url'], headers=headers,
                       meta={'item0': item0, 'headers': headers}, dont_filter=True)

    def parse(self, response):
        print('response:',response.url)
        print(response.status)
        print(dir(response))
        print('headers:',response.request.headers)
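
As noted above, instead of replacing start_requests entirely you can keep RedisMixin's batching loop and override only the step that turns a popped redis entry into a Request. A minimal sketch, overriding make_request_from_data (the hook that next_requests actually calls, which also sidesteps Scrapy's deprecated make_requests_from_url); the JSON payload shape {'url': ...} mirrors the example above and is an assumption about your queue format, not part of scrapy-redis:

import json

from scrapy.http import Request
from scrapy_redis.spiders import RedisSpider

class JsonRedisSpider(RedisSpider):
    name = 'json_redis_spider'      # hypothetical spider name
    redis_key = 'redis_spider_key'  # same key as the example above

    def make_request_from_data(self, data):
        # `data` is the raw bytes popped by RedisMixin.next_requests();
        # decode the JSON payload and build the Request directly.
        payload = json.loads(data.decode(self.redis_encoding or 'utf-8'))
        return Request(url=payload['url'], dont_filter=True)

    def parse(self, response):
        print('response:', response.url)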