python开源爬虫框架scrapy源码解析(五)
程序员文章站
2024-02-18 12:07:16
...
调用下载器downloader返回response后将使用_handle_downloader_output(self, response, request, spider)解析response。
def _handle_downloader_output(self, response, request, spider):
    """Dispatch what the downloader produced for ``request``.

    ``response`` must be a ``Request``, ``Response`` or ``Failure``.

    * A ``Request`` is passed back to ``self.crawl`` and nothing is returned.
    * Anything else is handed to ``self.scraper.enqueue_scrape``; the
      deferred it returns gets an error-logging errback and is returned.
    """
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):
        # a Request result is fed back into crawl() instead of being scraped
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
看完上面代码后就应该了解实际的解析是由scraper.py的scraper.enqueue_scrape(response, request, spider)方法来完成的。
def enqueue_scrape(self, response, request, spider):
slot = self.slot
dfd = slot.add_response_request(response, request)
def finish_scraping(_):
slot.finish_response(response, request)
self._check_if_closing(spider, slot)
self._scrape_next(spider, slot)
return _
dfd.addBoth(finish_scraping)
dfd.addErrback(
lambda f: logger.error('Scraper bug processing %(request)s',
{'request': request},
exc_info=failure_to_exc_info(f),
extra={'spider': spider}))
self._scrape_next(spider, slot)
return dfd def _scrape_next(self, spider, slot):
while slot.queue:
response, request, deferred = slot.next_response_request_deferred() #从dequez中取出response request
self._scrape(response, request, spider).chainDeferred(deferred)
_scrape方法通过调用_scrape2返回ITEM,然后将ITEM传递到handle_spider_output方法.
def _scrape(self, response, request, spider):
    """Handle the downloaded response or failure through the spider
    callback/errback.

    ``response`` must be a ``Response`` or ``Failure``. The deferred from
    ``self._scrape2`` gets ``self.handle_spider_error`` as errback and then
    ``self.handle_spider_output`` as callback, and is returned.
    """
    assert isinstance(response, (Response, Failure))
    dfd = self._scrape2(response, request, spider)  # returns spiders processed output
    dfd.addErrback(self.handle_spider_error, request, response, spider)
    # the spider's output is then handed to handle_spider_output
    dfd.addCallback(self.handle_spider_output, request, response, spider)
    return dfd
_scrape2方法通过调用call_spider实现回调request的callback或spider默认的parse方法解析response返回ITEM.
def _scrape2(self, request_result, request, spider):
    """Handle the different cases of request's result being a Response or a
    Failure.

    A non-``Failure`` result goes through ``self.spidermw.scrape_response``
    with ``self.call_spider`` as the callable to run. A ``Failure`` is passed
    straight to ``self.call_spider``, with ``self._log_download_errors``
    attached as errback. Either way the resulting value is returned.
    """
    if not isinstance(request_result, Failure):
        # call_spider is handed to the spider middleware as the function to invoke
        return self.spidermw.scrape_response(
            self.call_spider, request_result, request, spider)
    else:
        # FIXME: don't ignore errors in spider middleware
        dfd = self.call_spider(request_result, request, spider)
        return dfd.addErrback(
            self._log_download_errors, request_result, request, spider)
def call_spider(self, result, request, spider):
    """Feed ``result`` to the request's callback/errback.

    ``result.request`` is set to ``request`` first. The callback is
    ``request.callback`` or, when that is falsy, ``spider.parse``; the
    errback is ``request.errback``. ``iterate_spider_output`` is added as a
    final callback and the deferred is returned.
    """
    result.request = request
    dfd = defer_result(result)
    # use the request's own callback, falling back to the spider's parse()
    dfd.addCallbacks(request.callback or spider.parse, request.errback)
    return dfd.addCallback(iterate_spider_output)


def handle_spider_output(self, result, request, response, spider):
    """Process every element of the spider's output ``result``.

    A falsy ``result`` short-circuits to ``defer_succeed(None)``. Otherwise
    ``result`` is wrapped with ``iter_errback`` (using
    ``self.handle_spider_error``) and passed to ``parallel`` together with
    ``self.concurrent_items`` and ``self._process_spidermw_output``; the
    value ``parallel`` returns is returned.
    """
    if not result:
        return defer_succeed(None)
    it = iter_errback(result, self.handle_spider_error, request, response, spider)
    dfd = parallel(it, self.concurrent_items,
                   self._process_spidermw_output, request, response, spider)
    return dfd
handle_spider_output中通过parallel将ITEM传参到_process_spidermw_output方法
def handle_spider_output(self, result, request, response, spider):
    """Process every element of the spider's output ``result``.

    A falsy ``result`` short-circuits to ``defer_succeed(None)``. Otherwise
    ``result`` is wrapped with ``iter_errback`` (using
    ``self.handle_spider_error``) and passed to ``parallel`` together with
    ``self.concurrent_items`` and ``self._process_spidermw_output``; the
    value ``parallel`` returns is returned.
    """
    if not result:
        return defer_succeed(None)
    it = iter_errback(result, self.handle_spider_error, request, response, spider)
    dfd = parallel(it, self.concurrent_items,
                   self._process_spidermw_output, request, response, spider)
    return dfd
_process_spidermw_output(self, output, request, response, spider)方法中通过调用itemproc.process_item将ITEM通过pipelines持久化到本地。
def _process_spidermw_output(self, output, request, response, spider):
    """Process each Request/Item (given in the output parameter) returned
    from the given spider.

    * ``Request``: passed to ``self.crawler.engine.crawl``.
    * ``BaseItem`` or ``dict``: ``self.slot.itemproc_size`` is incremented,
      the item is sent through ``self.itemproc.process_item`` and the
      resulting deferred (with ``self._itemproc_finished`` attached for both
      outcomes) is returned.
    * ``None``: ignored.
    * anything else: an error naming the offending type is logged.

    Only the item branch returns a value; the others return ``None``.
    """
    if isinstance(output, Request):
        self.crawler.engine.crawl(request=output, spider=spider)
    elif isinstance(output, (BaseItem, dict)):
        # items are pushed through the item processor (self.itemproc)
        self.slot.itemproc_size += 1
        dfd = self.itemproc.process_item(output, spider)
        dfd.addBoth(self._itemproc_finished, output, response, spider)
        return dfd
    elif output is None:
        pass
    else:
        typename = type(output).__name__
        logger.error('Spider must return Request, BaseItem, dict or None, '
                     'got %(typename)r in %(request)s',
                     {'request': request, 'typename': typename},
                     extra={'spider': spider})
最后说下将数据持久化到本地的实现。
def process_item(self, item, spider):
    """Run ``item`` through the ``'process_item'`` chain.

    Delegates to ``self._process_chain('process_item', item, spider)`` and
    returns whatever that call returns.
    """
    return self._process_chain('process_item', item, spider)
def _add_middleware(self, pipe):
    """Register the item pipeline ``pipe`` with this manager.

    The parent class's ``_add_middleware`` runs first; then, if ``pipe``
    has a ``process_item`` attribute, that attribute is appended to
    ``self.methods['process_item']``.
    """
    super(ItemPipelineManager, self)._add_middleware(pipe)
    if hasattr(pipe, 'process_item'):
        self.methods['process_item'].append(pipe.process_item)
转载于:https://my.oschina.net/sojie/blog/651062