欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

Scrapy之Images Pipeline

程序员文章站 2022-03-02 21:16:25
...

items.py

import scrapy

class MyItem(scrapy.Item):
    """Item carrying the image URLs to download and the resulting storage paths."""

    # ... other item fields ...
    # Image URLs to fetch; ZhihuImagesPipeline.get_media_requests iterates this field.
    img_urls = scrapy.Field()
    # Paths of the successfully downloaded images; set by
    # ZhihuImagesPipeline.item_completed.
    img_paths = scrapy.Field()

pipelines.py

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class ZhihuImagesPipeline(ImagesPipeline):
    """Download the images listed on an item and record where they were stored.

    Reads the URLs from ``item['img_urls']`` and writes the stored file paths
    to ``item['img_paths']``. Items for which no image could be downloaded
    are dropped.
    """

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['img_urls']``.

        A missing, ``None`` or empty ``img_urls`` field yields no requests
        rather than raising ``KeyError``/``TypeError``; such an item ends up
        with no successful results and is dropped by ``item_completed``.

        Args:
            item: The scraped item (dict-like; must support ``.get``).
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Yields:
            scrapy.Request: A request for each image URL.
        """
        # `or ()` also covers a field that is present but set to None.
        for img_url in item.get('img_urls') or ():
            yield scrapy.Request(img_url)

    def item_completed(self, results, item, info):
        """Store the paths of the downloaded images on the item.

        Args:
            results: List of ``(ok, value)`` tuples, one per request; when
                ``ok`` is true, ``value`` is a dict that has a ``'path'`` key.
            item: The item whose image requests have all finished.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).

        Returns:
            The same item with ``item['img_paths']`` set to the list of paths.

        Raises:
            DropItem: If none of the downloads succeeded.
        """
        # Failed downloads (ok is False) carry no 'path', so skip them.
        img_paths = [outcome['path'] for ok, outcome in results if ok]
        if not img_paths:
            raise DropItem("Item contains no images")
        item['img_paths'] = img_paths
        return item

注释

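results 是一个由二元组 (success, info_or_failure) 组成的列表,每个图片请求对应一项:下载成功时 success 为 True,第二项是包含 url、path、checksum 的字典;下载失败时 success 为 False,第二项是 Failure 对象。典型值如下: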

 [(True,
  {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
   'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
   'url': 'http://www.example.com/files/product1.pdf'}),
 (False,
  Failure(...))]

settings.py

# Enable the custom image pipeline. The dotted path must name the class that
# pipelines.py actually defines (ZhihuImagesPipeline).
# Lower numbers run earlier (higher priority).
ITEM_PIPELINES = {'myProject.pipelines.ZhihuImagesPipeline': 1}
# Directory where the downloaded images are stored.
IMAGES_STORE = 'D:\\path\\...'