欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Scrapy框架自定义pipeline两层下载路径去下载图片,关于item传值的问题

程序员文章站 2022-05-05 16:09:54
...

自定义两层下载路径时,item 中的分类信息需要通过 Request 的 meta 参数在各回调函数之间逐层传值,爬虫代码如下:

import scrapy
from urllib.request import urljoin
from ..items import OffmymindspiderItem
class OffmymindSpider(scrapy.Spider):
    """Crawl biaobaiju.com and yield image items tagged with a two-level
    category path: img_type_name / img_small_type_name.

    The category names are threaded through the callbacks via each
    Request's ``meta`` dict so the pipeline can build the save path.
    """

    name = 'offmymind'
    allowed_domains = ['www.biaobaiju.com']
    start_urls = ['http://www.biaobaiju.com/']

    def parse(self, response):
        """
        Extract the URL and name of every top-level category on the home page.

        :param response: home-page response
        :return: one Request per category, carrying the category name in meta
        """
        a_list = response.xpath("//ul[@class='nav clearfix']/li/a")
        for a in a_list:
            img_type_url = a.xpath("@href").extract_first("")
            img_type_name = a.xpath("text()").extract_first("")
            if not img_type_url:
                # Skip anchors without an href instead of requesting "".
                continue
            # response.urljoin makes relative hrefs absolute; dont_filter=False
            # is scrapy's default, so it is not passed explicitly.
            yield scrapy.Request(
                url=response.urljoin(img_type_url),
                callback=self.parse_img_type_info,
                meta={"img_type_name": img_type_name},
            )

    def parse_img_type_info(self, response):
        """
        Parse one category page and yield a Request per image-set (sub-category).

        :param response: category-page response; meta carries img_type_name
        :return: one Request per image set, meta carries both category names
        """
        div_list = response.xpath("//ul[@id='container']/li/div[2]")
        img_type_name = response.meta.get("img_type_name")
        for div in div_list:
            img_small_type_href = div.xpath("a/@href").extract_first("")
            img_small_type_name = div.xpath("a/text()").extract_first("")
            if not img_small_type_href:
                continue
            # Build a FRESH meta dict for each request. The original code
            # mutated response.meta in place and handed the SAME dict to every
            # yielded request, so requests scheduled earlier could observe the
            # last iteration's img_small_type_name (and scrapy-internal meta
            # keys leaked through as well).
            yield scrapy.Request(
                url=response.urljoin(img_small_type_href),
                dont_filter=True,
                callback=self.parse_every_small_type_info,
                meta={
                    "img_type_name": img_type_name,
                    "img_small_type_name": img_small_type_name,
                },
            )
        # NOTE: pagination is deliberately skipped — only the first page of
        # each category is downloaded (following every "next page" link would
        # fetch far too many images for this demo).

    def parse_every_small_type_info(self, response):
        """
        Parse one image-set page and yield an item per image found.

        :param response: image-set page response; meta carries both category names
        :return: OffmymindspiderItem per non-empty img/@src
        """
        p_list = response.xpath("//div[@class='content tag_blue']/p")
        for p in p_list:
            img_url = p.xpath("img/@src").extract_first("")
            if not img_url:
                # Nothing to download for this <p>; skip it.
                continue
            item = OffmymindspiderItem()
            # ImagesPipeline expects a LIST of URLs, hence the wrapping.
            item["img_url"] = [img_url]
            item["img_small_type_name"] = response.meta.get("img_small_type_name")
            item["img_type_name"] = response.meta.get("img_type_name")
            yield item

items.py中的代码

import scrapy
class ZhanzhangsucaispiderItem(scrapy.Item):
    """Item carrying one image's download URL(s), its two-level category
    names, and the path the pipeline stored it under."""

    name = scrapy.Field()
    img_url = scrapy.Field()       # list of image URLs (ImagesPipeline expects a list)
    img_path = scrapy.Field()      # filled in by the pipeline's item_completed()
    # The spider assigns these two keys on the item; scrapy.Item raises
    # KeyError for undeclared fields, so they MUST be declared here.
    img_type_name = scrapy.Field()
    img_small_type_name = scrapy.Field()


# The spider does `from ..items import OffmymindspiderItem`, but this module
# defines the class under a different (pasted-in) name. Expose it under the
# expected name as well so the import works; backward-compatible alias.
OffmymindspiderItem = ZhanzhangsucaispiderItem

settings.py中修改的内容

ROBOTSTXT_OBEY = False  # around line 22 of the generated settings.py: must be changed to False
# around line 67 of the generated settings.py
ITEM_PIPELINES = {
   'OffMyMindSpider.pipelines.CustomImagesPipeline': 300,
}
IMAGES_STORE = "imgs"  # root dir for downloaded images; relative path is presumably resolved from the directory the crawl is started in — TODO confirm

pipelines.py中的代码

from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import scrapy
class CustomImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that stores each image under a two-level
    directory: <IMAGES_STORE>/<img_type_name>/<img_small_type_name>/<file>."""

    def get_media_requests(self, item, info):
        """Schedule one download request per image URL, carrying the item in
        meta so file_path() can read the category names back out."""
        for img_download_url in item.get("img_url"):
            yield scrapy.Request(url=img_download_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        """Return the relative save path for ONE downloaded image.

        :param request: the download request created in get_media_requests
        :return: "<type>/<small_type>/<filename>" relative to IMAGES_STORE
        """
        item = request.meta["item"]
        img_type_name = item["img_type_name"]
        img_small_type_name = item["img_small_type_name"]
        # Use THIS request's URL for the file name. The original code always
        # took item["img_url"][0], which makes every image of a multi-URL
        # item collide on the first URL's file name.
        file_name = request.url.split("/")[-1]
        return "%s/%s/%s" % (img_type_name, img_small_type_name, file_name)

    def item_completed(self, results, item, info):
        """Record the stored path on the item; drop items whose download failed.

        ``results`` is a list of ``(success, value)`` tuples. On failure the
        value is a Failure object with no "path" key — the original
        ``results[0][1].get("path")`` raised AttributeError in that case (and
        IndexError when results was empty) instead of dropping the item.
        """
        img_paths = [x["path"] for ok, x in results if ok]
        if not img_paths:
            raise DropItem("Image download failed, delete the corresponding item value, do not let it return out")
        item["img_path"] = img_paths[0]
        return item