Scrapy框架自定义pipeline两层下载路径去下载图片,关于item传值的问题
程序员文章站
2022-05-05 16:09:54
...
自定义两层路径的时候,item是需要经过传值的,爬虫函数如下
import scrapy
from urllib.request import urljoin
from ..items import OffmymindspiderItem
class OffmymindSpider(scrapy.Spider):
    """Crawl biaobaiju.com: category pages -> sub-category (gallery) pages -> image URLs.

    The category / sub-category names are threaded through ``Request.meta`` so
    the final item can record which two-level directory each image belongs to.
    """

    name = 'offmymind'
    allowed_domains = ['www.biaobaiju.com']
    start_urls = ['http://www.biaobaiju.com/']

    def parse(self, response):
        """Extract the URL and name of every top-level image category.

        :param response: the site front page.
        :return: one Request per category, carrying ``img_type_name`` in meta.
        """
        a_list = response.xpath("//ul[@class='nav clearfix']/li/a")
        for a in a_list:
            img_type_url = a.xpath("@href").extract_first("")
            img_type_name = a.xpath("text()").extract_first("")
            yield scrapy.Request(
                url=img_type_url,
                dont_filter=False,
                callback=self.parse_img_type_info,
                meta={"img_type_name": img_type_name},
            )

    def parse_img_type_info(self, response):
        """Parse one category page and follow every sub-category (gallery) link.

        :param response: one category listing page.
        :return: one Request per gallery, carrying both path components in meta.
        """
        img_type_name = response.meta.get("img_type_name")
        div_list = response.xpath("//ul[@id='container']/li/div[2]")
        for div in div_list:
            img_small_type_href = div.xpath("a/@href").extract_first("")
            img_small_type_name = div.xpath("a/text()").extract_first("")
            # BUG FIX: the original mutated a single shared meta dict inside
            # this loop.  Scrapy only *schedules* the yielded requests, so all
            # of them kept a reference to that one dict and every callback saw
            # the LAST sub-category name.  Build a fresh dict per request.
            yield scrapy.Request(
                url=img_small_type_href,
                dont_filter=True,
                callback=self.parse_every_small_type_info,
                meta={
                    "img_type_name": img_type_name,
                    "img_small_type_name": img_small_type_name,
                },
            )
        # Pagination deliberately disabled by the author (too many pages);
        # only the first page of each category is crawled.  NOTE(review): the
        # file imports urljoin from urllib.request, which only works by
        # accident — urllib.parse is the canonical module.
        # href = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first("")
        # if href != "":
        #     next_page_url = urljoin(response.url, href)
        #     yield scrapy.Request(url=next_page_url, dont_filter=True,
        #                          callback=self.parse_img_type_info,
        #                          meta=dict(response.meta))

    def parse_every_small_type_info(self, response):
        """Parse one gallery page and yield an item per image URL found.

        :param response: one gallery page; meta holds both category names.
        :return: OffmymindspiderItem instances for the images pipeline.
        """
        p_list = response.xpath("//div[@class='content tag_blue']/p")
        for p in p_list:
            img_url = p.xpath("img/@src").extract_first("")
            if not img_url:
                continue  # paragraph without an <img> tag
            item = OffmymindspiderItem()
            item["img_url"] = [img_url]  # ImagesPipeline expects a list of URLs
            # Category names travel in response.meta, set by the callbacks above.
            item["img_small_type_name"] = response.meta.get("img_small_type_name")
            item["img_type_name"] = response.meta.get("img_type_name")
            yield item
items.py中的代码
import scrapy


class OffmymindspiderItem(scrapy.Item):
    # CONSISTENCY FIX: the spider imports OffmymindspiderItem and fills
    # img_type_name / img_small_type_name / img_url, and the pipeline writes
    # img_path — the pasted class (ZhanzhangsucaispiderItem with only
    # name/img_url/img_path) matched neither.  All required fields declared;
    # the original `name` field is kept for backward compatibility.
    name = scrapy.Field()                 # kept from the original snippet
    img_type_name = scrapy.Field()        # first-level directory component
    img_small_type_name = scrapy.Field()  # second-level directory component
    img_url = scrapy.Field()              # list with the image download URL
    img_path = scrapy.Field()             # relative path set by item_completed()
settings.py中修改的内容
# Do not honor robots.txt (the site disallows crawling otherwise).
ROBOTSTXT_OBEY = False

# BUG FIX: in the pasted article this whole configuration was fused into a
# single "#第67行 ..." comment line, so the custom pipeline was never enabled.
ITEM_PIPELINES = {
    'OffMyMindSpider.pipelines.CustomImagesPipeline': 300,
}
# Root directory for ImagesPipeline downloads; file_path() builds the
# <category>/<sub-category>/<filename> path beneath it.
IMAGES_STORE = "imgs"
pipelines.py中的代码
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import scrapy
class CustomImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that stores each image under
    ``<IMAGES_STORE>/<category>/<sub-category>/<original filename>``.
    """

    def get_media_requests(self, item, info):
        """Yield one download Request per image URL, attaching the item in
        meta so file_path() can read the two category names back."""
        for img_download_url in item.get("img_url"):
            yield scrapy.Request(url=img_download_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        """Build the two-level relative storage path for one downloaded image."""
        item = request.meta["item"]
        img_type_name = item["img_type_name"]
        img_small_type_name = item["img_small_type_name"]
        # BUG FIX: use THIS request's URL for the file name.  The original
        # used item.get("img_url")[0], so every image of a multi-URL item
        # collided on the first URL's file name.
        file_name = request.url.split("/")[-1]
        return "%s/%s/%s" % (img_type_name, img_small_type_name, file_name)

    def item_completed(self, results, item, info):
        """Record the stored path on the item, dropping items whose download
        failed.

        :param results: list of (success, info-dict-or-Failure) tuples.
        :raises DropItem: when no image was downloaded successfully.
        """
        # BUG FIX: the original indexed results[0][1] unconditionally, which
        # raised IndexError/KeyError on a failed download instead of the
        # intended DropItem; filter on the per-result success flag instead.
        image_paths = [data.get("path") for ok, data in results if ok]
        if not image_paths:
            raise DropItem("Image download failed, delete the corresponding item value, do not let it return out")
        item["img_path"] = image_paths[0]
        return item
上一篇: 清空购物车靠删
下一篇: 这4个门受过严重的伤