Scrapy框架学习 - 扩展内置的ImagesPipeline实现图片下载
程序员文章站
2022-05-26 23:30:14
...
分析
需求:下载斗鱼主播图片
思路:
- 使用Fiddler抓包工具,抓取斗鱼APP中关于主播信息的请求接口,响应数据为Json格式
- 在items中定义Item,包含 image_urls 和 images 字段
- 在spider中提取图片链接,保存到Item的 image_urls 字段中,返回Item
- 在Pipeline中通过继承内置的ImagesPipeline类,扩展功能,实现自定义ImagesPipeline
  核心是重写 get_media_requests 和 item_completed 这2个方法
- 在配置文件中进行相关配置
代码实现
items
class DouyuMeiziItem(scrapy.Item):
    """Item holding one Douyu streamer's image data."""

    # Iterable of image URLs to be downloaded for this streamer.
    image_urls = scrapy.Field()
    # Declared next to ``image_urls``; presumably reserved for the image
    # pipeline's download results -- none of the code shown here sets it.
    images = scrapy.Field()
    # Streamer's nickname.
    nickname = scrapy.Field()
    # Path under which the downloaded image is stored.
    image_paths = scrapy.Field()
spider
# !/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import scrapy
from myscrapy.items import DouyuMeiziItem
class DouyuzhuboSpider(scrapy.Spider):
    """Collect streamer image links from the Douyu vertical-room JSON API.

    Each room in a response page becomes one ``DouyuMeiziItem`` carrying the
    image URL and the streamer's nickname; the actual download is left to the
    image pipeline. Pages are requested one after another by advancing
    ``offset`` until ``max_offset`` is reached or a page has no rooms.
    """

    name = 'douyuzhubo'
    allowed_domains = ['douyucdn.cn', ]
    # Amount added to ``offset`` per page; matches ``limit=20`` in ``base_url``.
    page_size = 20
    # No further page is requested once ``offset`` has reached this value.
    max_offset = 230
    offset = 0
    base_url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    start_urls = [base_url + str(offset), ]

    def parse(self, response):
        """Yield one item per room on the page, then request the next page.

        Args:
            response: The API response; its text is expected to be a JSON
                object with the room list under the ``data`` key.

        Yields:
            ``DouyuMeiziItem`` objects, followed by at most one
            ``scrapy.Request`` for the next page.
        """
        try:
            payload = json.loads(response.text)
        except ValueError:
            self.logger.warning('Non-JSON response from %s', response.url)
            return

        rooms = payload.get('data') if isinstance(payload, dict) else None
        if not isinstance(rooms, list) or not rooms:
            # An error payload or an empty page: nothing to extract, and
            # asking for further pages would be pointless.
            self.logger.info('No room data in response from %s', response.url)
            return

        for room in rooms:
            if not isinstance(room, dict):
                continue
            image_url = room.get('vertical_src')
            nickname = room.get('nickname')
            if not image_url or not nickname:
                # Skip incomplete entries instead of aborting the whole page.
                continue
            item = DouyuMeiziItem()
            # This field must be an iterable of image links, hence the list.
            item['image_urls'] = [image_url]
            item['nickname'] = nickname
            yield item

        if self.offset < self.max_offset:
            self.offset += self.page_size
            yield scrapy.Request(
                url=self.base_url + str(self.offset),
                callback=self.parse,
            )
pipelines
import os
import json
import pymysql
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings
class DouyuzhuboPipeline(ImagesPipeline):
    """Item pipeline for the Douyu streamer spider.

    Extends the built-in ``ImagesPipeline`` so that every downloaded image is
    renamed after the streamer's nickname.
    """

    # Image storage directory as configured in the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    # Characters replaced in a nickname before it is used as a file name:
    # path separators and other characters commonly rejected by file systems.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``."""
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Rename the downloaded image after the nickname and record its path.

        ``results`` is a list of ``(success, detail)`` pairs, e.g.::

            [(True,
              {'checksum': '2b00042f7481c7b056c4b410d28f33cf',
               'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg',
               'url': 'http://www.example.com/files/product1.pdf'}),
             (False,
              Failure(...))]

        Args:
            results: Download outcomes for the requests of this item.
            item: The item being processed; ``item['nickname']`` names the file.
            info: Pipeline bookkeeping object (unused here).

        Returns:
            The item, with ``image_paths`` set to the renamed file's path.

        Raises:
            DropItem: If no image was downloaded successfully.
        """
        image_paths = [detail['path'] for ok, detail in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")

        # The nickname comes from the remote API, so neutralise characters
        # that could escape the target directory or be illegal in file names.
        safe_name = ''.join(
            '_' if ch in self._UNSAFE_FILENAME_CHARS else ch
            for ch in item["nickname"]
        )
        source = os.path.join(self.IMAGES_STORE, image_paths[0])
        target = os.path.join(self.IMAGES_STORE, 'full', safe_name + '.jpg')
        # os.replace overwrites an existing target on every platform, whereas
        # os.rename raises on Windows when the file is already there.
        os.replace(source, target)
        # Store the full path including the extension so it points at the file.
        item['image_paths'] = target
        return item
settings
# Image storage directory (absolute or relative path).
IMAGES_STORE = 'data/斗鱼主播图片/'

# Enable the pipeline that extends the built-in image-download pipeline.
ITEM_PIPELINES = {'myscrapy.pipelines.DouyuzhuboPipeline': 300}
运行结果
上一篇: js生成随机数