python爬虫Scrapy框架:媒体管道原理学习分析
程序员文章站
2024-01-07 21:23:52
目录二、imagespipeline类简介一、媒体管道1.1、媒体管道的特性媒体管道实现了以下特性: 避免重新下载最近下载的媒体 指定存储位置(文件系统目录,amazon s3 bucke...
一、媒体管道
1.1、媒体管道的特性
媒体管道实现了以下特性:
- 避免重新下载最近下载的媒体
- 指定存储位置(文件系统目录,amazon s3 bucket,谷歌云存储bucket)
图像管道具有一些额外的图像处理功能:
- 将所有下载的图片转换为通用格式(jpg)和模式(rgb)
- 生成缩略图
- 检查图像的宽度/高度,进行最小尺寸过滤
1.2、媒体管道的设置
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 120}  # 启用
FILES_STORE = '/path/to/valid/dir'  # 文件管道存放位置
IMAGES_STORE = '/path/to/valid/dir'  # 图片管道存放位置
FILES_URLS_FIELD = 'field_name_for_your_files_urls'  # 自定义文件url字段
FILES_RESULT_FIELD = 'field_name_for_your_processed_files'  # 自定义结果字段
IMAGES_URLS_FIELD = 'field_name_for_your_images_urls'  # 自定义图片url字段
IMAGES_RESULT_FIELD = 'field_name_for_your_processed_images'  # 结果字段
FILES_EXPIRES = 90  # 文件过期时间 默认90天
IMAGES_EXPIRES = 90  # 图片过期时间 默认90天
IMAGES_THUMBS = {'small': (50, 50), 'big': (270, 270)}  # 缩略图尺寸
IMAGES_MIN_HEIGHT = 110  # 过滤最小高度
IMAGES_MIN_WIDTH = 110  # 过滤最小宽度
MEDIA_ALLOW_REDIRECTS = True  # 是否重定向
（注意:Scrapy 设置名区分大小写,必须使用全大写形式才会生效。）
二、ImagesPipeline类简介
# 解析settings里的配置字段
def __init__(self, store_uri, download_func=None, settings=None)
# 图片下载
def image_downloaded(self, response, request, info)
# 图片获取 图片大小的过滤 缩略图的生成
def get_images(self, response, request, info)
# 转化图片格式
def convert_image(self, image, size=None)
# 生成媒体请求 可重写
def get_media_requests(self, item, info)
    return [Request(x) for x in item.get(self.images_urls_field, [])]  # 得到图片url 变成请求 发给引擎
# 此方法获取文件名 进行改写
def item_completed(self, results, item, info)
# 文件路径
def file_path(self, request, response=None, info=None)
# 缩略图的存储路径
def thumb_path(self, request, thumb_id, response=None, info=None):
三、小案例:使用图片管道爬取百度图片
(当然不使用图片管道的话也是可以爬取百度图片的,但这还需要我们去分析网页的代码,还是有点麻烦,使用图片管道就可以省去这个步骤了)
3.1、spider文件
注意:由于需要添加所有的请求头,所以我们要重写start_requests函数
import re

import scrapy

from ..items import DbImgItem


class DbSpider(scrapy.Spider):
    """Spider that scrapes thumbnail image URLs from a Baidu image search
    results page and hands them to the ImagesPipeline via DbImgItem.
    """
    name = 'db'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111110&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%e7%8b%97&oq=%e7%8b%97&rsp=-1']

    def start_requests(self):
        # Baidu rejects bare requests, so we override start_requests to
        # attach a full browser-like header set to every initial request.
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-cn,zh;q=0.9",
            "cache-control": "max-age=0",
            "connection": "keep-alive",
            # NOTE(review): this cookie is a stale, session-specific value
            # copied from one browser session — replace with your own.
            "cookie": "bidupsid=4b61d634d704a324e3c7e274bf11f280; pstm=1624157516; baiduid=4b61d634d704a324c7ea5ba47ba5886e:fg=1; __yjs_duid=1_f7116f04cddf75093b9236654a2d70931624173362209; baiduid_bfess=101022aee931e08a9b9a3ba623709cfe:fg=1; bdorz=b490b5ebf6f3cd402e515d22bcda1598; bdrcvfr[dg2jnjb_ajr]=mk3slvn4hkm; cleanhistorystatus=0; h_ps_pssid=34099_33969_34222_31660_34226_33848_34113_34073_33607_34107_34134_34118_26350_22159; delper=0; psino=6; ba_hector=24ak842ka421210koq1gdtj070r; bdrcvfr[x_xkqks0s63]=mk3slvn4hkm; userfrom=www.baidu.com; firstshowtip=1; indexpagesuglist=%5b%22%e7%8b%97%22%2c%22%e7%8c%ab%e5%92%aa%22%2c%22%e5%b0%8f%e9%80%8f%e6%98%8e%22%5d; ab_sr=1.0.1_ogywmtzimjg5ztniymuxodixotgyytllzgmymzhjode2zwe5ogy4ymeyzwvjogzhowixm2nlm2fhztqxmmfjody0owzinzqxmjvlmwiyodvlzwfizjy2ntqymtzhy2njntm5ndnmytfmzjgxmtlkogyxytuzytizmza0nde3mgnmzdhkytbkzmjimmjhzmfkzdnmztm1zmi2mwzknzyyyq==",
            "host": "image.baidu.com",
            "referer": "https://image.baidu.com/",
            "sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            "sec-ch-ua-mobile": "?0",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
        }
        for url in self.start_urls:
            # dont_filter=True: the start URL must not be dropped by the
            # duplicate-request filter.
            yield scrapy.Request(url, headers=headers, callback=self.parse,
                                 dont_filter=True)

    def parse(self, response):
        # Thumbnail URLs are embedded in the page's inline JSON as
        # "thumbURL":"..." — the key is case-sensitive.
        img_urls = re.findall('"thumbURL":"(.*?)"', response.text)
        # 'image_urls' is the default field name the ImagesPipeline reads.
        item = DbImgItem()
        item['image_urls'] = img_urls
        yield item
3.2、items文件
import scrapy


class DbImgItem(scrapy.Item):
    """Item carrying image URLs for the ImagesPipeline.

    'image_urls' is the default field the pipeline reads
    (IMAGES_URLS_FIELD), so no extra setting is required.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
3.3、settings文件
# Scrapy settings are case-sensitive module-level constants; they must be
# UPPER_SNAKE_CASE or Scrapy silently ignores them.

# Ignore robots.txt so the image search pages can be fetched.
ROBOTSTXT_OBEY = False

# Enable our custom image pipeline.
ITEM_PIPELINES = {
    # 'dbimg.pipelines.DbImgPipeline': 300,
    'dbimg.pipelines.ImgPipe': 300,
}

# Directory where the ImagesPipeline stores downloaded images.
IMAGES_STORE = 'd:/python test/爬虫/scrapy6/dbimg/imgs'
3.4、pipelines文件
import os

from scrapy.pipelines.images import ImagesPipeline

# The bare 'import settings' in the original only works if the project dir
# happens to be on sys.path; import it through the project package instead.
from dbimg import settings

"""
Parent implementation being overridden, for reference:

def item_completed(self, results, item, info):
    with suppress(KeyError):
        ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
    return item
"""


class ImgPipe(ImagesPipeline):
    """ImagesPipeline subclass that renames downloaded images to
    sequential names (0.jpg, 1.jpg, ...) instead of the default
    hash-based filenames.
    """
    # Class-level counter shared by all items processed by this pipeline.
    num = 0

    def item_completed(self, results, item, info):
        """Rename each successfully downloaded image after download.

        `results` is a list of (success, info_dict) two-tuples; for
        successful downloads info_dict['path'] is the file path relative
        to IMAGES_STORE.  (Print `results` once to inspect its format.)
        """
        images_path = [x['path'] for ok, x in results if ok]
        for image_path in images_path:
            os.rename(settings.IMAGES_STORE + "/" + image_path,
                      settings.IMAGES_STORE + "/" + str(self.num) + ".jpg")
            self.num += 1
        # Return the item so any later pipeline component still receives it
        # (the original snippet dropped it, silently breaking the chain).
        return item
结果:
以上就是python爬虫scrapy框架:媒体管道原理学习分析的详细内容,更多关于python爬虫scrapy框架的资料请关注其它相关文章!