Crawling Desktop Wallpaper Images with the Scrapy Framework
Target data: ZOL desktop wallpapers, every image in each album across the 19 listing pages under the [风景 (scenery)] [1920*1080] category.
items.py
import scrapy


class Zol2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

    image_title = scrapy.Field()
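The image_urls and images fields are not arbitrary names: they are the defaults that Scrapy's built-in ImagesPipeline reads URLs from and writes download results to (configurable through the IMAGES_URLS_FIELD and IMAGES_RESULT_FIELD settings). image_title is a custom field added here so the pipeline can name each saved file.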
pipelines.py
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    # num = 1
    def get_media_requests(self, item, info):
        image_url = item["image_urls"]
        if image_url:
            # self.num + 1
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
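Two notes on this pipeline. The block between the "start/end of deprecation warning block" comments is the body of file_path() from the Scrapy 1.x ImagesPipeline source; the only custom logic is the final return, which names each file after the item's image_title. get_media_requests() is overridden because this project stores a single URL string in image_urls and because the item has to travel along in request.meta so that file_path() can read the title. If image_urls held a list of URLs instead, a sketch like the following would be enough; this is a hypothetical variant under the same Scrapy 1.x assumption, not the code used in the post:

from scrapy.pipelines.images import ImagesPipeline


class TitleNamedImagesPipeline(ImagesPipeline):
    """Hypothetical variant that keeps the stock behaviour of downloading
    every URL in item['image_urls'] (assumed to be a list here)."""

    def get_media_requests(self, item, info):
        # let the base class build the Requests, then attach the item
        # so file_path() can read image_title
        for request in super(TitleNamedImagesPipeline, self).get_media_requests(item, info):
            request.meta["item"] = item
            yield request

    def file_path(self, request, response=None, info=None):
        # same naming scheme as above: desk/<image_title>.jpg
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])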
middlewares.py
from scrapy import signals
from zol2.useragents import agents


class Zol2SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Zol2DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
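This middlewares.py is the unmodified template generated by scrapy startproject, apart from the added import of agents from zol2.useragents, a module the post never shows (presumably a list of User-Agent strings); nothing in the file actually uses it. A minimal sketch of how such a list is typically consumed, assuming agents really is a list of strings, might look like this (hypothetical, not the author's code):

import random

from zol2.useragents import agents  # assumed: a plain list of User-Agent strings


class RandomUserAgentMiddleware(object):
    """Hypothetical downloader middleware that rotates User-Agent headers."""

    def process_request(self, request, spider):
        # overwrite the User-Agent header of every outgoing request
        request.headers['User-Agent'] = random.choice(agents)
        return None

To take effect, such a middleware would have to be registered under DOWNLOADER_MIDDLEWARES in settings.py, which is still commented out in this project.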
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for zol2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zol2'

SPIDER_MODULES = ['zol2.spiders']
NEWSPIDER_MODULE = 'zol2.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'zol2.pipelines.Zol2Pipeline': 300,
}
IMAGES_STORE = "/home/pyvip/env_spider/zol2/zol2/images"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
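The non-default entries here are a real browser USER_AGENT, a 0.5 s DOWNLOAD_DELAY, ITEM_PIPELINES activating the custom pipeline, and IMAGES_STORE, the root directory for downloaded images. The relative path returned by file_path() is joined onto IMAGES_STORE, so the first picture ends up at /home/pyvip/env_spider/zol2/zol2/images/desk/1.jpg.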
pazol2.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zol2.items import Zol2Item


class Pazol2Spider(CrawlSpider):
    name = 'pazol2'
    # allowed_domains = ['desk.zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/fengjing/1920x1080/']
    front_url = "http://desk.zol.com.cn"
    num = 1

    rules = (
        # 1. Follow the pagination links of the category listing
        Rule(LinkExtractor(allow=r'/fengjing/1920x1080/[0-1]?[0-9]?.html'), callback='parse_album', follow=True),
        # 2. Enter the page of every single image inside each album
        Rule(LinkExtractor(allow=r'/bizhi/\d+_\d+_\d+.html',
                           restrict_xpaths=("//div[@class='main']/ul[@class='pic-list2 clearfix']/li",
                                            "//div[@class='photo-list-box']")),
             follow=True),
        # 3. Follow each image's 1920*1080 button to reach the full-size picture page
        Rule(LinkExtractor(allow=r'/showpic/1920x1080_\d+_\d+.html'), callback='get_img', follow=True),
    )

    def get_img(self, response):
        item = Zol2Item()
        item['image_urls'] = response.xpath("//body/img[1]/@src").extract_first()
        item['image_title'] = str(self.num)
        self.num += 1
        yield item
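With everything in place, the crawl is started from the project root with scrapy crawl pazol2. As an alternative, here is a small sketch of launching the same spider from a Python script, assuming the spider file lives at zol2/spiders/pazol2.py (the post does not show the exact layout):

# run_pazol2.py - hypothetical launcher script, run from the project root
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zol2.spiders.pazol2 import Pazol2Spider

if __name__ == "__main__":
    # load settings.py (USER_AGENT, DOWNLOAD_DELAY, ITEM_PIPELINES, IMAGES_STORE, ...)
    process = CrawlerProcess(get_project_settings())
    process.crawl(Pazol2Spider)
    process.start()  # blocks until the crawl is finished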
Crawl results
In total, 4,517 images were crawled in 108 minutes.
I put them in my desktop wallpaper library and have it switch to a new one every half hour. Very satisfying.