
Crawling desktop wallpaper images with the Scrapy framework


Target data: images from every album on all 19 pages of the [风景 (landscape)] [1920*1080] category of the ZOL desktop wallpaper site.

items.py

import scrapy


class Zol2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

    image_title = scrapy.Field()
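
For context, a minimal sketch (not part of the original post) of how the spider below fills this item. Note that image_urls holds a single URL string rather than the list the stock ImagesPipeline expects, which is why get_media_requests is overridden in the pipeline:

# Illustrative only; the URL is made up.
from zol2.items import Zol2Item

item = Zol2Item()
item["image_urls"] = "https://example.com/wallpaper_1920x1080.jpg"  # single URL string, not a list
item["image_title"] = "1"  # later used as the saved file name: desk/1.jpg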

 

pipelines.py 

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    # num = 1
    def get_media_requests(self, item, info):
        image_url = item["image_urls"]
        if image_url:
            # self.num + 1
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])
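
The middle of file_path above is a deprecation shim copied from an older Scrapy release and can usually be dropped on current versions. A minimal sketch of an equivalent pipeline, assuming a reasonably recent Scrapy (the keyword-only item parameter was added to file_path in Scrapy 2.4):

from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class Zol2Pipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Build one download request per item; the item travels along in meta
        # so file_path can read image_title back out of it.
        image_url = item["image_urls"]
        if image_url:
            yield Request(url=image_url, meta={"item": item})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save each image as desk/<image_title>.jpg under IMAGES_STORE.
        return 'desk/{}.jpg'.format(request.meta["item"]["image_title"])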

 

middlewares.py

from scrapy import signals
from zol2.useragents import agents


class Zol2SpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class Zol2DownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
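
Both classes above are the unmodified Scrapy project templates, and they stay disabled in settings.py. The agents import suggests a random User-Agent middleware was planned; a hedged sketch of what that could look like, assuming zol2/useragents.py (not shown in this post) defines agents as a plain list of User-Agent strings and the middleware is registered in DOWNLOADER_MIDDLEWARES:

import random

from zol2.useragents import agents  # assumed: a plain list of UA strings


class RandomUserAgentMiddleware(object):
    # Hypothetical example, not part of the original project.
    def process_request(self, request, spider):
        # Pick a different User-Agent for every outgoing request.
        request.headers["User-Agent"] = random.choice(agents)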

 

settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for zol2 project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'zol2'

SPIDER_MODULES = ['zol2.spiders']
NEWSPIDER_MODULE = 'zol2.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'zol2.middlewares.Zol2DownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'zol2.pipelines.Zol2Pipeline': 300,
}
IMAGES_STORE = "/home/pyvip/env_spider/zol2/zol2/images"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
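
With ITEM_PIPELINES and IMAGES_STORE set, the crawl can be started from the project root (the directory containing scrapy.cfg). A small sketch using Scrapy's command-line helper, equivalent to running scrapy crawl pazol2 in a shell (the run.py file name is just a suggestion, not part of the original post):

# run.py, placed next to scrapy.cfg
from scrapy.cmdline import execute

execute(["scrapy", "crawl", "pazol2"])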

 

pazol2.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from zol2.items import Zol2Item


class Pazol2Spider(CrawlSpider):
    name = 'pazol2'
    # allowed_domains = ['desk.zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/fengjing/1920x1080/']
    front_url = "http://desk.zol.com.cn"
    num = 1

    rules = (
        # 1. Follow the category's pagination links (follow only, no callback)
        Rule(LinkExtractor(allow=r'/fengjing/1920x1080/[0-1]?[0-9]?.html'), follow=True),
        # 2. Enter every image page of each album
        Rule(LinkExtractor(allow=r'/bizhi/\d+_\d+_\d+.html', restrict_xpaths=("//div[@class='main']/ul[@class='pic-list2  clearfix']/li", "//div[@class='photo-list-box']")), follow=True),
        # 3. Follow each image's 1920*1080 link to reach the full-size page
        Rule(LinkExtractor(allow=r'/showpic/1920x1080_\d+_\d+.html'), callback='get_img', follow=True),
    )

    def get_img(self, response):
        item = Zol2Item()
        item['image_urls'] = response.xpath("//body/img[1]/@src").extract_first()
        item['image_title'] = str(self.num)
        self.num += 1
        yield item
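
The pagination pattern in the first rule is terse; a quick illustration (not from the original post, the album path is made up) of which paths it does and does not match:

import re

pagination = re.compile(r'/fengjing/1920x1080/[0-1]?[0-9]?.html')

# Category pages 2.html through 19.html match; album pages do not.
print(bool(pagination.search('/fengjing/1920x1080/2.html')))    # True
print(bool(pagination.search('/fengjing/1920x1080/19.html')))   # True
print(bool(pagination.search('/bizhi/9109_112583_2.html')))     # False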

 

Crawl results


 

A total of 4,517 images were crawled, taking 108 minutes.

I put them in my desktop wallpaper library, switching to a new one every half hour. Very satisfying.