爬虫(十七):Scrapy框架(四) 对接selenium爬取京东商品数据
1. scrapy对接selenium
Scrapy抓取页面的方式和requests库类似,都是直接模拟HTTP请求,因此Scrapy也不能抓取JavaScript动态渲染的页面。在前面的博客中,抓取JavaScript渲染的页面有两种方式:一种是分析Ajax请求,找到其对应的接口抓取,Scrapy同样可以用此种方式抓取;另一种是直接用Selenium模拟浏览器进行抓取,我们不需要关心页面后台发生的请求,也不需要分析渲染过程,只需要关心页面最终结果即可,可见即可爬。那么,如果Scrapy可以对接Selenium,那Scrapy就可以处理任何网站的抓取了。
1.1 新建项目
scrapy startproject scrapyseleniumtest
scrapy genspider jd www.jd.com
# settings.py: Scrapy only loads UPPER-CASE names from the settings module.
ROBOTSTXT_OBEY = False
1.2 定义item
初步实现Spider的start_requests()方法。
# -*- coding: utf-8 -*-
from urllib.parse import quote

from bs4 import BeautifulSoup
from scrapy import Request, Spider


class JdSpider(Spider):
    """Spider that requests JD.com search result pages for configured keywords.

    Reads the ``KEYWORDS`` (list of search terms) and ``MAX_PAGE`` (number of
    result pages per keyword) settings.
    """

    name = 'jd'
    # Requests go to search.jd.com, so allow the whole jd.com domain.
    allowed_domains = ['jd.com']
    # NOTE(review): path casing restored from a case-folded copy -- confirm.
    base_url = 'https://search.jd.com/Search?keyword='

    def start_requests(self):
        """Yield one request per (keyword, page) pair.

        Every page of a keyword shares the same URL; the page number travels
        in ``request.meta['page']``, so ``dont_filter=True`` is required to
        keep the duplicate filter from dropping pages 2..N.
        """
        max_page = self.settings.getint('MAX_PAGE', 1)
        for keyword in self.settings.getlist('KEYWORDS'):
            url = self.base_url + quote(keyword)
            for page in range(1, max_page + 1):
                yield Request(url=url, callback=self.parse,
                              meta={'page': page}, dont_filter=True)
# settings.py: search terms and number of result pages to crawl per term.
KEYWORDS = ['ipad']
MAX_PAGE = 2
1.3 对接selenium
接下来我们需要处理这些请求的抓取。这次我们对接Selenium进行抓取,采用Downloader Middleware来实现。在Middleware中对接Selenium,获取页面源代码之后,构造HtmlResponse对象,直接返回给Spider解析页面、提取数据,并且不再执行下载器下载页面的动作。
class SeleniumMiddleware(object):
    """Downloader middleware that renders each request in Selenium-driven Chrome.

    Returning an ``HtmlResponse`` from ``process_request`` makes Scrapy skip
    its own downloader, so the spider parses the browser-rendered page source.
    """

    # Fallback (seconds) used when no timeout is configured.
    DEFAULT_TIMEOUT = 30

    # CSS selectors on the JD result page. Id casing was restored from a
    # case-folded copy of this code -- TODO confirm against the live page.
    PAGE_INPUT = '#J_bottomPage > span.p-skip > input'
    PAGE_SUBMIT = '#J_bottomPage > span.p-skip > a'
    CURRENT_PAGE = '#J_bottomPage > span.p-num > a.curr'
    GOODS_LIST = '#J_goodsList'

    def __init__(self, timeout=None):
        """Start Chrome and configure page-load and explicit-wait timeouts.

        :param timeout: seconds to wait for page loads and for elements;
            ``None`` falls back to ``DEFAULT_TIMEOUT``.
        """
        self.logger = getLogger(__name__)
        self.timeout = self.DEFAULT_TIMEOUT if timeout is None else timeout
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from settings and hook browser shutdown."""
        middleware = cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        # __del__ is not guaranteed to run; close the browser on spider close.
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Shut the browser down when the spider finishes."""
        self._quit_browser()

    def __del__(self):
        # Fallback for instances created without from_crawler().
        self._quit_browser()

    def _quit_browser(self):
        """Quit Chrome once; later calls are no-ops."""
        browser = getattr(self, 'browser', None)
        self.browser = None
        if browser is not None:
            # quit() ends the whole driver session; close() only closes a window.
            browser.quit()

    def _jump_to_page(self, page):
        """Type *page* into the pager's jump box and submit it."""
        page_input = self.wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, self.PAGE_INPUT)))
        submit = self.wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, self.PAGE_SUBMIT)))
        page_input.clear()
        page_input.send_keys(str(page))
        submit.click()

    def process_request(self, request, spider):
        """Load ``request.url`` in Chrome and return the rendered page.

        :param request: request whose ``meta['page']`` (default 1) selects
            the result page to navigate to.
        :param spider: spider that issued the request (unused).
        :return: ``HtmlResponse`` with status 200 and the page source, or an
            empty ``HtmlResponse`` with status 500 if a wait timed out.
        """
        page = request.meta.get('page', 1)
        self.logger.debug('Chrome is fetching %s (page %s)', request.url, page)
        try:
            self.browser.get(request.url)
            if page > 1:
                self._jump_to_page(page)
            # The highlighted pager entry must show the requested page ...
            self.wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, self.CURRENT_PAGE), str(page)))
            # ... and the product list must be present before reading source.
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, self.GOODS_LIST)))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
1.4 解析页面
def parse(self, response):
    """Yield one dict per product entry found in a rendered search page.

    Entries without a ``J_im_icon`` span are skipped, as are entries missing
    the SKU, title, price or comment node (instead of raising and aborting
    the rest of the page).

    :param response: response whose ``text`` is the rendered page HTML.
    :return: generator of dicts with keys ``dp``, ``title``, ``price``,
        ``comment``, ``url`` and ``type``.
    """
    # NOTE(review): the ``J_`` prefixes were restored from a case-folded copy
    # of this code -- confirm against the live page.
    soup = BeautifulSoup(response.text, 'lxml')
    for li in soup.find_all(name='li', class_='gl-item'):
        dp = li.find(name='span', class_='J_im_icon')
        if not dp:
            continue
        sku = li.attrs.get('data-sku')
        if not sku:
            continue
        title = li.find(name='div', class_='p-name p-name-type-2')
        price = li.find(name='strong', class_='J_' + sku)
        comment = li.find(name='a', id='J_comment_' + sku)
        if title is None or price is None or comment is None:
            self.logger.debug('Skipping incomplete entry for sku %s', sku)
            continue
        yield {
            'dp': dp.get_text().strip(),
            'title': title.get_text().strip(),
            'price': price.get_text(),
            'comment': comment.get_text() + '条评论',
            'url': 'https://item.jd.com/' + sku + '.html',
            'type': 'jingdong',
        }
1.5 储存结果
提取完页面数据之后,数据会发送到Item Pipeline处进行数据处理、清洗、入库等操作,所以我们此时需要定义项目管道。在此我们将数据存储在MongoDB数据库中。
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in one MongoDB collection."""

    def __init__(self, mongo_url, mongo_db, collection):
        """Remember the connection parameters; the client opens in open_spider.

        :param mongo_url: host or URI passed to ``pymongo.MongoClient``.
        :param mongo_db: name of the database to write to.
        :param collection: name of the collection to write to.
        """
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db
        self.collection = collection

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URL, MONGO_DB and COLLECTION settings.

        Scrapy calls this class method with the crawler, which exposes the
        project-wide configuration from settings.py.
        """
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            collection=crawler.settings.get('COLLECTION'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the database."""
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert *item* as a document and pass it on unchanged."""
        # Collection.insert() was removed in PyMongo 4; insert_one replaces it.
        self.db[self.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
1.6 配置settings文件
# Project-specific settings; Scrapy only loads UPPER-CASE names.
KEYWORDS = ['ipad']
MAX_PAGE = 2

MONGO_URL = 'localhost'
MONGO_DB = 'test'
COLLECTION = 'productitem'

SELENIUM_TIMEOUT = 30
以及修改配置项,激活下载器中间件和item pipeline。
# Activate the Selenium downloader middleware and the MongoDB item pipeline.
DOWNLOADER_MIDDLEWARES = {
    'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
}

ITEM_PIPELINES = {
    'scrapyseleniumtest.pipelines.MongoPipeline': 300,
}
1.7 执行结果
scrapy crawl jd
1.8 完整代码
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

from scrapy import Item, Field


class ProductItem(Item):
    """One product entry scraped from a JD search result page.

    The fields mirror the keys of the dicts the spider yields. An Item with
    no declared fields rejects every key, so they are declared here.
    """

    dp = Field()
    title = Field()
    price = Field()
    comment = Field()
    url = Field()
    type = Field()
# -*- coding: utf-8 -*-
from urllib.parse import quote

from bs4 import BeautifulSoup
from scrapy import Request, Spider


class JdSpider(Spider):
    """Spider that crawls JD.com search result pages for configured keywords.

    Reads the ``KEYWORDS`` (list of search terms) and ``MAX_PAGE`` (number of
    result pages per keyword) settings.
    """

    name = 'jd'
    # Requests go to search.jd.com, so allow the whole jd.com domain.
    allowed_domains = ['jd.com']
    # NOTE(review): path casing restored from a case-folded copy -- confirm.
    base_url = 'https://search.jd.com/Search?keyword='

    def start_requests(self):
        """Yield one request per (keyword, page) pair.

        Every page of a keyword shares the same URL; the page number travels
        in ``request.meta['page']``, so ``dont_filter=True`` is required to
        keep the duplicate filter from dropping pages 2..N.
        """
        max_page = self.settings.getint('MAX_PAGE', 1)
        for keyword in self.settings.getlist('KEYWORDS'):
            url = self.base_url + quote(keyword)
            for page in range(1, max_page + 1):
                yield Request(url=url, callback=self.parse,
                              meta={'page': page}, dont_filter=True)

    def parse(self, response):
        """Yield one dict per product entry found in a rendered search page.

        Entries without a ``J_im_icon`` span are skipped, as are entries
        missing the SKU, title, price or comment node (instead of raising
        and aborting the rest of the page).

        :param response: response whose ``text`` is the rendered page HTML.
        :return: generator of dicts with keys ``dp``, ``title``, ``price``,
            ``comment``, ``url`` and ``type``.
        """
        # NOTE(review): the ``J_`` prefixes were restored from a case-folded
        # copy of this code -- confirm against the live page.
        soup = BeautifulSoup(response.text, 'lxml')
        for li in soup.find_all(name='li', class_='gl-item'):
            dp = li.find(name='span', class_='J_im_icon')
            if not dp:
                continue
            sku = li.attrs.get('data-sku')
            if not sku:
                continue
            title = li.find(name='div', class_='p-name p-name-type-2')
            price = li.find(name='strong', class_='J_' + sku)
            comment = li.find(name='a', id='J_comment_' + sku)
            if title is None or price is None or comment is None:
                self.logger.debug('Skipping incomplete entry for sku %s', sku)
                continue
            yield {
                'dp': dp.get_text().strip(),
                'title': title.get_text().strip(),
                'price': price.get_text(),
                'comment': comment.get_text() + '条评论',
                'url': 'https://item.jd.com/' + sku + '.html',
                'type': 'jingdong',
            }
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import time
from logging import getLogger
from urllib.parse import urlencode

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class ScrapyseleniumtestSpiderMiddleware(object):
    """Pass-through spider middleware generated by the project template."""

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('spider opened: %s' % spider.name)


class SeleniumMiddleware(object):
    """Downloader middleware that renders each request in Selenium-driven Chrome.

    Returning an ``HtmlResponse`` from ``process_request`` makes Scrapy skip
    its own downloader, so the spider parses the browser-rendered page source.
    """

    # Fallback (seconds) used when no timeout is configured.
    DEFAULT_TIMEOUT = 30

    # CSS selectors on the JD result page. Id casing was restored from a
    # case-folded copy of this code -- TODO confirm against the live page.
    PAGE_INPUT = '#J_bottomPage > span.p-skip > input'
    PAGE_SUBMIT = '#J_bottomPage > span.p-skip > a'
    CURRENT_PAGE = '#J_bottomPage > span.p-num > a.curr'
    GOODS_LIST = '#J_goodsList'

    def __init__(self, timeout=None):
        """Start Chrome and configure page-load and explicit-wait timeouts.

        :param timeout: seconds to wait for page loads and for elements;
            ``None`` falls back to ``DEFAULT_TIMEOUT``.
        """
        self.logger = getLogger(__name__)
        self.timeout = self.DEFAULT_TIMEOUT if timeout is None else timeout
        self.browser = webdriver.Chrome()
        self.browser.set_window_size(1400, 700)
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from settings and hook browser shutdown."""
        middleware = cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'))
        # __del__ is not guaranteed to run; close the browser on spider close.
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Shut the browser down when the spider finishes."""
        self._quit_browser()

    def __del__(self):
        # Fallback for instances created without from_crawler().
        self._quit_browser()

    def _quit_browser(self):
        """Quit Chrome once; later calls are no-ops."""
        browser = getattr(self, 'browser', None)
        self.browser = None
        if browser is not None:
            # quit() ends the whole driver session; close() only closes a window.
            browser.quit()

    def _jump_to_page(self, page):
        """Type *page* into the pager's jump box and submit it."""
        page_input = self.wait.until(EC.presence_of_element_located(
            (By.CSS_SELECTOR, self.PAGE_INPUT)))
        submit = self.wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, self.PAGE_SUBMIT)))
        page_input.clear()
        page_input.send_keys(str(page))
        submit.click()

    def process_request(self, request, spider):
        """Load ``request.url`` in Chrome and return the rendered page.

        :param request: request whose ``meta['page']`` (default 1) selects
            the result page to navigate to.
        :param spider: spider that issued the request (unused).
        :return: ``HtmlResponse`` with status 200 and the page source, or an
            empty ``HtmlResponse`` with status 500 if a wait timed out.
        """
        page = request.meta.get('page', 1)
        self.logger.debug('Chrome is fetching %s (page %s)', request.url, page)
        try:
            self.browser.get(request.url)
            if page > 1:
                self._jump_to_page(page)
            # The highlighted pager entry must show the requested page ...
            self.wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, self.CURRENT_PAGE), str(page)))
            # ... and the product list must be present before reading source.
            self.wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, self.GOODS_LIST)))
            return HtmlResponse(url=request.url, body=self.browser.page_source,
                                request=request, encoding='utf-8', status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class MongoPipeline(object):
    """Item pipeline that stores every scraped item in one MongoDB collection."""

    def __init__(self, mongo_url, mongo_db, collection):
        """Remember the connection parameters; the client opens in open_spider.

        :param mongo_url: host or URI passed to ``pymongo.MongoClient``.
        :param mongo_db: name of the database to write to.
        :param collection: name of the collection to write to.
        """
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db
        self.collection = collection

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URL, MONGO_DB and COLLECTION settings.

        Scrapy calls this class method with the crawler, which exposes the
        project-wide configuration from settings.py.
        """
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            collection=crawler.settings.get('COLLECTION'),
        )

    def open_spider(self, spider):
        """Open the MongoDB client and select the database."""
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert *item* as a document and pass it on unchanged."""
        # Collection.insert() was removed in PyMongo 4; insert_one replaces it.
        self.db[self.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client."""
        self.client.close()
# -*- coding: utf-8 -*-

# Scrapy settings for scrapyseleniumtest project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#
# NOTE: Scrapy only loads UPPER-CASE names from this module.

BOT_NAME = 'scrapyseleniumtest'

SPIDER_MODULES = ['scrapyseleniumtest.spiders']
NEWSPIDER_MODULE = 'scrapyseleniumtest.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyseleniumtest (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyseleniumtest.middlewares.ScrapyseleniumtestSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapyseleniumtest.middlewares.SeleniumMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapyseleniumtest.pipelines.MongoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Project-specific settings
KEYWORDS = ['ipad']
MAX_PAGE = 2

MONGO_URL = 'localhost'
MONGO_DB = 'test'
COLLECTION = 'productitem'

SELENIUM_TIMEOUT = 30
