scrapy-redis: a distributed Bilibili user crawler
In Scrapy, every requested URL gets a fingerprint, which is what decides whether that URL has been requested before. Fingerprinting is on by default, so each URL is fetched only once. When we crawl from multiple machines in a distributed setup, we still need a fingerprint so the crawled data is not duplicated, but Scrapy keeps its fingerprints locally by default. So instead we can store the fingerprints in Redis and use a Redis set to check for duplicates.
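Before the configuration, here is a minimal sketch of that idea: hash the request into a fingerprint and SADD it into a shared Redis set; if SADD returns 0, the fingerprint was already there and the request is a duplicate. The key name and the hashing below are illustrative only; the real implementation is scrapy_redis.dupefilter.RFPDupeFilter, which uses Scrapy's own request fingerprint.

# Illustrative sketch only; scrapy_redis.dupefilter.RFPDupeFilter does the
# real work, and the key name 'bilibili:dupefilter' here is made up.
import hashlib

import redis

r = redis.StrictRedis.from_url('redis://@127.0.0.1:6379')

def seen_before(method, url, body=b''):
    fp = hashlib.sha1()
    fp.update(method.encode())
    fp.update(url.encode())
    fp.update(body)
    # SADD returns 1 if the member is new, 0 if it already exists
    return r.sadd('bilibili:dupefilter', fp.hexdigest()) == 0

print(seen_before('GET', 'https://api.bilibili.com/x/relation/stat?vmid=1'))  # False (first time)
print(seen_before('GET', 'https://api.bilibili.com/x/relation/stat?vmid=1'))  # True (duplicate)

Because the set lives in Redis rather than on one machine's disk, every worker sees the same "already requested" state.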
settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for bilibili project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'bilibili'

SPIDER_MODULES = ['bilibili.spiders']
NEWSPIDER_MODULE = 'bilibili.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bilibili (+http://www.yourdomain.com)'

# Obey robots.txt rules
#ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'bilibili.middlewares.BilibiliSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'bilibili.middlewares.BilibiliDownloaderMiddleware': 543,
    'bilibili.middlewares.RandomUserAgentMiddleware': 400,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'bilibili.pipelines.BilibiliPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# scrapy-redis: put the scheduler queue and the dupefilter fingerprint set
# in Redis so that every worker shares one crawl state
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
REDIS_URL = 'redis://@127.0.0.1:6379'
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
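With these four settings, the request queue, the fingerprint set, and (via RedisPipeline) the scraped items all live in the shared Redis instance, so any number of machines running the same project cooperate on one crawl. Assuming scrapy-redis's default key layout, where keys are derived from the spider name ('bilibiliapp' below), you can watch progress from Python:

# Assumes the default scrapy-redis key names; adjust if you override them.
import redis

r = redis.StrictRedis.from_url('redis://@127.0.0.1:6379')
print(r.zcard('bilibiliapp:requests'))    # pending requests (PriorityQueue is a zset)
print(r.scard('bilibiliapp:dupefilter'))  # fingerprints of requests seen so far
print(r.llen('bilibiliapp:items'))        # items serialized by RedisPipeline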
spider.py
# -*- coding: utf-8 -*-
import scrapy
import json
import re

from bilibili.items import BilibiliItem


class BilibiliappSpider(scrapy.Spider):
    name = 'bilibiliapp'
    # allowed_domains = ['www.bilibili.com']
    # start_urls = ['http://www.bilibili.com/']

    def start_requests(self):
        for i in range(1, 300):
            url = 'https://api.bilibili.com/x/relation/stat?vmid={}&jsonp=jsonp&callback=__jp3'.format(i)
            url_ajax = 'https://space.bilibili.com/{}/'.format(i)
            # Plain GET: scrapy.Request(url=..., callback=...); the Referer
            # header makes the request look like it came from the space page
            req = scrapy.Request(url=url, callback=self.parse, meta={'id': i})
            req.headers['Referer'] = url_ajax
            yield req

    def parse(self, response):
        # The API returns JSONP ("__jp3({...})"); strip the callback wrapper
        comm = re.compile(r'({.*})')
        text = re.findall(comm, response.text)[0]
        data = json.loads(text)
        follower = data['data']['follower']
        following = data['data']['following']
        id = response.meta.get('id')
        url = 'https://space.bilibili.com/ajax/member/getsubmitvideos?mid={}&page=1&pagesize=25'.format(id)
        yield scrapy.Request(url=url, callback=self.getsubmit, meta={
            'id': id,
            'follower': follower,
            'following': following,
        })

    def getsubmit(self, response):
        data = json.loads(response.text)
        # tlist lists the categories this user has uploaded videos in
        tlist = data['data']['tlist']
        tlist_list = []
        if tlist != []:
            for tils in tlist.values():
                tlist_list.append(tils['name'])
        else:
            tlist_list = ['无爱好']  # user has no upload categories
        follower = response.meta.get('follower')
        following = response.meta.get('following')
        id = response.meta.get('id')
        url = 'https://api.bilibili.com/x/space/acc/info?mid={}&jsonp=jsonp'.format(id)
        yield scrapy.Request(url=url, callback=self.space, meta={
            'id': id,
            'follower': follower,
            'following': following,
            'tlist_list': tlist_list,
        })

    def space(self, response):
        data = json.loads(response.text)
        name = data['data']['name']
        sex = data['data']['sex']
        level = data['data']['level']
        birthday = data['data']['birthday']
        tlist_list = response.meta.get('tlist_list')
        # One 0/1 flag per Bilibili upload category
        categories = {
            '动画': 'animation', '生活': 'life', '音乐': 'music',
            '游戏': 'game', '舞蹈': 'dance', '纪录片': 'documentary',
            '鬼畜': 'ghost', '科技': 'science', '番剧': 'opera',
            '娱乐': 'entertainment', '影视': 'movies', '国创': 'national',
            '数码': 'digital', '时尚': 'fashion',
        }
        flags = {field: 0 for field in categories.values()}
        for tlist in tlist_list:
            if tlist in categories:
                flags[categories[tlist]] = 1

        item = BilibiliItem()
        item['name'] = name
        item['sex'] = sex
        item['level'] = level
        item['birthday'] = birthday
        item['follower'] = response.meta.get('follower')
        item['following'] = response.meta.get('following')
        for field, value in flags.items():
            item[field] = value
        yield item
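The original post does not show items.py; a sketch inferred from the field names the spider populates would look like this:

# items.py -- inferred from the spider above, not shown in the original post
import scrapy


class BilibiliItem(scrapy.Item):
    name = scrapy.Field()
    sex = scrapy.Field()
    level = scrapy.Field()
    birthday = scrapy.Field()
    follower = scrapy.Field()
    following = scrapy.Field()
    animation = scrapy.Field()
    life = scrapy.Field()
    music = scrapy.Field()
    game = scrapy.Field()
    dance = scrapy.Field()
    documentary = scrapy.Field()
    ghost = scrapy.Field()
    science = scrapy.Field()
    opera = scrapy.Field()
    entertainment = scrapy.Field()
    movies = scrapy.Field()
    national = scrapy.Field()
    digital = scrapy.Field()
    fashion = scrapy.Field()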
Setting up a User-Agent pool (middlewares.py)
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random


class RandomUserAgentMiddleware(UserAgentMiddleware):

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random User-Agent for every outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
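A quick way to sanity-check the middleware without starting a full crawl: process_request never touches the spider argument, so passing None is fine here.

# Standalone check of the User-Agent pool middleware
from scrapy.http import Request

from bilibili.middlewares import RandomUserAgentMiddleware

mw = RandomUserAgentMiddleware()
req = Request('https://api.bilibili.com/x/relation/stat?vmid=1')
mw.process_request(req, spider=None)
print(req.headers['User-Agent'])  # one of the strings above, as bytes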
Git repository: https://github.com/18370652038/scrapy-bilibili