哔哩哔哩视频信息爬虫(实时爬取)
程序员文章站
2022-05-29 13:06:08
...
结合 哔哩哔哩小助手程序
爬取思路:
自定义模块构建及框架设计:
文件目录:
__init__.py:
#__init__
"""
Reference: layout of the JSON payload returned by the Bilibili detail API.

videoinfo = [
    data['aid'],       # AV number (video id)
    data['view'],      # play count
    data['like'],      # like count
    data['favorite'],  # favourite (bookmark) count
    data['share'],     # share count
    data['reply'],     # comment count
    data['danmaku'],   # bullet-chat (danmaku) count
    data['coin'],      # coin count
    data['title'],     # video title
    data['tname'],     # category name
]
"""
# Shared request headers: spoof a desktop Chrome user agent so the API
# does not reject the crawler outright. Read-only; shared by all modules.
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}
WebDownloader模块:(请求并加载网页模块)
#WebDownloader
import requests
from BilibiliSpider import headers
class WebDownloader:
    """Fetch a URL and decode its body as JSON (request/download layer)."""

    def __init__(self, headers=headers, timeout=6):
        # `headers` defaults to the module-level shared header dict; it is
        # only read, never mutated, so sharing the default is safe.
        # (The original also had a no-op `global headers` in the class body.)
        self.headers = headers
        self.timeout = timeout

    def getJsonWeb(self, url):
        """Fetch *url* and return the parsed JSON object.

        Returns the string "error" on any request or decoding failure —
        kept for backward compatibility with callers that compare against
        that sentinel.
        """
        try:
            r = requests.get(url, headers=self.headers, timeout=self.timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.json()
        except (requests.RequestException, ValueError):
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer swallowed. ValueError covers a non-JSON body.
            return "error"
JsonParse模块:(网页内容解析)
#JsonParse
#
import json
from BilibiliSpider import WebDownloader
import threading
class JsonParse:
    """Parse JSON pages returned by the Bilibili `view/detail` API."""

    def __init__(self, total=1, lock=None):
        # BUG FIX: the original signature was `lock=threading.Lock()`, which
        # creates ONE lock at definition time, silently shared by every
        # instance constructed with the default (mutable-default pitfall).
        # Each instance now gets its own lock unless the caller supplies one.
        self.lock = lock if lock is not None else threading.Lock()
        self.total = total

    def parseStat(self, dict_json, jsonPage):
        """Record av-number -> [title, category] into *dict_json*.

        Returns "" on success (kept for backward compatibility with the
        original); returns None and does nothing when the page is an error
        payload or lacks the expected shape.
        """
        try:
            View = jsonPage['data']['View']
            aid = View['aid']
            sort = View['tname']
            title = View['title']
        except (KeyError, TypeError):
            # Best-effort: malformed page, skip it (original used bare except).
            return
        if aid is not None:
            # Guard the mutation itself — the original acquired the lock
            # only AFTER writing to the shared dict, which protected nothing.
            with self.lock:
                dict_json[aid] = [title, sort]
            return ""

    def parseJsonImage(self, jsonPage):
        """Return the cover-picture URL of the video, or None if absent."""
        try:
            return jsonPage['data']['View']['pic']
        except (KeyError, TypeError):
            return None

    def parseJsonList(self, dict_json, jsonPage):
        """Fill *dict_json* with human-readable video statistics, in place.

        Used for the real-time crawl. All lookups happen before any write so
        a malformed page leaves *dict_json* untouched (as in the original).
        """
        try:
            View = jsonPage['data']['View']
            aid = View['aid']
            sort = View['tname']
            detail = View['desc']
            title = View['title']
            Stat = View['stat']
            play = Stat['view']
            like = Stat['like']
            collect = Stat['favorite']
            share = Stat['share']
            reply = Stat['reply']
            danmaku = Stat['danmaku']
            coin = Stat['coin']
        except (KeyError, TypeError):
            return
        # NOTE: the Chinese keys below are consumed downstream — keep as-is.
        dict_json['视频名称:'] = title
        dict_json['AV号:'] = aid
        dict_json['分类:'] = sort
        dict_json['视频简介:'] = detail
        dict_json['播放量:'] = play
        dict_json['点赞:'] = like
        dict_json['收藏:'] = collect
        dict_json['转发:'] = share
        dict_json['评论:'] = reply
        dict_json['弹幕:'] = danmaku
        dict_json['硬币:'] = coin
UrlFactory模块:(api-url工厂,获取对应标题的API链接)
#UrlFactory
"""
detail? :https://api.bilibili.com/x/web-interface/view/detail?&aid=77515252
stat? :https://api.bilibili.com/x/web-interface/archive/stat?aid=11111111
"""
#api_urlStat = 'https://api.bilibili.com/x/web-interface/archive/stat?aid='
import json
class UrlFactory:
    """Build detail-API URLs from the av-number index stored in href.json."""

    def __init__(self, api_urlDetail='https://api.bilibili.com/x/web-interface/view/detail?&aid='):
        self.api_urlDetail = api_urlDetail

    def getUrlJson(self, title):
        """Look up *title* in href.json and return its detail-API URL.

        href.json maps av-number -> [title, category]. Returns None when
        the title is not found — the original left `apiUrl` unbound in
        that case and crashed with UnboundLocalError.
        """
        # Explicit encoding: titles are Chinese text and the platform
        # default encoding is not guaranteed to be UTF-8.
        with open('href.json', mode='r', encoding='utf-8') as fjson:
            data = json.load(fjson)
        for aid, info in data.items():
            if info[0] == title:
                return self.api_urlDetail + aid
        return None
主函数A:
# -- main A: harvest av-number -> [title, category] into href.json ----------
total = 0
dict_json = {}
v = videoInfoSpider()
print('开始爬取apiUrl...')
for i in range(1, 2019):
    # BUG FIX: the original reset `start = 10000` on EVERY iteration, so the
    # same 10 000 aids were crawled 2 018 times. Advance the window instead.
    start = i * 10000
    urls = [
        "https://api.bilibili.com/x/web-interface/view/detail?&aid={}".format(j)
        for j in range(start, start + 10000)
    ]
    with futures.ThreadPoolExecutor(64) as executor:
        executor.map(v.apiUrlCrawl, urls)
    print(total)  # progress: number of completed 10k-aid batches
    total += 1
# 'w', not 'a': appending a second dump would leave the file as two
# concatenated JSON documents, which json.load cannot parse.
with open('href.json', 'w', encoding='utf-8') as fjson:
    fjson.write(json.dumps(dict_json, ensure_ascii=False, indent=4))
print("爬取结束!")
首先运行主函数A,得到一个json文件,作为后续实时爬取API
紧接着UrlFactory模块的作用就来了,调用UrlFactory中的 getUrlJson()即可获得对应搜索标题的视频信息URL,根据URL请求网页,最后调用JsonParse模块即可得到相应的信息啦