15Python爬虫---爬虫定向爬取腾讯视频网---利刃出击评论
程序员文章站
2024-03-25 18:25:34
...
先贴上代码,后面补充上解析。代码缺陷:没有对评论的回复进行处理爬取
import urllib.request
import http.cookiejar
import re
# ---------- Attribute container --------------------------------------------
class point():
    """Empty container class; instances carry ad-hoc attributes (userid, username)."""
    pass
# ---------------------------------------------------------------------------
# ---------- Emoji removal ----------------------------------------------------
# Matches emoji in BOTH forms that can appear here:
#   * unpaired UTF-16 surrogate halves — what `str.decode('unicode_escape')`
#     produces from the API's \uXXXX-escaped JSON (the original only covered
#     these), and
#   * real astral-plane codepoints — which the original surrogate ranges can
#     never match on Python 3, so they are added below.
# Also strips literal "\n" sequences left in the text.
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"    # emoticons (surrogate-pair form)
    u"(\ud83c[\udf00-\uffff])|"    # symbols & pictographs, 1 of 2 (surrogates)
    u"(\ud83d[\u0000-\uddff])|"    # symbols & pictographs, 2 of 2 (surrogates)
    u"(\ud83d[\ude80-\udeff])|"    # transport & map symbols (surrogates)
    u"(\ud83c[\udde0-\uddff])|"    # flags / regional indicators (surrogates)
    u"([\U0001F300-\U0001FAFF])|"  # same emoji blocks as real codepoints
    u"([\U0001F1E6-\U0001F1FF])|"  # regional-indicator flags as codepoints
    u"(\\n)"
    "+", flags=re.UNICODE)

def remove_emoji(text):
    """Return *text* with emoji (surrogate or codepoint form) and newlines removed."""
    return emoji_pattern.sub(r'', text)
# -----------------------------------------------------------------------------
# Video id of the target show ("利刃出击" / "Sharp Sword")
vid = "2457683703"
# Paging cursor: id of the comment after which the next page starts ("0" = first page)
comid = "0"
# Number of comments requested per page (kept as a string; it is concatenated into the URL)
num = "10"
# Request headers that impersonate a desktop Chrome browser.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    # BUGFIX: the original sent "gb2312, utf-8" here, but Accept-Encoding
    # carries content-codings (gzip/deflate/identity), not charsets.  A
    # compliant server could have replied compressed and broken the raw
    # read below; "identity" explicitly requests an uncompressed body.
    "Accept-Encoding": "identity",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "Host": "video.coral.qq.com",
}
# Cookie-aware opener so any session cookie persists across page requests.
cjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
# Attach the spoofed browser headers to every request made through urlopen().
# (list(dict.items()) replaces the original manual tuple-append loop.)
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)
# Fetch one page of comments.
def craw(vid, comid):
    """Fetch one comment page for video *vid*, starting after comment id *comid*.

    Page size comes from the module-level `num`.  Returns the response body
    decoded with 'unicode_escape', since the API returns JSON whose non-ASCII
    text is \\uXXXX-escaped.
    """
    url = ("https://video.coral.qq.com/varticle/" + vid + "/comment/v2?orinum="
           + num + "&cursor=" + comid + "&pageflag=1&oriorder=o")
    # BUGFIX: close the response when done — the original never closed it,
    # leaking the connection on every page.
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode('unicode_escape')
# Regexes applied to the decoded JSON text (the script scrapes the JSON with
# regular expressions instead of parsing it).
useridpat = '"userid":"(.*?)","content":'   # author userid of each comment
idpat = '"id":"(.*?)"'                      # comment ids (last one becomes the next-page cursor)
userpat = '{"userid":"(.*?)","head":'       # deliberately over-captures: yields 'userid","nick":"nickname'
conpat = '"content":"(.*?)"'                # comment body text
print("-------------------------利剑出击---------------------------")
# Scrape 9 pages of comments, advancing the cursor after each page.
for page in range(1, 10):
    print("------------------------------------------------------------")
    print("第 " + str(page) + " 页评论内容")
    # Pull the page and regex out all fields of interest.
    data = craw(vid, comid)
    useridlist = re.compile(useridpat, re.S).findall(data)  # comment author ids
    idlist = re.compile(idpat, re.S).findall(data)          # comment ids (paging cursor source)
    userlist = re.compile(userpat, re.S).findall(data)      # 'userid","nick":"nickname' blobs
    conlist = re.compile(conpat, re.S).findall(data)        # comment bodies

    # Turn each captured blob into a point() with .userid / .username.
    # (The original's inner `for j in range(len(user))` loop re-assigned the
    # same two attributes once per split part for no effect; dropped.)
    uselist = []
    for blob in userlist:
        parts = blob.split('","nick":"')
        if len(parts) < 2:
            continue  # malformed record — skip instead of raising IndexError
        obj = point()
        obj.userid = parts[0]
        obj.username = remove_emoji(parts[1])  # strip emoji / newlines from the nickname
        uselist.append(obj)

    # Print each comment together with its author's nickname.
    # BUGFIX: bounds were hard-coded to 10; a short page raised IndexError.
    for k in range(min(len(conlist), len(useridlist))):
        for obj in uselist:
            if obj.userid == useridlist[k]:
                # BUGFIX: the original round-tripped the text through
                # eval('u"..."'), which is unsafe on scraped input (a quote or
                # backslash in a nickname executes arbitrary code) and
                # unnecessary — the data is already decoded.  Print directly.
                print("用户名是:" + obj.username)
                print("评论内容是:" + remove_emoji(conlist[k]))
                print("\n")
                break

    if not idlist:
        break  # no comments returned — nothing to page past
    # BUGFIX: cursor was hard-coded idlist[9]; use the last id on the page.
    comid = idlist[-1]
结果展示