欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

15Python爬虫---爬虫定向爬取腾讯视频网---利刃出击评论

程序员文章站 2024-03-25 18:25:34
...

先贴上代码,后面再补充解析。代码缺陷:没有对评论的回复进行处理和爬取。

import urllib.request
import http.cookiejar
import re


# ---------- Plain attribute holder ----------
class point:
    """Empty record type.

    Instances start with no attributes; the script attaches ``userid`` and
    ``username`` to them after construction.
    """
# ---------- end of attribute holder ----------

# ---------- Emoji / newline stripping ----------
# The pattern operates on UTF-16 surrogate pairs: each alternative is one high
# surrogate (\ud83c or \ud83d) followed by a range of low surrogates.  Text that
# stores an emoji as a single non-BMP code point is NOT matched by it.
# Every second character class is restricted to the low-surrogate block
# (\udc00-\udfff) so that an ordinary character following a stray high
# surrogate is never removed together with it.
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\udfff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\udc00-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])|"  # flags (iOS)
    u"(\\n)"                     # line feeds (the trailing "+" applies to this group only)
    "+", flags=re.UNICODE)


def remove_emoji(text):
    """Return *text* with surrogate-pair emoji and line feeds removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which every match of ``emoji_pattern`` has been
        replaced by the empty string.
    """
    return emoji_pattern.sub(r'', text)
# ------------------------------------------------------------------

# Crawl parameters (all kept as strings because they are concatenated into the
# request URL).
vid: str = "2457683703"  # video id; the original comment names the show "利刃出击"
comid: str = "0"         # id of the comment to start from (the paging cursor)
num: str = "10"          # number of comments requested per page
# Header set sent with every request so that it looks like a browser request.
# NOTE(review): "Accept-Encoding" usually carries content codings (gzip,
# deflate, ...); the charset names below are reproduced unchanged.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gb2312, utf-8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
    "Host": "video.coral.qq.com",
}

# Cookie-aware opener: cookies are collected in ``cjar``.
cjar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))

# ``addheaders`` expects a list of (name, value) tuples.
headall = list(headers.items())
opener.addheaders = headall

# Make the opener the default used by ``urllib.request.urlopen``.
urllib.request.install_opener(opener)
# Fetch one page of comments.
def craw(vid, comid):
    """Download one page of comments and return it as text.

    The number of comments requested is read from the module-level ``num``.

    Args:
        vid: Video id as a string; inserted into the URL path.
        comid: Comment id used as the ``cursor`` query parameter.

    Returns:
        The response body decoded with the ``unicode_escape`` codec, so
        backslash escapes such as ``\\uXXXX`` in the body become characters.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on request failure.
    """
    url = (
        "https://video.coral.qq.com/varticle/" + vid
        + "/comment/v2?orinum=" + num
        + "&cursor=" + comid
        + "&pageflag=1&oriorder=o"
    )
    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    return payload.decode('unicode_escape')

# Regular expressions that pull individual fields out of the raw response text.
useridpat = '"userid":"(.*?)","content":'
idpat = '"id":"(.*?)"'
userpat = '{"userid":"(.*?)","head":'
conpat = '"content":"(.*?)"'

# Compile once; the same patterns are reused for every page.
_userid_re = re.compile(useridpat, re.S)
_id_re = re.compile(idpat, re.S)
_user_re = re.compile(userpat, re.S)
_content_re = re.compile(conpat, re.S)

# Separator between the user id and the nick inside one ``userpat`` match.
_NICK_SEPARATOR = '","nick":"'


def _unescape(text):
    """Interpret backslash escapes in *text* the way a ``u"..."`` literal would.

    SECURITY: the original code called ``eval`` on text taken from the remote
    response, which lets a crafted nick or comment run arbitrary code.
    ``ast.literal_eval`` processes the same string-literal escapes but refuses
    anything that is not a literal.  Text that does not form a valid string
    literal is returned unchanged instead of aborting the crawl.
    """
    import ast

    try:
        value = ast.literal_eval('u"' + text + '"')
    except (ValueError, SyntaxError):
        return text
    return value if isinstance(value, str) else text


# Comments handled per page: the same count that is requested from the server.
_page_size = int(num)

print("-------------------------利刃出击---------------------------")
for i in range(1, 10):  # fetch up to 9 pages of comments
    print("------------------------------------------------------------")
    print("第 " + str(i) + " 页评论内容")
    # Extract every field of interest from this page.
    data = craw(vid, comid)
    useridlist = _userid_re.findall(data)  # user id of each comment
    idlist = _id_re.findall(data)          # every "id" value in the page
    userlist = _user_re.findall(data)      # raw 'userid","nick":"nick' fragments
    conlist = _content_re.findall(data)    # comment bodies

    # Turn the raw user fragments into objects carrying ``userid`` and
    # ``username`` (emoji and line feeds stripped from the name).
    uselist = []
    for fragment in userlist:
        user = fragment.split(_NICK_SEPARATOR)
        if len(user) < 2:
            # No nick in this fragment; skip it rather than fail on user[1].
            continue
        obj = point()
        obj.userid = user[0]
        obj.username = remove_emoji(user[1])
        uselist.append(obj)

    # Print each comment with the name of its author.  ``zip`` stops at the
    # shorter list, so a short (last) page no longer raises IndexError.
    for userid, raw_content in zip(useridlist[:_page_size], conlist[:_page_size]):
        for entry in uselist:  # match the comment's user id to a known user
            if entry.userid == str(userid):
                print("用户名是:" + _unescape(entry.username))
        content = remove_emoji(raw_content)  # strip emoji and line feeds
        print("评论内容是:" + _unescape(content))
        print("\n")

    # The id at the last position of a full page is the cursor for the next
    # request; without it there is nothing more to fetch.
    if len(idlist) < _page_size:
        break
    comid = idlist[_page_size - 1]

结果展示
15Python爬虫---爬虫定向爬取腾讯视频网---利刃出击评论
15Python爬虫---爬虫定向爬取腾讯视频网---利刃出击评论