Weibo Crawler: Scraping Post Text and Its Comments


There is a lot of drama being followed on Weibo these days. With a crawler like this, wouldn't keeping up with it become much more convenient?

Here I'll share a Weibo crawler I put together; the code is as follows:

import requests
import json
import jsonpath
import re
import sys
import time


next_url='https://m.weibo.cn/comments/hotflow?max_id={}&max_id_type=0'
start_url='https://weibo.com/ajax/search/all?containerid=100103type%3D1%26q%3D%E5%A5%BD%E7%88%B6%E4%BA%B2%E7%9A%84%E6%A0%87%E5%87%86%E6%98%AF%E4%BB%80%E4%B9%88%26t%3D1&page={0}&count=20'
con_url = 'https://m.weibo.cn/comments/hotflow?max_id_type=0'
headers={
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'cookie': 'SINAGLOBAL=5247530668819.884.1622160611865; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFoni98LoBWdOUx0-Cu2O2S5JpX5KMhUgL.Foq0e0eESKepeoB2dJLoI7_SIPiyqcpL1K.R15tt; UOR=,,login.sina.com.cn; ULV=1626055646666:4:2:2:6223248291248.269.1626055646567:1625491979289; WBPSESS=OweSB0Q9wGw7I40GtkEai7H6sRRts-vVBBxLtL3RmX1gWM1N4LHSDvCUJo4Ts8MkA24apEBR5VSLQPlC2g2jQ5j06zcnMrZKjaQ8vBjORPj2k2MYEo4Tu8rtJy2MM4J4; ALF=1658100708; SSOLoginState=1626564709; SCF=Ap6brGmEqpQq5NnmcRxcK9nIeJ-7tfUouZTX8aAHssj2l1FDigpcReY-xa91rp0C7b-aPg1xETZPizBjgl4Gvkk.; SUB=_2A25N9xg1DeRhGeBN6FET9S3NyTiIHXVuhQ79rDV8PUNbmtAKLXDnkW9NRDlaaIztMCC8P74pjMmEnS3rCG3TsAqJ; XSRF-TOKEN=suWUUzEKdrNs70NVxF44EG_F',
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9'
}
def get_content():  # fetch the post text from the search results
    for page in range(1, 10):
        resp = requests.get(start_url.format(page), headers=headers)
        json_data = resp.json()
        content = jsonpath.jsonpath(json_data, '$..text_raw')  # extract the post text
        if not content:  # jsonpath returns False when nothing matches
            continue
        print(content)
        with open('wb.txt', 'a+', encoding='utf-8') as f:  # append to file
            for text in content:
                f.write(text + '\n')

def get_comments(url):
    headers1={
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
        'cookie': 'WEIBOCN_FROM=1110005030; SUB=_2A25N9wUGDeRhGeFL6FYY-CfKzzqIHXVvG6tOrDV6PUJbkdAKLVLwkW1NQj7MU04d8NXbiy_rw1YKW0qLCf7HZvcL; MLOGIN=1; _T_WM=55482865906; XSRF-TOKEN=dad078; M_WEIBOCN_PARAMS=oid%3D4650309697668535%26luicode%3D20000061%26lfid%3D4650309697668535%26uicode%3D20000061%26fid%3D4650309697668535'
    }
    data = {
        'id':'4650309697668535',  # weibo_id
        'mid':'4650309697668535',  # weibo_id
        'max_id':None
    }
    resp = requests.get(url, headers=headers1, params=data)
    json_data = resp.json()
    comments = jsonpath.jsonpath(json_data, '$..text')  # comment text
    if comments:  # jsonpath returns False when nothing matches
        with open('wb.txt', 'a+', encoding='utf-8') as f:  # append to file
            f.write('Comments below:\n')
            for com in comments:
                c = re.sub('<.*?>', '', com) + '\n'  # strip HTML tags
                f.write(c)
                print(c)
    time.sleep(2)
    max_ids = jsonpath.jsonpath(json_data, '$..max_id')
    if not max_ids or max_ids[0] == 0:  # no further comment pages
        sys.exit('Finished crawling all comments')
    get_comments(next_url.format(max_ids[0]))

def get_comments2(url):
    headers1={
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
        'cookie': 'WEIBOCN_FROM=1110005030; SUB=_2A25N9wUGDeRhGeFL6FYY-CfKzzqIHXVvG6tOrDV6PUJbkdAKLVLwkW1NQj7MU04d8NXbiy_rw1YKW0qLCf7HZvcL; MLOGIN=1; _T_WM=55482865906; XSRF-TOKEN=dad078; M_WEIBOCN_PARAMS=oid%3D4650309697668535%26luicode%3D20000061%26lfid%3D4650309697668535%26uicode%3D20000061%26fid%3D4650309697668535'
    }
    data = {
        'id':'4660946494822166',  # weibo_id
        'mid':'4660946494822166',  # weibo_id
        'max_id':None
    }
    resp = requests.get(url, headers=headers1, params=data)
    json_data = resp.json()
    comments = jsonpath.jsonpath(json_data, '$..text')  # comment text
    if comments:  # jsonpath returns False when nothing matches
        with open('wb.txt', 'a+', encoding='utf-8') as f:  # append to file
            f.write('Comments below:\n')
            for com in comments:
                c = re.sub('<.*?>', '', com) + '\n'  # strip HTML tags
                f.write(c)
                print(c)
    time.sleep(2)
    max_ids = jsonpath.jsonpath(json_data, '$..max_id')
    if not max_ids or max_ids[0] == 0:  # no further comment pages
        sys.exit('Finished crawling all comments')
    get_comments2(next_url.format(max_ids[0]))

if __name__ == '__main__':
    #get_content()
    get_comments2(con_url)
    #get_comments(con_url)
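
Since get_comments and get_comments2 differ only in the hard-coded post id, the two could be merged by passing the id in as a parameter. The sketch below is only an illustration of that idea, not part of the original script: get_comments_for and mobile_headers are names introduced here (mobile_headers stands for the mobile User-Agent/cookie dict defined inside get_comments), and the recursion is replaced with a loop.

def get_comments_for(url, weibo_id, mobile_headers):
    """Crawl every comment page for one post, following max_id for paging."""
    params = {'id': weibo_id, 'mid': weibo_id, 'max_id': None}  # None params are dropped by requests
    while True:
        resp = requests.get(url, headers=mobile_headers, params=params)
        json_data = resp.json()
        comments = jsonpath.jsonpath(json_data, '$..text')
        if comments:
            with open('wb.txt', 'a+', encoding='utf-8') as f:
                for com in comments:
                    f.write(re.sub('<.*?>', '', com) + '\n')  # strip HTML tags
        max_ids = jsonpath.jsonpath(json_data, '$..max_id')
        if not max_ids or max_ids[0] == 0:  # no further comment pages
            break
        url = next_url.format(max_ids[0])  # next page of comments
        time.sleep(2)

# example usage with the id from get_comments2 above (hypothetical call):
# get_comments_for(con_url, '4660946494822166', mobile_headers)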



The most important part of writing a Weibo crawler is finding where the data actually lives and working out the URL (and its query parameters) that returns it.
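
To make that concrete, here is a minimal sketch (not part of the original script) of how the search URL stored in start_url above is assembled: the search keyword is URL-encoded into the containerid query parameter, while page and count control paging.

from urllib.parse import quote

keyword = '好父亲的标准是什么'  # the keyword encoded inside start_url above
containerid = quote('100103type=1&q={}&t=1'.format(keyword), safe='')  # percent-encode '=', '&' and the keyword
search_url = 'https://weibo.com/ajax/search/all?containerid={}&page=1&count=20'.format(containerid)
print(search_url)  # matches start_url with page=1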
