A Weibo crawler for post text and comments
There is a lot of gossip going around on Weibo these days, and following it becomes much more convenient with a crawler.
Below I share a Weibo crawler I put together; the code is as follows:
import requests
import json
import jsonpath
import re
import sys
import time
next_url='https://m.weibo.cn/comments/hotflow?max_id={}&max_id_type=0'
#next_url2='https://m.weibo.cn/comments/hotflow?max_id={}&max_id_type=0'
#url='https://weibo.com/ajax/search/all?containerid=100103type%3D1%26q%3D%E5%A5%BD%E7%88%B6%E4%BA%B2%E7%9A%84%E6%A0%87%E5%87%86%E6%98%AF%E4%BB%80%E4%B9%88%26t%3D1&page=1&count=20'
start_url='https://weibo.com/ajax/search/all?containerid=100103type%3D1%26q%3D%E5%A5%BD%E7%88%B6%E4%BA%B2%E7%9A%84%E6%A0%87%E5%87%86%E6%98%AF%E4%BB%80%E4%B9%88%26t%3D1&page={0}&count=20'
con_url = 'https://m.weibo.cn/comments/hotflow?max_id_type=0'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'cookie': 'SINAGLOBAL=5247530668819.884.1622160611865; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFoni98LoBWdOUx0-Cu2O2S5JpX5KMhUgL.Foq0e0eESKepeoB2dJLoI7_SIPiyqcpL1K.R15tt; UOR=,,login.sina.com.cn; ULV=1626055646666:4:2:2:6223248291248.269.1626055646567:1625491979289; WBPSESS=OweSB0Q9wGw7I40GtkEai7H6sRRts-vVBBxLtL3RmX1gWM1N4LHSDvCUJo4Ts8MkA24apEBR5VSLQPlC2g2jQ5j06zcnMrZKjaQ8vBjORPj2k2MYEo4Tu8rtJy2MM4J4; ALF=1658100708; SSOLoginState=1626564709; SCF=Ap6brGmEqpQq5NnmcRxcK9nIeJ-7tfUouZTX8aAHssj2l1FDigpcReY-xa91rp0C7b-aPg1xETZPizBjgl4Gvkk.; SUB=_2A25N9xg1DeRhGeBN6FET9S3NyTiIHXVuhQ79rDV8PUNbmtAKLXDnkW9NRDlaaIztMCC8P74pjMmEnS3rCG3TsAqJ; XSRF-TOKEN=suWUUzEKdrNs70NVxF44EG_F',
    'accept': 'application/json, text/plain, */*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9'
}
def get_content():  # fetch the post text from the search result pages
    for page in range(1, 10):
        resp = requests.get(start_url.format(page), headers=headers)
        json_data = resp.json()
        # print(json_data)
        content = jsonpath.jsonpath(json_data, '$..text_raw')  # post text
        print(content)
        if not content:  # jsonpath.jsonpath returns False when nothing matches
            continue
        with open('wb.txt', 'a+', encoding='utf-8') as f:  # save to file
            for text in content:
                f.write(text)
lst = []  # (defined but not used below)
def get_comments(url):
    headers1 = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
        'cookie': 'WEIBOCN_FROM=1110005030; SUB=_2A25N9wUGDeRhGeFL6FYY-CfKzzqIHXVvG6tOrDV6PUJbkdAKLVLwkW1NQj7MU04d8NXbiy_rw1YKW0qLCf7HZvcL; MLOGIN=1; _T_WM=55482865906; XSRF-TOKEN=dad078; M_WEIBOCN_PARAMS=oid%3D4650309697668535%26luicode%3D20000061%26lfid%3D4650309697668535%26uicode%3D20000061%26fid%3D4650309697668535'
    }
    data = {
        'id': '4650309697668535',   # weibo_id of the post
        'mid': '4650309697668535',  # weibo_id
        'max_id': None
    }
    resp = requests.get(url, headers=headers1, params=data)
    json_data = resp.json()
    comments = jsonpath.jsonpath(json_data, '$..text')  # comment text (HTML)
    # print(len(comments))
    if comments:
        with open('wb.txt', 'a+', encoding='utf-8') as f:  # save to file
            f.write('The comments are as follows:')
            for com in comments:
                c = re.sub('<.*?>', '', com)  # strip HTML tags
                c = c + '\n'
                f.writelines(c)
                print(c)
    time.sleep(2)
    try:
        max_ids = jsonpath.jsonpath(json_data, '$..max_id')[0]  # cursor for the next comment page
        # print(max_ids)
    except (TypeError, IndexError):
        sys.exit('Finished crawling the comments')
    get_comments(next_url.format(max_ids))
def get_comments2(url):
    headers1 = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
        'cookie': 'WEIBOCN_FROM=1110005030; SUB=_2A25N9wUGDeRhGeFL6FYY-CfKzzqIHXVvG6tOrDV6PUJbkdAKLVLwkW1NQj7MU04d8NXbiy_rw1YKW0qLCf7HZvcL; MLOGIN=1; _T_WM=55482865906; XSRF-TOKEN=dad078; M_WEIBOCN_PARAMS=oid%3D4650309697668535%26luicode%3D20000061%26lfid%3D4650309697668535%26uicode%3D20000061%26fid%3D4650309697668535'
    }
    data = {
        'id': '4660946494822166',   # weibo_id of the post
        'mid': '4660946494822166',  # weibo_id
        'max_id': None
    }
    resp = requests.get(url, headers=headers1, params=data)
    json_data = resp.json()
    comments = jsonpath.jsonpath(json_data, '$..text')  # comment text (HTML)
    # print(len(comments))
    if comments:
        with open('wb.txt', 'a+', encoding='utf-8') as f:  # save to file
            f.write('The comments are as follows:')
            for com in comments:
                c = re.sub('<.*?>', '', com)  # strip HTML tags
                c = c + '\n'
                # f.write('The comments are as follows:')
                f.writelines(c)
                print(c)
    time.sleep(2)
    try:
        max_ids = jsonpath.jsonpath(json_data, '$..max_id')[0]  # cursor for the next comment page
        # print(max_ids)
    except (TypeError, IndexError):
        sys.exit('Finished crawling the comments')
    get_comments2(next_url.format(max_ids))
if __name__ == '__main__':
    # get_content()
    get_comments2(con_url)
    # get_comments(con_url)
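Note that get_comments and get_comments2 are identical apart from the hard-coded post id, so they can be merged into one function that takes the id as a parameter. The sketch below is one way to do that; the crawl_comments name, the weibo_id parameter, and stopping when max_id comes back as 0 are my own additions rather than part of the original script, and it assumes the mobile headers1 dict above has been moved to module level.

def crawl_comments(weibo_id):
    """Fetch comment pages for one post, following max_id until the API stops paging."""
    max_id = None
    while True:
        params = {'id': weibo_id, 'mid': weibo_id, 'max_id_type': 0}
        if max_id:
            params['max_id'] = max_id
        resp = requests.get('https://m.weibo.cn/comments/hotflow',
                            headers=headers1, params=params)  # headers1: mobile UA + cookie, assumed module-level
        json_data = resp.json()
        comments = jsonpath.jsonpath(json_data, '$..text')
        if comments:
            with open('wb.txt', 'a+', encoding='utf-8') as f:
                for com in comments:
                    f.write(re.sub('<.*?>', '', com) + '\n')  # strip HTML tags before saving
        next_ids = jsonpath.jsonpath(json_data, '$..max_id')
        # no max_id, or a max_id of 0, is treated here as "no more pages"
        if not next_ids or next_ids[0] == 0:
            print('Finished crawling the comments')
            break
        max_id = next_ids[0]
        time.sleep(2)

# usage: crawl_comments('4650309697668535') or crawl_comments('4660946494822166')

Using a loop instead of recursion also avoids hitting Python's recursion limit on posts with many comment pages.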
The key step in a Weibo crawler is finding where the data actually lives and working out the URL that serves it.
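For example, before writing any jsonpath expressions it helps to hit the comment endpoint once from a small throwaway script and inspect the returned JSON. A minimal sketch along those lines follows; the post id is the one used above, and the cookie is a placeholder you would copy from your own logged-in m.weibo.cn session.

import json
import requests

probe_headers = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1',
    'cookie': '<your m.weibo.cn cookie here>',  # placeholder, copy from the browser
}
params = {'id': '4650309697668535', 'mid': '4650309697668535', 'max_id_type': 0}

resp = requests.get('https://m.weibo.cn/comments/hotflow', headers=probe_headers, params=params)
payload = resp.json()

print(list(payload.keys()))  # top-level keys of the response
print(json.dumps(payload, ensure_ascii=False, indent=2)[:2000])  # eyeball the structure

# From this dump you can see where each comment's HTML ('text') and the paging
# cursor ('max_id') sit, which is why the crawler above queries '$..text' and '$..max_id'.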