Learning Web Scraping
About Web Scraping (1)

Scraping Weibo comments. The script below pages through Weibo's AJAX comment endpoint (/aj/v6/comment/big), which returns JSON whose data["html"] field holds the rendered comment list, and parses that fragment with BeautifulSoup. The hard-coded cookie is tied to one logged-in session, so substitute your own.
# coding=utf-8
import bs4
import requests

def getUrl(url):
    # Request one page of comments; the endpoint answers with JSON whose
    # data["html"] field contains the rendered comment list.
    head = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Version/14.0 Safari/605.1.15',
        'cookie': 'cross_origin_proto=SSL; Apache=9248610096172.86.1601702374292; '
                  'SINAGLOBAL=9248610096172.86.1601702374292; '
                  'ULV=1601702374297:5:1:3:9248610096172.86.1601702374292:1601205183902; UOR=baidu.com,weibo.com,'
                  'baidu.com; _s_tentry=baidu.com; wb_view_log=1440*9002; WBStorage=70753a84f86f85ff|undefined; '
                  'login_sid_t=ea6532b90a65382499bc8bc3b8c411d2; '
                  'SUB=_2AkModLgCf8NxqwJRmP0VzWnnbY92wwzEieKeKEnZJRMxHRl-yT_nqhQbtRB6A_SW7UzYV2u4jCUtL'
                  '-JGazVDIFpfpGH4; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFMlkjK8_2...kALQdYsmdD'
    }
    req = requests.get(url, headers=head)
    return req.json()["data"]["html"]
baseUrl = "https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4555570424058617&from=singleWeiBo&__rnd=1601702735666"

root_comment_max_id = ""  # ID of the last comment on the previous page minus one; used to build the next URL
sum_comment_number = 0    # number of comments already shown

for i in range(1, 44):
    sum_comment_number = i * 15  # the endpoint serves 15 comments per page
    data = getUrl(baseUrl + "&page=" + str(i)
                  + "&sum_comment_number=" + str(sum_comment_number)
                  + "&root_comment_max_id=" + str(root_comment_max_id))
    soup = bs4.BeautifulSoup(data, "lxml")
    comment = soup.findAll('div', class_='list_li S_line1 clearfix')
    # remember where this page ended so the next request continues from there
    root_comment_max_id = int(comment[-1]["comment_id"]) - 1
    for j in comment:
        # [1:] drops the leading character before the comment body
        print(j.find("div", class_='WB_text').text[1:])
    print(i)  # progress: page just finished
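
As a possible extension (not part of the original script), here is a minimal sketch that writes each comment to a CSV file instead of printing it, stops cleanly when a page comes back empty, and pauses between requests. It assumes getUrl and baseUrl are defined as above; the filename weibo_comments.csv and the one-second delay are arbitrary choices.

import csv
import time

import bs4

with open("weibo_comments.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["page", "comment_id", "text"])
    root_comment_max_id = ""
    for i in range(1, 44):
        html = getUrl(baseUrl + "&page=" + str(i)
                      + "&sum_comment_number=" + str(i * 15)
                      + "&root_comment_max_id=" + str(root_comment_max_id))
        soup = bs4.BeautifulSoup(html, "lxml")
        rows = soup.findAll('div', class_='list_li S_line1 clearfix')
        if not rows:
            break  # no more comments; avoid crashing on rows[-1]
        for r in rows:
            # comment_id and WB_text follow the same markup as the script above
            writer.writerow([i, r["comment_id"], r.find("div", class_='WB_text').text[1:]])
        root_comment_max_id = int(rows[-1]["comment_id"]) - 1
        time.sleep(1)  # assumption: a short pause keeps request volume polite

Keeping the guard on an empty page matters here: the original loop indexes comment[-1] unconditionally, which raises an IndexError once the endpoint runs out of comments before page 43.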