Scraping comment data from the East Money Guba stock forum
程序员文章站
2022-05-02 22:12:53
The East Money Guba forum (东方财富股吧) has anti-scraping measures, so we avoid detection by slowing down the crawl rate. Scraping this way is slow and puts no noticeable load on the site; the approach could of course be improved to fetch data faster. That said, scraping data unlawfully is not encouraged; this is for learning and exchange only.
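The script below simply sleeps between requests. As a rough sketch of what a faster but still polite variant could look like, you can reuse one connection and let the HTTP layer retry with back-off; the session object, retry counts, and timeout here are illustrative choices, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()  # reuse one connection for all requests
retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("http://", HTTPAdapter(max_retries=retry))
session.mount("https://", HTTPAdapter(max_retries=retry))

resp = session.get("http://guba.eastmoney.com/list,zssh000001,f_1.html",
                   headers={"User-Agent": "Mozilla/5.0"},
                   timeout=10)
print(resp.status_code)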
Here is the code:
import requests
from bs4 import BeautifulSoup
import time
import random
import csv  # import the csv module
#f = open('test.csv', 'w', encoding='utf-8')
f = open('comment.csv', 'w', newline="", encoding='utf-8')  # create the output file; write UTF-8 so Chinese text is preserved
csv_writer = csv.writer(f)  # build a csv writer on top of the file object
csv_writer.writerow(["序号", "commentId", "text", "userId", "date", "likeCount", "fans"])
count = 0
def getHtml(url):  # download the page source
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; LCTE; rv:11.0) like Gecko'}
    try:
        r = requests.get(url, headers=header)
        r.encoding = 'utf-8'
        #print(r.status_code)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        return getHtml(url)  # retry the request and pass the result back up
# region random delays
# fixed delay of x seconds
def delay_x_0_s(fixed_delay_num):
    x = float(fixed_delay_num)
    time.sleep(x)
# random delay of 0~y seconds
def delay_0_y_s(random_delay_num):
    y = float(random_delay_num)
    time.sleep(random.random() * y)
# fixed delay of x seconds, followed by a random delay of 0~y seconds
# the delay interval includes the lower bound but not the upper bound
def delay_x_y_s(fixed_delay_num, random_delay_num):
    delay_x_0_s(fixed_delay_num)
    delay_0_y_s(random_delay_num)
# random delay of x~y seconds
# the delay interval includes the lower bound but not the upper bound
def delay_between_x_y_s(start_delay_num, end_delay_num):
    x = float(start_delay_num)
    y = float(end_delay_num)
    delay_x_0_s(x)
    delay_0_y_s(y - x)
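# A quick illustration of the interval semantics (not in the original script):
# delay_between_x_y_s(2, 5) sleeps for 2 + random.random() * 3 seconds,
# i.e. somewhere in the half-open interval [2, 5).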
for page in range(100, 1001):
    delay_between_x_y_s(2, 5)
    url = "http://guba.eastmoney.com/list,zssh000001,f_" + str(page) + ".html"
    print(url)
    # url = 'http://guba.eastmoney.com/list,zssh000001,f_1.html'
    html = getHtml(url)
    soup = BeautifulSoup(html, "html.parser")
    #print(soup)
    contain = soup.find_all("div", {"class": "articleh"})  # the div tags that hold the post data; one page carries several posts, so contain is a list
    for i in contain[:]:  # iterate over the posts on this page
        try:
            delay_between_x_y_s(2, 5)
            content = i.find("span", {"class": "l3 a3"}).find("a")  # the <a> tag under the third span of this div; it carries href and title attributes
            print(content)
            contentUrl = "http://guba.eastmoney.com" + content["href"]  # content["href"] is the post's detail-page URL; it is a relative path, so prepend the site prefix to get the full URL
            print("contentUrl: " + contentUrl + "\n")
            commentId = content["href"][-14:-5]  # the href value follows a fixed pattern: the characters from position -14 to -5 are the post id
            print("commentId: " + commentId + "\n")
            text = content.attrs["title"]  # the post text (its title)
            print("text: " + text + "\n")
            userUrl = i.find("span", {"class": "l4 a4"}).find("a").attrs["href"]  # get the author's profile URL the same way
            if userUrl == "/list,cjpl.html":
                continue
            print("userUrl: " + userUrl + "\n")
            userId = userUrl[23:]  # the user id is the part after the profile-URL prefix
            if userId == "":  # skip discussion threads
                continue
            if userId == "3006113720930996":  # skip official Guba accounts
                continue
            if userId == "/3006113720930996":
                continue
            if userId == "7428111481466798":
                continue
            if userId == "6712111507146464":
                continue
            if userId == "6255325874333310":
                continue
            print("userId: " + userId + "\n")
            delay_between_x_y_s(2, 5)
            commentHtml = getHtml(contentUrl)  # fetch the post's detail page
            #print("commentHtml: " + commentHtml + "\n")
            soup = BeautifulSoup(commentHtml, "html.parser")
            #print(soup)
            date = soup.find("div", {"class": "zwfbtime"}).text[4:14]  # the post's publication date
            print("date: " + date + "\n")
            #if date == "2020-07-01":
            #    continue
            likeCount = int(soup.find("div", {"data-like_count": True}).attrs['data-like_count'])  # like count, cast to int (attribute values read from HTML are strings)
            print("likeCount: ", likeCount, "\n")
            #likeCount = int(soup.find("div", {"data-like_count": True}).attrs['data-like_count'])
            #print(likeCount)
            delay_between_x_y_s(2, 5)
            userHtml = getHtml(userUrl)  # fetch the author's profile page
            soup = BeautifulSoup(userHtml, "html.parser")
            #print(soup)
            fans = int(soup.find("a", {"id": "tafansa"}).find("span").text)  # the author's follower count
            print("fans: ", fans, "\n")
            count = count + 1
            csv_writer.writerow([count, commentId, text, userId, date, likeCount, fans])
        except Exception:
            print('hit an exception, moving on to the next post')
            continue
f.close()
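To sanity-check the output, the CSV written above can be read back in. This is only a minimal sketch; it assumes comment.csv sits in the current working directory with the header row written at the top of the script:

import csv

with open("comment.csv", newline="", encoding="utf-8") as fp:
    rows = list(csv.DictReader(fp))

print(len(rows), "posts collected")
if rows:
    print(rows[0]["text"], rows[0]["likeCount"], rows[0]["fans"])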