Python crawler: scraping and analyzing Weibo comments
A simple crawl of Weibo comments, followed by some basic analysis.
Tasks:
1. Crawl the comments and their timestamps (requests and re)
2. Word-frequency statistics (jieba; see the short sketch right after this list)
3. Word-cloud rendering (wordcloud)
4. Time distribution of the comments (matplotlib)
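For anyone who has not used jieba before, here is a minimal, self-contained sketch of what the segmentation and word-frequency steps produce. The sample sentence is made up for illustration, and collections.Counter is only a shortcut here; the full script below builds the same counts with a plain dict.

import jieba
from collections import Counter

sample = "今天的微博评论大多在讨论同一个话题"  # made-up sentence, not crawled data
words = jieba.lcut(sample)                      # e.g. ['今天', '的', '微博', '评论', ...]
print(words)
print(Counter(words).most_common(3))            # top 3 words and their counts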
The full code is as follows:
#heheyang
import requests
import re
import jieba
import wordcloud
import time as ti
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Data crawling
start_url = 'weibo_path&&page='
header = {'cookie': 'your cookie',
          'user-agent': 'your agent'}
f = open('path', 'w', encoding='utf-8')
# f.write('time\tcomment\n')
all_comment = ''
all_time = []
for i in range(1, 51):
    # print('Crawling page %d....' % i)
    url = start_url + str(i)
    r = requests.get(url, timeout=30, headers=header)
    r.raise_for_status()
    r.encoding = 'utf-8'
    comment = re.findall('<span class="ctt">(.*?)</span>', r.text)
    time = re.findall('<span class="ct">(.*?) ', r.text)
    if i == 1:
        # the first page carries four extra spans before the comments
        del comment[0:4]
        del time[0:4]
    for j in range(len(comment)):  # separate index so the page counter i is not shadowed
        f.write(time[j] + '\t')
        f.write(comment[j] + '\n')
        all_comment += comment[j]
        all_time.append(time[j])
f.close()
#print(all_comment)
#print(all_time)
# Comment word cloud
pattern = "[\u4e00-\u9fa5]+"  # keep only Chinese characters
regex = re.compile(pattern)
comment_chinese = regex.findall(all_comment)
text = ''
for i in comment_chinese:
    text += i
# Word-frequency statistics
words = jieba.lcut(text)
counts = {}
for word in words:
    if word not in counts:
        counts[word] = 1
    else:
        counts[word] += 1
words_counts = list(counts.items())
words_counts.sort(key=lambda x: x[1], reverse=True)
# Formatted output
tplt = "{0:^10}\t{1:^10}\t"
print(tplt.format("word", "count"))
for i in range(10):
    word, count = words_counts[i]
    # print the top 10
    print(tplt.format(word, count))
# Word cloud
cut=' '.join(words)
w=wordcloud.WordCloud(font_path='msyh.ttc',collocations=False,height=600, width=1000,background_color='white')
w.generate(cut)
w.to_file('word.png')
# Popularity over time
time_list = []
for i in all_time:
    i = i.replace('月', '-')
    i = i.replace('日', '')
    i = '2020-' + i + ':00'
    ts = int(ti.mktime(ti.strptime(i, "%Y-%m-%d %H:%M:%S")))  # convert to a Unix timestamp
    time_list.append(ts)
df = pd.DataFrame({'timeStamp': time_list})
mean = df['timeStamp'].mean()
std = df['timeStamp'].std()
x = np.arange(1591889940, 1593856680, 60)  # the crawled time window, one point per minute
y = np.exp(-((x - mean) ** 2) / (2 * std ** 2)) / (std * np.sqrt(2 * np.pi))  # normal density fitted to the timestamps
plt.plot(x, y)
# plt.hist(df['timeStamp'], bins=12, rwidth=0.9, density=True)
plt.title('time distribution')
plt.xlabel('Time')
plt.ylabel('Attention to events')
plt.savefig('time distribution.png')
plt.show()
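A quick note on that last block: the y formula is just the probability density of a normal distribution fitted to the comment timestamps through their sample mean and standard deviation (the commented-out plt.hist line is the more direct way to look at the raw distribution). If scipy happens to be available (an assumption on my part; the script above does not use it), the same curve can be drawn with scipy.stats.norm. A self-contained sketch with made-up timestamps:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

# Made-up Unix timestamps standing in for the crawled comment times
timestamps = np.array([1591890000, 1592100000, 1592400000, 1592600000, 1593100000, 1593850000])
mean, std = timestamps.mean(), timestamps.std(ddof=1)  # ddof=1 matches pandas' default .std()
x = np.linspace(timestamps.min(), timestamps.max(), 1000)
plt.plot(x, norm.pdf(x, loc=mean, scale=std))          # fitted normal density
plt.show()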
I have not written many crawlers and am still a beginner learning as I go; feedback and discussion are welcome...
Original post: https://blog.csdn.net/heheyangxyy/article/details/107475931