欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

python爬虫微博爬取以及分析

程序员文章站 2023-12-31 12:47:04
对微博评论进行爬取,获取评论时间和文本,并进行简单分析。任务:1. 爬取评论和时间(requests 和 re);2. 词频统计(jieba);3. 词云展示(wordcloud);4. 时间分布(matplotlib)。代码开头:#heheyang import requests / import re / import jieba / import wordcloud / import time as ti / import pandas as pd / import numpy as np / import matplotlib.pyplot as plt ...

对微博评论进行简单爬取并进行分析
任务:
1.爬取评论和时间(requests和re)
2.词频统计(jieba)
3.词云展示(wordcloud)
4.时间分布(matplotlib)
代码如下:

#heheyang
import requests
import re
import jieba
import wordcloud
import time as ti
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Data scraping: fetch 50 comment pages, pull out each comment's text and
# timestamp with regexes, write them to a tab-separated file, and keep them in
# memory (all_comment / all_time) for the analysis steps further down.
start_url='weibo_path&&page='
header = {'cookie':'your cookie',
          'user-agent':'your agent'}
# Compiled once because the same two patterns are applied to every page.
comment_re = re.compile('<span class="ctt">(.*?)</span>')
time_re = re.compile('<span class="ct">(.*?)&nbsp')
comment_parts = []
all_time = []
# The context manager guarantees the output file is closed even if a request
# fails part-way through the crawl.
with open('path', 'w', encoding='utf-8') as f:
    for page in range(1, 51):
        url = start_url + str(page)
        r = requests.get(url, timeout=30, headers=header)
        r.raise_for_status()
        r.encoding = 'utf-8'
        comment = comment_re.findall(r.text)
        time = time_re.findall(r.text)
        if page == 1:
            # The first four matches on page 1 are discarded.
            # NOTE(review): presumably these belong to the original post or
            # pinned entries rather than ordinary comments -- confirm.
            del comment[0:4]
            del time[0:4]
        # zip() pairs each timestamp with its comment and stops at the shorter
        # list, so a page where the two regexes match a different number of
        # times no longer raises IndexError.
        for stamp, body in zip(time, comment):
            f.write(stamp + '\t')
            f.write(body + '\n')
            comment_parts.append(body)
            all_time.append(stamp)
# Single concatenated string of every comment, consumed by the word analysis.
all_comment = ''.join(comment_parts)
# Comment word analysis: keep only the Chinese characters of the collected
# comments, segment them with jieba, print the most frequent words, and render
# a word cloud image.
pattern="[\u4e00-\u9fa5]+"
regex = re.compile(pattern)
comment_chinese = regex.findall(all_comment)
text = ''.join(comment_chinese)
# Word-frequency count
words = jieba.lcut(text)
counts = {}
for word in words:
    counts[word] = counts.get(word, 0) + 1
words_counts = list(counts.items())
# Stable sort: words with equal counts keep their first-seen order.
words_counts.sort(key=lambda x: x[1], reverse=True)
# Formatted output
tplt = "{0:^10}\t{1:^10}\t"
print(tplt.format("word", "      count"))
# Print the top 10; slicing (rather than indexing 0..9) avoids an IndexError
# when fewer than 10 distinct words were found.
for word, count in words_counts[:10]:
    print(tplt.format(word, count))
# Word cloud: WordCloud expects whitespace-separated tokens.
cut = ' '.join(words)
w = wordcloud.WordCloud(font_path='msyh.ttc', collocations=False, height=600, width=1000, background_color='white')
w.generate(cut)
w.to_file('word.png')

# Popularity analysis: convert each scraped comment time to a Unix timestamp,
# fit a normal distribution (mean / std of the timestamps) and plot its density
# over a fixed time window.
time_list = []
for raw in all_time:
    # Scraped times look like "MM月DD日 HH:MM"; rewrite to "MM-DD HH:MM", then
    # add the (hard-coded) year 2020 and a ":00" seconds field.
    stamp = raw.replace('月', '-').replace('日', '')
    stamp = '2020-' + stamp + ':00'
    # mktime interprets the struct_time in the machine's local timezone.
    ts = int(ti.mktime(ti.strptime(stamp, "%Y-%m-%d %H:%M:%S")))
    time_list.append(ts)
# Built inline instead of through a variable named `dict`, which would shadow
# the builtin for the rest of the module.
df = pd.DataFrame({'timeStamp': time_list})
mean = df['timeStamp'].mean()
std = df['timeStamp'].std()
# Fixed plotting window (Unix timestamps). One sample per minute matches the
# resolution of the input times; the previous 0.1 s step allocated roughly
# 19.7 million points for the same smooth curve.
x = np.arange(1591889940, 1593856680, 60.0)
# Normal probability density with the sample mean and standard deviation.
y = np.exp(-((x - mean) ** 2) / (2 * std ** 2)) / (std * np.sqrt(2 * np.pi))
plt.plot(x, y)
# Alternative view: plt.hist(df['timeStamp'], bins=12, rwidth=0.9, density=True)
plt.title('time distribution')
plt.xlabel('Time')
plt.ylabel('Attention to events')
plt.savefig('time distribution.png')
plt.show()

爬虫写得不多,小白学习ing,欢迎大家交流…

本文地址:https://blog.csdn.net/heheyangxyy/article/details/107475931

相关标签: python

上一篇:

下一篇: