Python financial news analysis: word segmentation, word frequency statistics, word clouds, and interactive charts with matplotlib
Preface
Since time was tight, this post only gives a rough description and the project is not very polished; apologies.
The data comes from my previous article: https://blog.csdn.net/qq_35000950/article/details/104379091
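Each file under data_path is one day's news, named by date (for example 2020-02-11.json). Inferred from how cal_word_num below reads the files, the layout is roughly a dict keyed by the article index as a string, of which only the 'rich_text' field is used; a sketch (placeholder text only):
{
    "0": {"rich_text": "full text of the first article ..."},
    "1": {"rich_text": "full text of the second article ..."}
}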
Code
import json
from multiprocessing import Pool, Lock
import os
import jieba
import wordcloud
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import jieba.analyse
import jieba.posseg as psg
import pylab as pl
from matplotlib.widgets import Button
import random
file_list_index=0
plot_file_index=''
plot_file_list=[]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family']='sans-serif'
plt.rcParams['axes.unicode_minus'] = False
Pause=True
def delete_singal_word(text,word_list):
    for word in word_list:
        text=text.replace(word,"")
    return text
init_word='基金分类.txt'
data_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\data\\'
pinlu_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\频率数据存档\\'
png_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\词云存档\\'
work_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\'
start_date='2014-03-06'
date_remain=1200
start_index=-1
cnames = {
'aliceblue': '#F0F8FF',
'antiquewhite': '#FAEBD7',
'aqua': '#00FFFF',
'aquamarine': '#7FFFD4',
'azure': '#F0FFFF',
'beige': '#F5F5DC',
'bisque': '#FFE4C4',
'black': '#000000',
'blanchedalmond': '#FFEBCD',
'blue': '#0000FF',
'blueviolet': '#8A2BE2',
'brown': '#A52A2A',
'burlywood': '#DEB887',
'cadetblue': '#5F9EA0',
'chartreuse': '#7FFF00',
'chocolate': '#D2691E',
'coral': '#FF7F50',
'cornflowerblue': '#6495ED',
'cornsilk': '#FFF8DC',
'crimson': '#DC143C',
'cyan': '#00FFFF',
'darkblue': '#00008B',
'darkcyan': '#008B8B',
'darkgoldenrod': '#B8860B',
'darkgray': '#A9A9A9',
'darkgreen': '#006400',
'darkkhaki': '#BDB76B',
'darkmagenta': '#8B008B',
'darkolivegreen': '#556B2F',
'darkorange': '#FF8C00',
'darkorchid': '#9932CC',
'darkred': '#8B0000',
'darksalmon': '#E9967A',
'darkseagreen': '#8FBC8F',
'darkslateblue': '#483D8B',
'darkslategray': '#2F4F4F',
'darkturquoise': '#00CED1',
'darkviolet': '#9400D3',
'deeppink': '#FF1493',
'deepskyblue': '#00BFFF',
'dimgray': '#696969',
'dodgerblue': '#1E90FF',
'firebrick': '#B22222',
'floralwhite': '#FFFAF0',
'forestgreen': '#228B22',
'fuchsia': '#FF00FF',
'gainsboro': '#DCDCDC',
'ghostwhite': '#F8F8FF',
'gold': '#FFD700',
'goldenrod': '#DAA520',
'gray': '#808080',
'green': '#008000',
'greenyellow': '#ADFF2F',
'honeydew': '#F0FFF0',
'hotpink': '#FF69B4',
'indianred': '#CD5C5C',
'indigo': '#4B0082',
'ivory': '#FFFFF0',
'khaki': '#F0E68C',
'lavender': '#E6E6FA',
'lavenderblush': '#FFF0F5',
'lawngreen': '#7CFC00',
'lemonchiffon': '#FFFACD',
'lightblue': '#ADD8E6',
'lightcoral': '#F08080',
'lightcyan': '#E0FFFF',
'lightgoldenrodyellow': '#FAFAD2',
'lightgreen': '#90EE90',
'lightgray': '#D3D3D3',
'lightpink': '#FFB6C1',
'lightsalmon': '#FFA07A',
'lightseagreen': '#20B2AA',
'lightskyblue': '#87CEFA',
'lightslategray': '#778899',
'lightsteelblue': '#B0C4DE',
'lightyellow': '#FFFFE0',
'lime': '#00FF00',
'limegreen': '#32CD32',
'linen': '#FAF0E6',
'magenta': '#FF00FF',
'maroon': '#800000',
'mediumaquamarine': '#66CDAA',
'mediumblue': '#0000CD',
'mediumorchid': '#BA55D3',
'mediumpurple': '#9370DB',
'mediumseagreen': '#3CB371',
'mediumslateblue': '#7B68EE',
'mediumspringgreen': '#00FA9A',
'mediumturquoise': '#48D1CC',
'mediumvioletred': '#C71585',
'midnightblue': '#191970',
'mintcream': '#F5FFFA',
'mistyrose': '#FFE4E1',
'moccasin': '#FFE4B5',
'navajowhite': '#FFDEAD',
'navy': '#000080',
'oldlace': '#FDF5E6',
'olive': '#808000',
'olivedrab': '#6B8E23',
'orange': '#FFA500',
'orangered': '#FF4500',
'orchid': '#DA70D6',
'palegoldenrod': '#EEE8AA',
'palegreen': '#98FB98',
'paleturquoise': '#AFEEEE',
'palevioletred': '#DB7093',
'papayawhip': '#FFEFD5',
'peachpuff': '#FFDAB9',
'peru': '#CD853F',
'pink': '#FFC0CB',
'plum': '#DDA0DD',
'powderblue': '#B0E0E6',
'purple': '#800080',
'red': '#FF0000',
'rosybrown': '#BC8F8F',
'royalblue': '#4169E1',
'saddlebrown': '#8B4513',
'salmon': '#FA8072',
'sandybrown': '#FAA460',
'seagreen': '#2E8B57',
'seashell': '#FFF5EE',
'sienna': '#A0522D',
'silver': '#C0C0C0',
'skyblue': '#87CEEB',
'slateblue': '#6A5ACD',
'slategray': '#708090',
'snow': '#FFFAFA',
'springgreen': '#00FF7F',
'steelblue': '#4682B4',
'tan': '#D2B48C',
'teal': '#008080',
'thistle': '#D8BFD8',
'tomato': '#FF6347',
'turquoise': '#40E0D0',
'violet': '#EE82EE',
'wheat': '#F5DEB3',
'white': '#FFFFFF',
'whitesmoke': '#F5F5F5',
'yellow': '#FFFF00',
'yellowgreen': '#9ACD32'}
def mkdir(path):
    path=path.strip()
    path=path.rstrip("\\")
    isExists=os.path.exists(path)
    if not isExists:
        # create the directory if it does not exist yet
        os.makedirs(path)
        return True
    else:
        # the directory already exists, nothing to do
        return False
def cal_word_num(start_date,init_word=False,draw_wordclouds=True):
    # start_date is a task string in the form 'YYYY-MM-DD,days'
    date_remain=int(start_date.split(',')[1])
    start_date=start_date.split(',')[0]
    news_list_all=[]
    title_all=''
    file_list=os.listdir(data_path)
    file_list=sorted(file_list)
    if start_date+'.json' in file_list:
        start_index=file_list.index(start_date+'.json')
    else:
        print('No data for %s'%start_date)
        exit(0)
    if (start_index+date_remain+1)<=len(file_list):
        endindex=start_index+date_remain+1
    else:
        endindex=len(file_list)
    # collect the rich text of every news item in the period
    for i in range(start_index,endindex):
        with open(data_path+file_list[i], 'r',encoding='utf-8') as f:
            news_list=json.load(f)
        for j in range(len(news_list)):
            news_list_all.append(news_list[str(j)])
    for i in range(len(news_list_all)):
        title_all+=news_list_all[i]['rich_text']
    # strip digits, latin letters and punctuation before segmentation
    title_all=delete_singal_word(title_all,'0123456789.')
    title_all=delete_singal_word(title_all,'abcdefghijklmnopqrstuvwxyz')
    title_all=delete_singal_word(title_all,'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    title_all=delete_singal_word(title_all,r'!@#¥%……&*()aaa@qq.com#$%^&*()_+-=[];\',./{}{|:"<>??》《:“【】、;‘,。`~· ')
    # (a commented-out variant in the original used jieba.posseg / jieba.analyse.extract_tags
    # to filter words by part of speech instead)
    ls=jieba.cut(title_all)
    counts = {}
    divide_list=list(ls)
    if not init_word:
        # count every segmented word, then drop single characters and stop words
        drop_list=open(work_path+'stopword3.txt',encoding='utf-8').read().split('\n')
        for word in divide_list:
            counts[word] = counts.get(word,0) + 1
        keys=list(counts.keys())
        for i in keys:
            if len(i)==1 or i in drop_list:
                counts.pop(i)
    else:
        # only count the words listed in the custom vocabulary file
        with open(init_word,'r',encoding='utf-8') as f:
            init_word=f.read().split('\n')
        ndarray=np.array(divide_list)
        for i in init_word:
            counts[i]=np.sum(ndarray==i)
        keys=list(counts.keys())
        for i in keys:
            if counts[i]==0:
                counts.pop(i)
    # sort by frequency and write a word,num csv for this period
    items = list(counts.items())
    items.sort(key=lambda x:x[1], reverse=True)
    mkdir(pinlu_path[:-1]+str(date_remain)+'\\')
    with open(pinlu_path[:-1]+str(date_remain)+'\\'+str(date_remain)+','+start_date+'.txt', 'w',encoding='utf-8') as f:
        f.writelines("word,num"+'\n')
        for i in range(len(items)):
            word, count = items[i]
            f.writelines(word+","+str(count)+'\n')
    if draw_wordclouds:
        draw(start_date,date_remain)
def cal_word_num2(start_date,draw_wordclouds=True):
    # same as cal_word_num, but always counts only the custom vocabulary in the global init_word
    global init_word
    date_remain=int(start_date.split(',')[1])
    start_date=start_date.split(',')[0]
    news_list_all=[]
    title_all=''
    file_list=os.listdir(data_path)
    file_list=sorted(file_list)
    if start_date+'.json' in file_list:
        start_index=file_list.index(start_date+'.json')
    else:
        print('No data for %s'%start_date)
        exit(0)
    if (start_index+date_remain+1)<=len(file_list):
        endindex=start_index+date_remain+1
    else:
        endindex=len(file_list)
    for i in range(start_index,endindex):
        with open(data_path+file_list[i], 'r',encoding='utf-8') as f:
            news_list=json.load(f)
        for j in range(len(news_list)):
            news_list_all.append(news_list[str(j)])
    for i in range(len(news_list_all)):
        title_all+=news_list_all[i]['rich_text']
    title_all=delete_singal_word(title_all,'0123456789.')
    title_all=delete_singal_word(title_all,'abcdefghijklmnopqrstuvwxyz')
    title_all=delete_singal_word(title_all,'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    title_all=delete_singal_word(title_all,r'!@#¥%……&*()aaa@qq.com#$%^&*()_+-=[];\',./{}{|:"<>??》《:“【】、;‘,。`~· ')
    ls=jieba.cut(title_all)
    counts = {}
    divide_list=list(ls)
    # load the vocabulary file the first time; on later calls in the same process it is already a list
    if isinstance(init_word,str):
        with open(init_word,'r',encoding='utf-8') as f:
            init_word=f.read().split('\n')
    ndarray=np.array(divide_list)
    for i in init_word:
        counts[i]=np.sum(ndarray==i)
    keys=list(counts.keys())
    for i in keys:
        if counts[i]==0:
            counts.pop(i)
    items = list(counts.items())
    items.sort(key=lambda x:x[1], reverse=True)
    mkdir(pinlu_path[:-1]+str(date_remain)+'\\')
    with open(pinlu_path[:-1]+str(date_remain)+'\\'+str(date_remain)+','+start_date+'.txt', 'w',encoding='utf-8') as f:
        f.writelines("word,num"+'\n')
        for i in range(len(items)):
            word, count = items[i]
            f.writelines(word+","+str(count)+'\n')
    if draw_wordclouds:
        draw(start_date,date_remain)
def init(l):
    global lock
    lock = l
def draw(start_date,date_remain):
    df=pd.read_csv(pinlu_path[:-1]+str(date_remain)+'\\'+str(date_remain)+','+start_date+'.txt',encoding='utf-8')
    word_frequence = {df['word'].iat[x]:int(df['num'].iat[x]) for x in range(len(df))}
    # set up the word-cloud canvas and point font_path at a font that supports Chinese (otherwise the output is garbled)
    W = wordcloud.WordCloud(font_path = "msyh.ttc",width = 1920,height = 1080,background_color = "white",prefer_horizontal=1)
    W = W.fit_words(word_frequence)
    # make sure the output directory exists, then save the word-cloud image
    mkdir(png_path[:-1]+str(date_remain)+'\\')
    W.to_file(png_path[:-1]+str(date_remain)+'\\'+str(date_remain)+','+start_date+".png")
def 历史热度图(start,divide):
    work_list=周期分配(start,divide)
    jieba.load_userdict('C:\\other_file\\coal_dict.txt')
    lock = Lock()
    pool = Pool(16, initializer=init, initargs=(lock,))
    pool.map_async(cal_word_num,work_list)
    pool.close()
    pool.join()
def 历史热度图2(start,divide):
    global pinlu_path,png_path
    pinlu_path=pinlu_path.replace('频率数据存档','init频率数据存档')
    png_path=png_path.replace('词云存档','init词云存档')
    work_list=周期分配(start,divide)
    jieba.load_userdict('C:\\other_file\\coal_dict.txt')
    lock = Lock()
    pool = Pool(16, initializer=init, initargs=(lock,))
    pool.map_async(cal_word_num2,work_list)
    pool.close()
    pool.join()
    pinlu_path=pinlu_path.replace('init频率数据存档','频率数据存档')
    png_path=png_path.replace('init词云存档','词云存档')
def 周期分配(start,divide):
    tasklist=[]
    file_list=os.listdir(data_path)
    file_list=sorted(file_list)
    start_index=file_list.index(start+'.json')
    while (start_index+divide+1)<=len(file_list):
        start=file_list[start_index]
        tasklist.append(start[:-5]+","+str(divide))
        start_index+=divide
    start=file_list[start_index]
    tasklist.append(start[:-5]+","+str(divide))
    return tasklist
def autolabel(rects):
    for rect in rects:
        height = rect.get_height()
        plt.text(rect.get_x()+rect.get_width()/2.-0.2, 1.03*height, '%s' % int(height))
def last_page(event):
    global Pause,file_list_index
    Pause=False
    if file_list_index>0:
        file_list_index-=1
def next_page(event):
    global Pause,file_list_index,plot_file_list
    Pause=False
    if file_list_index<len(plot_file_list):
        file_list_index+=1
def data_flow(pinlu_path):
    file_list=os.listdir(pinlu_path)
    file_list=sorted(file_list)
    plt.ion()
    fig=plt.figure(figsize=(16, 8))
    fig.tight_layout()                                             # reduce the surrounding whitespace
    plt.subplots_adjust(left=0.06,right=0.97,top=0.95,bottom=0.1)  # adjust subplot margins
    global file_list_index,plot_file_list,Pause
    plot_file_list =file_list
    color_all={}
    color_list=list(cnames.keys())
    df=pd.read_csv(pinlu_path+file_list[0],encoding='utf-8')[:30]
    random.seed(10)
    max_index=-1
    while(file_list_index<len(file_list)):
        new_dict={}
        old_dict={}
        color_list_now=[]
        Pause=True
        df1=pd.read_csv(pinlu_path+file_list[file_list_index],encoding='utf-8')[:30]
        # words present in both periods: just update their counts
        same_list=list(set(list(df['word'])).intersection(set(list(df1['word']))))
        for i in same_list:
            df['num'].iat[list(df[df['word']==i].index)[0]]=df1['num'].iat[list(df1[df1['word']==i].index)[0]]
        # words only in the new period / words only in the old period
        new_list=list(set(list(df1['word'])).difference(set(list(df['word']))))
        old_list=list(set(list(df['word'])).difference(set(list(df1['word']))))
        # turn both into {word: count} dictionaries
        for i in new_list:
            new_dict[i]=df1['num'].iat[list(df1[df1['word']==i].index)[0]]
        for i in old_list:
            old_dict[i]=df['num'].iat[list(df[df['word']==i].index)[0]]
        # replace the lowest-ranked old words with the highest-ranked new ones
        while(len(new_dict)>0 and len(old_dict)>0 and max(new_dict.values())>min(old_dict.values())):
            index_old=list(df[df['word']==min(zip(old_dict.values(), old_dict.keys()))[1]].index)[0]
            index_new=list(df1[df1['word']==max(zip(new_dict.values(), new_dict.keys()))[1]].index)[0]
            df['word'][index_old]=df1['word'][index_new]
            df['num'][index_old]=df1['num'][index_new]
            new_dict.pop(max(zip(new_dict.values(), new_dict.keys()))[1])
            old_dict.pop(min(zip(old_dict.values(), old_dict.keys()))[1])
        # assign each new word a fixed colour so it can be tracked between frames
        if max_index<file_list_index:
            max_index=file_list_index
            for i in range(len(df)):
                if df['word'].iat[i] not in color_all.keys():
                    index=random.randint(0,len(color_list)-1)
                    color_all[df['word'].iat[i]]=color_list.pop(index)
        # build the colour list for the current frame
        for i in range(len(df)):
            color_list_now.append(color_all[df['word'].iat[i]])
        # values to plot
        name_list =df['word']
        num_list = df['num']
        plt.clf()
        # bar chart
        a=plt.bar(range(len(num_list)), num_list,color=color_list_now,tick_label=name_list)
        # chart title
        plt.title(file_list[file_list_index][:-4])
        # rotate the x-axis labels
        pl.xticks(rotation=45)
        # value labels above the bars
        autolabel(a)
        plt.plot(name_list, num_list, "r", marker='*', ms=10, label="a")
        plt.xticks(rotation=45)
        plt.legend(loc="upper left")
        # previous/next buttons
        axnext = plt.axes([0.9, 0.65, 0.07, 0.05])
        bnext = Button(axnext, 'Next')
        bnext.on_clicked(next_page)
        axprev = plt.axes([0.9, 0.7, 0.07, 0.05])
        bprev = Button(axprev, 'Previous')
        bprev.on_clicked(last_page)
        plt.show()  # note: this call is required
        # keep a reference to each button so it stays responsive
        axnext._button = bnext
        axprev._button = bprev
        while Pause:
            plt.pause(0.01)
if __name__ == '__main__':
    历史热度图('2020-02-11',7)
    历史热度图('2020-01-17',30)
    历史热度图2('2020-02-11',7)
    历史热度图2('2020-01-17',30)
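Note that data_flow, which actually draws the interactive chart, is never called in the __main__ block above. A minimal sketch of how it could be invoked, assuming 历史热度图('2020-02-11',7) has already written the 7-day frequency files:
# Hypothetical follow-up, not in the original __main__: browse the 7-day frequency archive interactively.
data_flow(pinlu_path[:-1] + str(7) + '\\')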
Function and variable notes
# Functions
def delete_singal_word(text,word_list)
# Removes every character in word_list from text; this is faster than filtering them out with the stop-word list.
def mkdir(path)
# Creates the output directories for the frequency files and word clouds so the files do not end up in a mess (mine already are).
def cal_word_num(start_date,init_word=False,draw_wordclouds=True)
# Computes and writes the word frequencies. start_date is a single string 'start date,days' (zero-pad single-digit months and days), e.g. '2019-03-05,10' covers 2019-03-05 plus the following 10 days of data (roughly up to 2019-03-15). Packing both values into one string avoids passing extra arguments between processes. A usage sketch follows this list.
def cal_word_num2(start_date,draw_wordclouds=True)
# Computes and writes the word frequencies in the same 'start date,days' format, but only counts the custom vocabulary; the vocabulary path is held in the global variable init_word.
def draw(start_date,date_remain)
# Draws the word cloud; start_date is the (zero-padded) start date, date_remain the number of days. The frequency file must be computed first.
def 历史热度图(start,divide)
# Distributes the frequency tasks across a multiprocessing pool.
def 历史热度图2(start,divide)
# Distributes the frequency tasks across a multiprocessing pool, using the custom vocabulary.
def 周期分配(start,divide)
# Builds the task list starting from start, in steps of divide days, up to the latest date in the data set.
def autolabel(rects)
# Writes the value above each bar in the matplotlib chart.
def last_page(event)
# "Previous" button callback of the interactive chart.
def next_page(event)
# "Next" button callback of the interactive chart.
def data_flow(pinlu_path)
# Draws the interactive bar chart over the frequency files.
# Variables
cnames
# matplotlib colour table. matplotlib has no transition animation when the chart switches periods, so every word keeps its own unique colour, which makes the changes easier to follow.
init_word='基金分类.txt'
# The custom target vocabulary.
data_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\data\\'
# Directory where the news data is stored.
pinlu_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\频率数据存档\\'
# Output directory for the final frequency statistics.
png_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\词云存档\\'
# Output directory for the word clouds.
work_path='C:\\Users\\13756\\OneDrive - integrate collaborative models\\量化1\\news_analyse\\'
# Project directory.
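For reference, a minimal single-process sketch of the calling convention described above; it is not part of the original script and assumes data_path already contains date-named JSON files such as 2020-02-11.json:
# Sketch only: run one 7-day period in the current process instead of the Pool.
task = '2020-02-11' + ',' + str(7)          # 'start date,days' task string, dates zero-padded
cal_word_num(task, draw_wordclouds=True)    # writes the word,num frequency file and the word cloud
print(周期分配('2020-02-11', 7))             # the full task list that the Pool would normally receive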
Summary
1. Due to time constraints this can only be called a half-finished product; it is posted here mainly to share an approach, sorry about that.
2. The functions' inputs and outputs and the file management leave a lot of room for optimisation.
3. The program is not very flexible or robust.
4. The custom vocabulary is loaded in an awkward place.
Related file download: https://pan.baidu.com/s/1SfSai_Yb47KxXBejxi3NJw
Extraction code: 8zj6