Simple Data Processing (2): Python Preprocessing to Generate Word-Frequency Charts and Word Clouds
For the data source, see Simple Data Processing (1) on my blog.
I. Preprocessing the converted results
1. First, remove the unusable characters from each line, for example punctuation such as "",~@#$%^&*(){}+=-… as well as digits [0-9].
import re

with open("result.txt") as f:
    for line in f:
        # Strip digits, whitespace, and common ASCII/fullwidth punctuation.
        # '-' is escaped as '\-' so it is not read as a character range.
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)
        print(line)
Partial results after processing:
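To make the pattern's effect concrete, here is a minimal sketch on an invented sample line (the sentence is made up for illustration, not taken from the data set):

import re

# The same cleaning pattern as above, applied to a made-up sample line.
pattern = u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+"
sample = "2020年!Python 数据处理:真好用……(123)"
print(re.sub(pattern, "", sample))  # -> 年Python数据处理真好用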
2. Remove newline characters, and skip lines that are empty after the previous step.
import re

with open("result.txt") as f:
    for line in f:
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)  # strip unwanted characters
        if line == "": continue
        line = line.replace("\n", "")  # drop the newline
        print(line)
Partial results after processing:
3. Segment the cleaned lines with jieba, line by line.
Install jieba: pip install jieba
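Before wiring jieba into the pipeline, a quick sketch of its two segmentation modes may help (the sample sentence is the one from jieba's own README); the precise mode, cut_all=False, is what the code below uses:

import jieba

text = "我来到北京清华大学"
print("/".join(jieba.cut(text, cut_all=True)))   # full mode: lists every possible word
print("/".join(jieba.cut(text, cut_all=False)))  # precise mode: one best segmentation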
import re
import jieba

words_str = ""
with open("result.txt") as f:
    for line in f:
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)  # strip unwanted characters
        if line == "": continue
        line = line.replace("\n", "")  # drop the newline
        seg = " ".join(jieba.cut(line, cut_all=False))  # precise mode
        words_str += seg + " "  # trailing space keeps words from adjacent lines apart
        print(seg)
Partial results:
The output still contains redundant words and single characters such as "的" and "大", so the next step is to remove stopwords and drop single-character tokens.
The stopword list has been uploaded to Baidu Pan: https://pan.baidu.com/s/1x0SQxXiB_ZG-Nq4hMntKGA (password: web6)
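For reference, the file is plain UTF-8 text with one stopword per line (that is how the loader below reads it); a minimal sanity check after downloading:

# Peek at the first few stopwords to confirm the file loaded correctly.
with open('stopwords.txt', encoding='UTF-8') as f:
    stopwords = [line.strip() for line in f]
print(len(stopwords), stopwords[:10])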
import re
import jieba

def stopwordslist():
    # Load the stopword list: plain text, one word per line.
    with open('stopwords.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]

words_str = ""
with open("result.txt") as f:
    for line in f:
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)  # strip unwanted characters
        if line == "": continue
        line = line.replace("\n", "")  # drop the newline
        seg_list = jieba.cut(line, cut_all=False)
        words_str += " ".join(seg_list) + " "
stopwords = stopwordslist()
words = [word for word in words_str.split(" ") if word not in stopwords and len(word) > 1]  # drop stopwords and single characters
print(words)
After this step there are 7,748 words in total. Partial results:
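One small performance note: "word not in stopwords" scans a list on every token. Converting the list to a set makes each lookup O(1) on average; a minimal tweak using the same names as above:

stopwords = set(stopwordslist())  # set membership tests are O(1) on average
words = [w for w in words_str.split(" ") if w not in stopwords and len(w) > 1]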
4. Count word frequencies
We use collections.Counter from Python's standard library for the counting.
import re
import jieba
from collections import Counter

def stopwordslist():
    with open('stopwords.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]

words_str = ""
with open("result.txt") as f:
    for line in f:
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)  # strip unwanted characters
        if line == "": continue
        line = line.replace("\n", "")  # drop the newline
        seg_list = jieba.cut(line, cut_all=False)
        words_str += " ".join(seg_list) + " "
stopwords = stopwordslist()
words = [word for word in words_str.split(" ") if word not in stopwords and len(word) > 1]
word_counts = Counter()  # word-frequency counting
for x in words:
    word_counts[x] += 1
print(len(word_counts), word_counts.most_common(10))  # distinct-word count and top entries
Partial frequency results (2,709 distinct words):
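Incidentally, Counter can consume the iterable directly, so the explicit loop above collapses to a one-liner with the same result (same names as above):

from collections import Counter

word_counts = Counter(words)       # equivalent to the manual += loop
print(word_counts.most_common(3))  # top three (word, count) pairs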
5. Generate the word-frequency chart
We draw the chart with matplotlib, a widely used Python plotting library.
Install matplotlib: pip install matplotlib
import re
import jieba
import matplotlib.pyplot as plt
from collections import Counter

plt.rcParams['font.sans-serif'] = [u'SimHei']  # needed so Chinese tick labels render
plt.rcParams['axes.unicode_minus'] = False

def stopwordslist():
    with open('stopwords.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]

def get_plt(data, title):
    # data is a list of (word, count) pairs, e.g. from Counter.most_common().
    x = [i[0] for i in data]
    y = [i[1] for i in data]
    fig, ax = plt.subplots()
    ax.barh(range(len(x)), y, color='gold')
    ax.set_yticks(range(len(x)))
    ax.set_yticklabels(x)
    plt.title(title, fontsize=10)
    plt.ylabel("Word")
    plt.xlabel("Count")
    plt.show()

words_str = ""
with open("result.txt") as f:
    for line in f:
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)  # strip unwanted characters
        if line == "": continue
        line = line.replace("\n", "")  # drop the newline
        seg_list = jieba.cut(line, cut_all=False)
        words_str += " ".join(seg_list) + " "
stopwords = stopwordslist()
words = [word for word in words_str.split(" ") if word not in stopwords and len(word) > 1]
word_counts = Counter()  # word-frequency counting
for x in words:
    word_counts[x] += 1
get_plt(word_counts.most_common(30), "Top 30 word frequencies")  # draw the chart
Result:
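If you also want the chart on disk rather than only on screen, matplotlib can write it out; a small optional addition, placed inside get_plt just before plt.show() (the file name is arbitrary):

import matplotlib.pyplot as plt

# Inside get_plt, before plt.show():
plt.savefig("word_freq_top30.png", dpi=150, bbox_inches="tight")  # write the chart to a PNG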
6. Generate the word cloud
We use the wordcloud package to build the cloud.
Install wordcloud: pip install wordcloud
First pick a nice background image; any picture will do.
Full code:
import re
import numpy as np
import matplotlib.pyplot as plt
import jieba
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
from collections import Counter

plt.rcParams['font.sans-serif'] = [u'SimHei']  # needed so Chinese tick labels render
plt.rcParams['axes.unicode_minus'] = False

def stopwordslist():
    with open('stopwords.txt', encoding='UTF-8') as f:
        return [line.strip() for line in f]

def get_plt(data, title):
    # data is a list of (word, count) pairs, e.g. from Counter.most_common().
    x = [i[0] for i in data]
    y = [i[1] for i in data]
    fig, ax = plt.subplots()
    ax.barh(range(len(x)), y, color='gold')
    ax.set_yticks(range(len(x)))
    ax.set_yticklabels(x)
    plt.title(title, fontsize=10)
    plt.ylabel("Word")
    plt.xlabel("Count")
    plt.show()

def _wordcloud(word_counts):  # word-cloud generation
    mask = np.array(Image.open(r'./backgroup.png'))  # the background image defines the cloud's shape
    # font_path below is macOS-specific; point it at any font that covers Chinese on your system.
    wc = WordCloud(font_path='/System/Library/Fonts/Hiragino Sans GB.ttc', max_words=2000,
                   mask=mask, repeat=False, mode='RGBA')
    wc.generate_from_frequencies(word_counts)  # Counter works here: it is a dict of word -> count
    image_colors = ImageColorGenerator(mask)  # optional: color the words from the background image
    wc.recolor(color_func=image_colors)
    wc.to_file("./wc_result.png")
    plt.imshow(wc)
    plt.axis('off')
    plt.show()

words_str = ""
with open("result.txt") as f:
    for line in f:
        line = re.sub(u"[0-9\s+.!/,$%^*()?;;:\-【】+\"\']+|[+——!,;:。?、~@#￥%……&*()><-]+", "", line)  # strip unwanted characters
        if line == "": continue
        line = line.replace("\n", "")  # drop the newline
        seg_list = jieba.cut(line, cut_all=False)
        words_str += " ".join(seg_list) + " "
stopwords = stopwordslist()
words = [word for word in words_str.split(" ") if word not in stopwords and len(word) > 1]
word_counts = Counter()  # word-frequency counting
for x in words:
    word_counts[x] += 1
get_plt(word_counts.most_common(30), "Top 30 word frequencies")  # frequency chart
_wordcloud(word_counts)  # word cloud
Result:
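One optional tweak: because the cloud is created with mode='RGBA', passing background_color=None to WordCloud yields a transparent background instead of the default black. A sketch of the changed constructor call, using the same file names as above:

import numpy as np
from PIL import Image
from wordcloud import WordCloud

mask = np.array(Image.open(r'./backgroup.png'))
wc = WordCloud(font_path='/System/Library/Fonts/Hiragino Sans GB.ttc', max_words=2000,
               mask=mask, repeat=False, mode='RGBA', background_color=None)  # transparent background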
That completes this simple data-processing walkthrough; feel free to leave a comment if anything is unclear.