三、(3)PCA降3维下的K-means聚类可视化
程序员文章站
2024-02-13 19:41:52
...
三、(3)PCA降3维下的K-means聚类可视化
完整代码如下:
# -*- coding: utf-8 -*-
"""
Created on Wed May 15 11:40:27 2019
@author: sun
"""
import codecs
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
def load_corpus(path):
    """Read a pre-segmented corpus file.

    Each line of the file is one document whose tokens are already joined
    by spaces. Leading/trailing whitespace is stripped from every line.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of document strings, one per line of the file.
    """
    with open(path, 'r', encoding='UTF-8') as handle:
        return [line.strip() for line in handle]


def write_tfidf_report(path, words, weights):
    """Dump the vocabulary and the per-document tf-idf weights to a text file.

    The first record is the space-separated vocabulary; every following
    record is one document's weights in vocabulary order. Records are
    terminated by a blank line ('\\r\\n\\r\\n').

    Args:
        path: Output file path (written as UTF-8).
        words: Sequence of vocabulary terms.
        weights: 2-D sequence; weights[i][j] is the tf-idf weight of
            term j in document i.
    """
    # The context manager closes the file even if a write fails.
    with codecs.open(path, 'w', 'utf-8') as result:
        result.write(''.join(term + ' ' for term in words))
        result.write('\r\n\r\n')
        for row in weights:
            result.write(''.join(str(value) + ' ' for value in row))
            result.write('\r\n\r\n')


def group_points_by_label(points, labels, n_clusters):
    """Split 3-D points into per-cluster coordinate lists.

    Args:
        points: Sequence of points; only the first three coordinates of
            each point are used.
        labels: Cluster index of every point, aligned with ``points``.
        n_clusters: Number of clusters to collect.

    Returns:
        A list of ``n_clusters`` tuples ``(xs, ys, zs)``. Points whose label
        falls outside ``0 .. n_clusters - 1`` are ignored.
    """
    groups = [([], [], []) for _ in range(n_clusters)]
    for point, cluster in zip(points, labels):
        if 0 <= cluster < n_clusters:
            xs, ys, zs = groups[cluster]
            xs.append(point[0])
            ys.append(point[1])
            zs.append(point[2])
    return groups


if __name__ == "__main__":
    # Corpus: one document per line, tokens joined by spaces.
    corpus = load_corpus('聚类4类.txt')

    # Term-frequency matrix: element a[i][j] is the count of term j in
    # document i.
    vectorizer = CountVectorizer(min_df=10)
    # Re-weights the term counts into tf-idf values.
    transformer = TfidfTransformer()
    # Inner fit_transform builds the count matrix, outer one computes tf-idf.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # Vocabulary of the bag-of-words model. get_feature_names() was removed
    # in scikit-learn 1.2, so prefer the replacement and fall back on old
    # releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = vectorizer.get_feature_names_out()
    else:
        word = vectorizer.get_feature_names()

    # Dense tf-idf matrix: weight[i][j] is the weight of term j in document i.
    weight = tfidf.toarray()

    resName = "BHTfidf_Result.txt"
    write_tfidf_report(resName, word, weight)

    print('Start Kmeans:')
    from sklearn.cluster import KMeans
    n_clusters = 4  # technology, medicine, cars, countries
    clf = KMeans(n_clusters=n_clusters)
    clf.fit(weight)
    # Cluster index assigned to every document.
    label = list(clf.labels_)

    from sklearn.decomposition import PCA
    pca = PCA(n_components=3)  # project onto three dimensions
    newData = pca.fit_transform(weight)  # input is the N-dimensional matrix

    # Collect the projected coordinates of clusters 0, 1, 2 and 3 separately.
    # The original read an undefined name (y_pred) here; the cluster
    # assignments are the KMeans labels computed above.
    groups = group_points_by_label(newData, label, n_clusters)

    # Importing Axes3D registers the '3d' projection on matplotlib < 3.2;
    # it is harmless on newer releases.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Available single-letter colours include g b y r c m k.
    for (xs, ys, zs), colour in zip(groups, ('m', 'r', 'g', 'c')):
        ax.scatter(xs, ys, zs, c=colour)
    plt.show()
可视化图形如下:
上一篇: 分享:nuxt路由