TF-IDF and scikit-learn LDA code: topic clustering, tagging each sentence with a topic label
# -*- coding: utf-8 -*-
import jieba

# Force jieba to keep these proper nouns (character and place names) as single tokens
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('京州', True)
jieba.suggest_freq('桓温', True)
# Segment each file with jieba, save the space-joined result, and keep it in memory
cors = []
files = ['1.txt', '2.txt', '3.txt']
for i in files:
    with open(i, 'r', encoding='utf-8') as f:
        tmp = f.read()
    with open("trans" + i, 'w', encoding='utf-8') as f:
        document_cut = jieba.cut(tmp)
        result = ' '.join(document_cut)  # one space-separated token string per file
        cors.append(result)
        f.write(result)
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [cors[0], cors[1]]  # demonstrate TF-IDF on the first two documents
vector = TfidfVectorizer()
tfidf = vector.fit_transform(corpus)
print(tfidf)  # sparse (doc, term) -> weight entries
wordlist = vector.get_feature_names_out()  # all terms in the bag-of-words vocabulary (get_feature_names() on older scikit-learn)
# tf-idf matrix: element a[i][j] is the tf-idf weight of term j in document i
weightlist = tfidf.toarray()
# Print the tf-idf term weights of each document: the outer loop walks the
# documents, the inner loop walks the term weights within one document
for i in range(len(weightlist)):
    print("------- tf-idf term weights for document", i, "-------")
    for j in range(len(wordlist)):
        print(wordlist[j], weightlist[i][j])
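# Printing every (term, weight) pair is verbose. A minimal sketch to show only
# each document's strongest terms (numpy and the TOP_N cutoff are additions
# here, not part of the original post):
import numpy as np

TOP_N = 5  # hypothetical cutoff, adjust as needed
for i, row in enumerate(weightlist):
    top = np.argsort(row)[::-1][:TOP_N]  # indices of the largest tf-idf weights
    print("document", i, "top terms:", [(wordlist[j], round(float(row[j]), 3)) for j in top])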
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

corpus = [cors[0], cors[1], cors[2]]  # all three segmented documents
cntVector = CountVectorizer()  # a stop-word list could be passed via stop_words=stpwrdlst
cntTf = cntVector.fit_transform(corpus)
print(cntTf)  # sparse term-count matrix
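# A sketch of the stop-word filtering hinted at above, assuming a
# one-word-per-line file 'stopwords.txt' (a hypothetical path, not part of
# the original post); left commented out so the script behaves as published:
# with open('stopwords.txt', encoding='utf-8') as f:
#     stpwrdlst = [line.strip() for line in f if line.strip()]
# cntVector = CountVectorizer(stop_words=stpwrdlst)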
lda = LatentDirichletAllocation(n_components=2,  # renamed from n_topics in scikit-learn 0.19
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
docres = lda.fit_transform(cntTf)  # document-topic distribution, shape (3, 2)
print(len(lda.components_[1]))  # vocabulary size, 98 for this corpus
print(docres)  # each row: one document's weights over the 2 topics
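# The title promises a topic label for each text. The rows of docres are
# document-topic distributions, so a minimal sketch (argmax per row, plus the
# top words per topic; an addition, not in the original post) assigns labels:
import numpy as np

terms = cntVector.get_feature_names_out()  # vocabulary of the count model
for k, topic in enumerate(lda.components_):
    top = np.argsort(topic)[::-1][:5]  # 5 highest-scoring terms for topic k
    print("topic", k, ":", [terms[j] for j in top])

labels = np.argmax(docres, axis=1)  # dominant topic index per document
for fname, label in zip(files, labels):
    print(fname, "-> topic", label)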