
TF-IDF and scikit-learn LDA code: topic clustering, labeling each sentence with a topic

# -*- coding: utf-8 -*-

import jieba

# Force jieba to treat these proper nouns as single tokens during segmentation.
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('京州', True)
jieba.suggest_freq('桓温', True)
cors = []
files = ['1.txt', '2.txt', '3.txt']
for i in files:
    with open(i, 'r', encoding='utf-8') as f:
        tmp = f.read()
    # Segment the document, keep the space-joined result in memory,
    # and also save it to "trans<name>" for inspection.
    document_cut = jieba.cut(tmp)
    result = ' '.join(document_cut)
    cors.append(result)
    with open("trans" + i, 'w', encoding='utf-8') as f:
        f.write(result)

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the first two segmented documents.
corpus = [cors[0], cors[1]]
vector = TfidfVectorizer()
tfidf = vector.fit_transform(corpus)
print(tfidf)  # sparse (doc_index, word_index) -> weight triples



wordlist = vector.get_feature_names_out()  # all words in the bag-of-words vocabulary (get_feature_names() in scikit-learn < 1.0)
# TF-IDF matrix: element a[i][j] is the weight of word j in document i.
weightlist = tfidf.toarray()
# Print the TF-IDF weight of every word for each document: the outer loop
# iterates over documents, the inner loop over the vocabulary.
for i in range(len(weightlist)):
    print("------- TF-IDF word weights for document", i, "-------")
    for j in range(len(wordlist)):
        print(wordlist[j], weightlist[i][j])
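
The full vocabulary dump above is noisy for anything but a toy corpus. A small follow-up sketch, not in the original post, that keeps only the five highest-weighted words per document (top_k is an illustrative choice):

import numpy as np

top_k = 5  # illustrative cutoff, not from the original post
for i, row in enumerate(weightlist):
    top_idx = np.argsort(row)[::-1][:top_k]  # indices of the largest weights
    print("document", i, "top words:",
          [(wordlist[j], round(float(row[j]), 4)) for j in top_idx])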



from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# LDA works on raw term counts, so use CountVectorizer on all three documents
# (a stop-word list can be passed via stop_words=stpwrdlst, see the sketch below).
corpus = [cors[0], cors[1], cors[2]]
cntVector = CountVectorizer()
cntTf = cntVector.fit_transform(corpus)
print(cntTf)
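
The stop_words=stpwrdlst hint above assumes a stop-word list has already been built. A minimal sketch, assuming a hypothetical UTF-8 file 'stopwords.txt' with one word per line:

with open('stopwords.txt', 'r', encoding='utf-8') as f:  # hypothetical file name
    stpwrdlst = [line.strip() for line in f if line.strip()]
cntVector = CountVectorizer(stop_words=stpwrdlst)  # drop stop words from the counts
cntTf = cntVector.fit_transform(corpus)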

lda = LatentDirichletAllocation(n_components=2,  # was n_topics in scikit-learn < 0.19
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
docres = lda.fit_transform(cntTf)  # document-topic distribution, shape (3, 2)

print(len(lda.components_[1]))  # vocabulary size; 98 in the original run
print(docres)                   # per-document topic proportions
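
The title promises a topic label per document, and lda.fit_transform already returns the information needed: each row of docres is that document's topic distribution. A minimal sketch of the labeling step (argmax over docres; top_n is an illustrative choice, not from the original post):

import numpy as np

labels = docres.argmax(axis=1)  # most probable topic index per document
for doc_id, topic in enumerate(labels):
    print("document", doc_id, "-> topic", topic)

# Show the top-weighted words of each topic to make the labels interpretable.
feature_names = cntVector.get_feature_names_out()
top_n = 10  # illustrative cutoff
for topic_id, weights in enumerate(lda.components_):
    top_words = [feature_names[j] for j in weights.argsort()[::-1][:top_n]]
    print("topic", topic_id, ":", " ".join(top_words))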