NLP训练一个可以寻找相似度最匹配的句子的模型(LSI、LDA、TFIDF)
程序员文章站
2022-05-19 13:16:49
...
全套代码,不多解释,即插即用~
英文句子预处理模块
# 英文句子处理模块
# English sentence pre-processing setup.
from nltk.corpus import stopwords as pw
import sys
import re

# NLTK English stopword list. Stored as a set: the token filter in
# English_processing tests membership for every word, and set lookup is
# O(1) versus O(n) for the original list.
cacheStopWords = set(pw.words("english"))
# Characters removed from input sentences (note: includes a literal backslash;
# the original "\;" / "\{" escapes resolved to backslash + char anyway).
_STRIP_CHARS = "“”!?.\\;',()<>{}/-1234567890$&#%~"
# One translation table built once: a single C-level pass replaces the
# original loop of 30+ full-string .replace() calls (which also re-ran
# .lower() on every iteration).
_STRIP_TABLE = str.maketrans(_STRIP_CHARS, " " * len(_STRIP_CHARS))


def English_processing(sentence, stopwords=None):
    """Lower-case *sentence*, strip punctuation/digits, and drop stopwords.

    Parameters
    ----------
    sentence : str
        Raw English sentence. Falsy input (None, "") is returned unchanged,
        matching the original behaviour.
    stopwords : collection of str, optional
        Words to filter out. Defaults to the module-level NLTK English
        stopword list ``cacheStopWords`` (backward-compatible; the parameter
        only generalizes the original hard-coded source).

    Returns
    -------
    str
        Surviving tokens each followed by a single space (so the result has
        a trailing space, exactly like the historical output), or '' when
        every token is filtered out.
    """
    if not sentence:
        return sentence
    if stopwords is None:
        stopwords = cacheStopWords
    cleaned = sentence.lower().translate(_STRIP_TABLE)
    # Single pass merges the original two filter passes (stopwords, then the
    # special junk tokens left over from scraped HTML such as 'br').
    kept = [w for w in cleaned.split()
            if w not in stopwords and w not in ('br', 'w', 'b', 'bc')]
    return ''.join(w + ' ' for w in kept)
相似度模型训练—评估
import gc
import tqdm
import numpy as np
from gensim import corpora, models, similarities
from collections import defaultdict
import time
class SentenceSimilarity():
    """Find the corpus sentence most similar to a query (TFIDF / LSI / LDA).

    Typical usage::

        sim = SentenceSimilarity(train_data, min_frequency=1)
        sim.LsiModel()                 # or TfidfModel() / LdaModel()
        index, score = sim.similarity(query)
    """

    def __init__(self, sentences, min_frequency=1):
        # Pre-process every corpus sentence once, up front.
        self.sentences = [English_processing(s) for s in sentences]
        self.sentences_num = len(self.sentences)
        # NOTE(review): tokens are kept only when their corpus frequency is
        # STRICTLY GREATER than this value (see simple_model), so the default
        # of 1 drops hapax words — the name reads like ">=" but the code is ">".
        self.min_frequency = min_frequency

    # Return the corpus as lists of tokens.
    def get_cuted_sentences(self):
        return [sentence.strip().split() for sentence in self.sentences]

    # Build the dictionary and bag-of-words corpus shared by every model.
    def simple_model(self):
        self.texts = self.get_cuted_sentences()
        # Count token frequencies, then drop low-frequency tokens.
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [[token for token in text if frequency[token] > self.min_frequency]
                      for text in self.texts]
        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    # TF-IDF model: transform the BOW corpus and index it for similarity search.
    def TfidfModel(self):
        self.simple_model()
        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LSI model (gensim default topic count, built directly on raw BOW —
    # TODO confirm this is intended; LSI is more commonly fed TF-IDF vectors).
    def LsiModel(self):
        self.simple_model()
        self.model = models.LsiModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    # LDA model, same pipeline shape as TfidfModel/LsiModel.
    def LdaModel(self):
        self.simple_model()
        self.model = models.LdaModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    # Pre-process a query sentence and map it into the trained model's space.
    def sentence2vec(self, sentence):
        sentence = English_processing(sentence)
        vec_bow = self.dictionary.doc2bow(sentence.strip().split())
        return self.model[vec_bow]

    # Return the transformed corpus as dense numpy vectors (one per sentence).
    def bow2vec(self):
        vec = []
        length = max(self.dictionary) + 1
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for token_id, weight in content:
                # Place each token's model weight at its dictionary id.
                sentence_vectors[token_id] = weight
            vec.append(sentence_vectors)
        return vec

    # Return (index, score) of the corpus sentence most similar to *sentence*.
    def similarity(self, sentence):
        sims = self.index[self.sentence2vec(sentence)]
        # (The original also looked up self.sentences[index] into a local that
        # was never used — dead code, removed.)
        index, score = max(enumerate(sims), key=lambda item: item[1])
        return index, score

    # Return the indices and scores of the k most similar corpus sentences.
    def similarity_k(self, sentence, k):
        sentence_vec = self.sentence2vec(sentence)
        t1 = time.time()
        sims = self.index[sentence_vec]
        t2 = time.time()
        # Bug fix: time.time() differences are in seconds, but the label says
        # "ms" — convert before printing.
        print('特征检索耗时:{:.4f}ms, 检索样本总数:{}'.format((t2 - t1) * 1000, self.sentences_num))
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]
        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores
展开使用
训练模型,输入train_data,可更换训练模型
# Train a similarity model on train_data; swap in LdaModel()/TfidfModel()
# below to change the model type.
Similar_model = SentenceSimilarity(train_data,min_frequency = 1)
# NOTE(review): LsiModel() calls simple_model() itself, so this explicit
# call appears redundant — confirm before removing.
Similar_model.simple_model()
Similar_model.LsiModel()
#Similar_model.LdaModel()
#Similar_model.TfidfModel()
预测句子,输入sentence,返回train_data中与其最相似的句子的下标index,以及相似度得分score
index,score = Similar_model.similarity(sentence)
上一篇: java抓取全国城市空气质量jsoup
下一篇: 简单计算空气质量指数