欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

NLP训练一个可以寻找相似度最匹配的句子的模型(LSI、LDA、TFIDF)

程序员文章站 2022-05-19 13:16:49
...

全套代码,不多解释,即插即用~

英文句子预处理模块

# 英文句子处理模块
from nltk.corpus import stopwords as pw
import sys 
import re
cacheStopWords=pw.words("english")

def English_processing(sentence):
    """Normalize an English sentence for similarity modelling.

    Lower-cases the text, replaces punctuation/digit characters with
    spaces, removes English stop words, then removes a few
    dataset-specific artifact tokens (e.g. leftover HTML 'br').

    :param sentence: raw sentence; may be empty or None
    :return: cleaned sentence as a space-separated string with a
             trailing space (kept for backward compatibility), or ''
             for falsy input
    """
    if not sentence:
        # The original implicitly returned None here, which crashed
        # downstream .strip().split() calls; '' is safe for callers.
        return ''

    sentence = sentence.lower()  # normalize case once (was re-lowered per character)

    # Replace punctuation and digits with spaces.
    for ch in "“”!?.\;'',()<>\{}/-1234567890$&#%~":
        sentence = sentence.replace(ch, " ")

    # Drop English stop words (cacheStopWords is module-level, from nltk).
    sentence = ''.join(word + " " for word in sentence.split()
                       if word not in cacheStopWords)

    # Drop corpus-specific artifact tokens.
    sentence = ''.join(word + " " for word in sentence.split()
                       if word not in ('br', 'w', 'b', 'bc'))

    return sentence

相似度模型训练—评估

import gc
import tqdm
import numpy as np
from gensim import corpora, models, similarities
from collections import defaultdict
import time


class SentenceSimilarity():
    """Find the most similar corpus sentence to a query via gensim.

    Typical usage: construct with the training sentences, call one of
    TfidfModel() / LsiModel() / LdaModel() to train, then query with
    similarity() or similarity_k().
    """

    def __init__(self, sentences, min_frequency=1):
        """Preprocess and store the corpus.

        :param sentences: sequence of raw English sentences
        :param min_frequency: tokens whose corpus frequency is <= this
            value are dropped when building the dictionary
        """
        # English_processing is defined earlier in this file.
        self.sentences = [English_processing(s) for s in sentences]
        self.sentences_num = len(self.sentences)
        self.min_frequency = min_frequency

    # Return the corpus as lists of tokens.
    def get_cuted_sentences(self):
        """Tokenize every preprocessed sentence on whitespace."""
        return [sentence.strip().split() for sentence in self.sentences]

    # Dictionary + bag-of-words corpus shared by all concrete models.
    def simple_model(self):
        """Build the dictionary and bag-of-words corpus.

        Side effects: sets self.texts, self.dictionary and
        self.corpus_simple.
        """
        self.texts = self.get_cuted_sentences()

        # Drop low-frequency tokens (frequency must strictly exceed
        # min_frequency to be kept).
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [[token for token in text
                       if frequency[token] > self.min_frequency]
                      for text in self.texts]

        self.dictionary = corpora.Dictionary(self.texts)
        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]

    # Shared tail of the *Model() trainers.
    def _build_index(self, model):
        """Store the trained model, project the corpus, build the index."""
        self.model = model
        self.corpus = self.model[self.corpus_simple]
        self.index = similarities.MatrixSimilarity(self.corpus)

    def TfidfModel(self):
        """Train a TF-IDF model and build its similarity index."""
        self.simple_model()
        self._build_index(models.TfidfModel(self.corpus_simple))

    def LsiModel(self):
        """Train an LSI model and build its similarity index."""
        self.simple_model()
        self._build_index(models.LsiModel(self.corpus_simple))

    def LdaModel(self):
        """Train an LDA model and build its similarity index."""
        self.simple_model()
        self._build_index(models.LdaModel(self.corpus_simple))

    def sentence2vec(self, sentence):
        """Preprocess a query sentence and project it into model space."""
        sentence = English_processing(sentence)
        vec_bow = self.dictionary.doc2bow(sentence.strip().split())
        return self.model[vec_bow]

    def bow2vec(self):
        """Return the model-space corpus as dense numpy vectors."""
        vec = []
        length = max(self.dictionary) + 1
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for term_id, weight in content:
                # Place each term's model weight at its dictionary id.
                sentence_vectors[term_id] = weight
            vec.append(sentence_vectors)
        return vec

    def similarity(self, sentence):
        """Return (index, score) of the corpus sentence most similar
        to `sentence`.
        """
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        # argmax over (index, score) pairs by score.
        index, score = max(enumerate(sims), key=lambda item: item[1])
        return index, score

    def similarity_k(self, sentence, k):
        """Return the indices and scores of the k most similar
        corpus sentences, best first.
        """
        sentence_vec = self.sentence2vec(sentence)
        t1 = time.time()
        sims = self.index[sentence_vec]
        t2 = time.time()
        print('特征检索耗时:{:.4f}ms, 检索样本总数:{}'.format(t2-t1, self.sentences_num))
        top_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]

        indexs = [pair[0] for pair in top_k]
        scores = [pair[1] for pair in top_k]
        return indexs, scores

使用示例

训练模型,输入train_data,可更换训练模型

# Train the similarity model on train_data; swap in LdaModel()/TfidfModel()
# below to change the model type.
Similar_model = SentenceSimilarity(train_data, min_frequency=1)
# NOTE: each *Model() trainer calls simple_model() internally, so the
# previous explicit simple_model() call was redundant and is removed.
Similar_model.LsiModel()
# Similar_model.LdaModel()
# Similar_model.TfidfModel()

预测句子,输入sentence,返回train_data中与其最相似的句子的下标index,以及相似度得分score

# Query: index and similarity score of the train_data sentence most
# similar to `sentence`.
index, score = Similar_model.similarity(sentence)