欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  科技

基于词向量计算文本相似度(代码实例+测试数据)

程序员文章站 2022-03-15 15:17:55
基于词向量计算文本的余弦相似度。1. 测试数据:链接:https://pan.baidu.com/s/19gUVDOYS2-yfH4N6rVUWBA 提取码:zfv2。2. 实验代码见下文。

基于词向量计算文本相似度

1.测试数据:

链接:https://pan.baidu.com/s/19gUVDOYS2-yfH4N6rVUWBA
提取码:zfv2

2.实验代码:

import math
import os
import pandas as pd
import numpy as np


# 计算两个向量的余弦相似度
def cos_similarity(vec_dim, vector_1, vector_2):
    """Return the cosine similarity between the first ``vec_dim`` components
    of two vectors.

    Args:
        vec_dim: number of dimensions to compare.
        vector_1, vector_2: numeric sequences with at least ``vec_dim`` items.

    Returns:
        float: dot(v1, v2) / (|v1| * |v2|), in [-1, 1] for real inputs.

    Raises:
        ZeroDivisionError: if either vector has zero norm (same as the
        original implementation).
    """
    # Dot product and squared norms in one idiomatic pass each, replacing
    # the original manual while-loops.
    dot = sum(vector_1[i] * vector_2[i] for i in range(vec_dim))
    norm_1 = math.sqrt(sum(vector_1[i] ** 2 for i in range(vec_dim)))
    norm_2 = math.sqrt(sum(vector_2[i] ** 2 for i in range(vec_dim)))
    return float(dot) / (norm_1 * norm_2)


def get_embeddings(path):
    """Load embeddings from a word2vec-style text file.

    The first line of the file is ``"<count> <dim>"``; each subsequent line
    is ``"<word> <v1> ... <v_dim>"`` separated by single spaces.

    Args:
        path: path to the embedding file (UTF-8 text).

    Returns:
        tuple: (embeddings, dim) where ``embeddings`` is a list of float
        vectors and ``dim`` is the declared vector dimension. The word
        tokens themselves are discarded (only the vectors are kept).
    """
    with open(path, encoding='utf8') as f:
        # Read all lines, stripping line endings.
        data = f.read().splitlines()

    header = data[0].split()
    row = int(header[0])  # number of embedding rows declared in the header
    dim = int(header[1])  # embedding dimension

    embeddings = []
    # Vector lines are data[1] .. data[row] inclusive. The original
    # `while i < row` loop starting at i = 1 dropped the LAST row
    # (off-by-one bug).
    for line in data[1:row + 1]:
        item = line.split(' ')
        # item[0] is the word; parse the rest with float() — the original
        # used eval(), which is a code-injection risk on file contents.
        embeddings.append([float(v) for v in item[1:]])
    return embeddings, dim


def find_each(path):
    """Return the full paths of all entries directly under ``path``.

    Args:
        path: directory to list.

    Returns:
        list[str]: one joined path per directory entry, in os.listdir order.

    Note:
        The original built paths with a literal ``'\\'`` separator
        (``'%s\\%s'``), which only works on Windows; ``os.path.join`` uses
        the platform-appropriate separator.
    """
    return [os.path.join(path, name) for name in os.listdir(path)]


def get_sim_matrix(path_1, path_2):
    """Build the pairwise cosine-similarity matrix between two embedding files.

    Args:
        path_1, path_2: paths to word2vec-style embedding files.

    Returns:
        list[list[float]]: matrix[i][j] is the cosine similarity between
        the i-th embedding of file 1 and the j-th embedding of file 2.

    Raises:
        ValueError: if the two files declare different vector dimensions.
        (The original printed a message and then crashed with
        UnboundLocalError because ``matrix`` was never assigned.)
    """
    embeddings_1, vec_dim_1 = get_embeddings(path_1)
    embeddings_2, vec_dim_2 = get_embeddings(path_2)

    if vec_dim_1 != vec_dim_2:
        raise ValueError('input error: the dimensions are different')

    # Rows correspond to embeddings of file 1, columns to file 2.
    return [[cos_similarity(vec_dim_1, em_1, em_2) for em_2 in embeddings_2]
            for em_1 in embeddings_1]


# 卷积层,卷积核的感受野为2*2,参数表示一个输入词向量矩阵
def cnn_folding(dict_vec):
    """Convolution layer: 2x2 averaging kernel ([[1,1],[1,1]] / 4), stride 1.

    Args:
        dict_vec: 2-D list of numbers (r x c), rows of equal length,
            r >= 2 and c >= 2.

    Returns:
        list[list[float]]: (r-1) x (c-1) matrix where each entry is the
        mean of the corresponding 2x2 window of the input.
    """
    r = len(dict_vec)
    # Measure width from row 0 — the original used dict_vec[1], which
    # raises IndexError on a single-row input before the loop even runs.
    c = len(dict_vec[0])
    return [[(dict_vec[i][j] + dict_vec[i][j + 1] +
              dict_vec[i + 1][j] + dict_vec[i + 1][j + 1]) / 4
             for j in range(c - 1)]
            for i in range(r - 1)]


# 池化层,采用max-pooling方式实现池化,参数表示输入矩阵
def cnn_pooling(dict_pooling):
    """Pooling layer: max-pooling over 2x2 windows with stride 1.

    Args:
        dict_pooling: 2-D list of numbers (r x c), rows of equal length,
            r >= 2 and c >= 2.

    Returns:
        list[list]: (r-1) x (c-1) matrix where each entry is the max of the
        corresponding 2x2 window of the input.
    """
    r = len(dict_pooling)
    # Measure width from row 0 — the original used dict_pooling[1], which
    # raises IndexError on a single-row input.
    c = len(dict_pooling[0])
    return [[max(dict_pooling[i][j], dict_pooling[i][j + 1],
                 dict_pooling[i + 1][j], dict_pooling[i + 1][j + 1])
             for j in range(c - 1)]
            for i in range(r - 1)]


# 实现卷积层和池化层的连接层
def pooling_folding(matrix):
    """Fully-connected stage: alternately apply max-pooling and 2x2-average
    convolution until the matrix collapses to a single row or column, then
    flatten the remainder into a list.

    Args:
        matrix: 2-D list of numbers.

    Returns:
        list: the flattened values of the final reduced matrix.
    """
    data_list = matrix
    while True:
        # Already a single row or column: nothing left to reduce.
        if len(data_list) == 1 or len(data_list[0]) == 1:
            break
        pool = cnn_pooling(data_list)  # pooling step
        if len(pool) == 1 or len(pool[0]) == 1:
            # Pooling reduced the matrix to a vector; stop before convolving.
            data_list = pool
            break
        data_list = cnn_folding(pool)  # convolution step
        # NOTE: the original re-initialized `pool` and `fold` to zero
        # matrices here; both were immediately discarded (dead code).
    return [value for row in data_list for value in row]


# --- Script: score every job description (JD) against every user profile ---

# Directories holding one graph-embedding file per JD type / per user.
jd_path = r'D:\thesis\0811\jd_graph\graph_embeddings'
user_path = r'D:\thesis\0811\user_graph\graph_embeddings'
jd_em_paths = find_each(jd_path)      # embedding file paths under the JD directory
user_em_paths = find_each(user_path)  # embedding file paths under the user directory

job_list = []   # JD type names, one per file
sim_lists = []  # per JD: top-100 (score, user) pairs

for jd_file in jd_em_paths:
    # JD type name = file name with its extension stripped.
    job_list.append(os.path.basename(jd_file).split('.')[0])

    sim_dict = {}
    for user_file in user_em_paths:
        # Rows index the JD's embeddings, columns the user's embeddings;
        # each value is the cosine similarity of the two embeddings.
        sim_matrix = get_sim_matrix(jd_file, user_file)
        # Run the similarity matrix through the pooling/convolution stack,
        # then average the flattened result into a single score.
        sim_res = pooling_folding(sim_matrix)
        user_name = os.path.basename(user_file).split('.')[0]  # user id
        sim_dict[user_name] = sum(sim_res) / len(sim_res)

    # Sort (score, user) pairs descending and keep the 100 best matches.
    ranked = sorted(zip(sim_dict.values(), sim_dict.keys()), reverse=True)
    sim_lists.append(ranked[:100])

# One CSV row per JD type: its name and its top-100 (score, user) pairs.
df = pd.DataFrame()
df['jd_sub_type'] = job_list
df['sim_users'] = sim_lists
df.to_csv("../data/jd_user_sim_2.csv", encoding="utf8", index=None, header=True)

print('end')

3.说明

代码实现的是两个文件夹中,文本embedding两两之间的相似度。测试只提供了两个embedding,需要更改合适的路径运行。

参考:https://blog.csdn.net/Mr_carry/article/details/80996454(有核心代码的详解。)