Computing Text Similarity with Word Vectors (Code Example + Test Data)
Computing the cosine similarity of texts from word vectors
1. Test data:
Link: https://pan.baidu.com/s/19gUVDOYS2-yfH4N6rVUWBA
Extraction code: zfv2
2. Experiment code:
import math
import os
import pandas as pd
import numpy as np
# Compute the cosine similarity of two vectors
def cos_similarity(vec_dim, vector_1, vector_2):  # takes two vectors of dimension vec_dim
    # Dot product of the two vectors
    x = 0
    i = 0
    # vec_dim = len(vector_1)  # vector dimension
    while i < vec_dim:
        x = x + vector_1[i] * vector_2[i]
        i = i + 1
    # Squared norms of the two vectors
    i = 0
    sq_1 = 0
    sq_2 = 0
    while i < vec_dim:
        sq_1 = sq_1 + vector_1[i] * vector_1[i]  # i.e. pow(vector_1[i], 2)
        sq_2 = sq_2 + vector_2[i] * vector_2[i]
        i = i + 1
    result = float(x) / (math.sqrt(sq_1) * math.sqrt(sq_2))
    return result
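The loop above can be sanity-checked against a vectorized numpy version; a minimal sketch, with made-up sample vectors:

import numpy as np

def cos_similarity_np(vector_1, vector_2):
    # Vectorized cosine similarity, equivalent to the hand-rolled loop above
    v1 = np.asarray(vector_1, dtype=float)
    v2 = np.asarray(vector_2, dtype=float)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

# Both versions should agree:
# cos_similarity(3, [1, 2, 3], [4, 5, 6])   -> 0.9746...
# cos_similarity_np([1, 2, 3], [4, 5, 6])   -> 0.9746...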
def get_embeddings(path):
    with open(path, encoding='utf8') as f:
        data = f.read().splitlines()  # read the file as a list of lines, newlines stripped
    row = data[0].split()[0]  # number of embeddings
    col = data[0].split()[1]  # embedding dimension
    dim = int(col)
    i = 1
    embeddings = []
    while i <= int(row):  # was `i < int(row)`, which dropped the last embedding
        item = data[i].split()
        word = item[0]  # the word itself is not used downstream
        embedding = list(map(float, item[1:]))  # float() instead of eval(): safer and faster
        embeddings.append(embedding)
        i += 1
    return embeddings, dim
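get_embeddings assumes a word2vec-style text file: a header line giving the number of vectors and their dimension, followed by one word and its vector per line. An illustrative file (the words and values here are made up):

3 4
apple 0.12 -0.31 0.44 0.08
banana 0.05 0.22 -0.17 0.36
cherry -0.28 0.09 0.13 -0.41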
def find_each(path):
    # Collect the full path of every file in a directory
    path_list = []
    files_dir = os.listdir(path)
    for file in files_dir:
        file_path = os.path.join(path, file)  # portable join instead of the '%s\%s' formatting
        path_list.append(file_path)
    return path_list
def get_sim_matrix(path_1, path_2):  # takes the paths of two embedding files
    # Load the two sets of embeddings
    embeddings_1, vec_dim_1 = get_embeddings(path_1)
    embeddings_2, vec_dim_2 = get_embeddings(path_2)
    # Build the pairwise word-vector similarity matrix
    if vec_dim_1 != vec_dim_2:
        # the original only printed here, leaving `matrix` unbound at the return
        raise ValueError('input error: the dimensions are different')
    matrix = []
    for em_1 in embeddings_1:
        score = []
        for em_2 in embeddings_2:
            cos_sim = cos_similarity(vec_dim_1, em_1, em_2)
            score.append(cos_sim)  # similarity of this embedding in embeddings_1 to each embedding in embeddings_2
        matrix.append(score)
    return matrix
# Convolution layer with a 2*2 receptive field; the argument is the input word-vector matrix
def cnn_folding(dict_vec):
    c = len(dict_vec[0])  # width of the input matrix (was dict_vec[1], which fails on one-row input)
    r = len(dict_vec)  # height of the input matrix
    result = [[0 for col in range(c - 1)] for row in range(r - 1)]  # 2-D list built in pure Python
    for i in range(r - 1):  # slide the window over the whole matrix
        for j in range(c - 1):
            re = (dict_vec[i][j] + dict_vec[i][j + 1] + dict_vec[i + 1][j] +
                  dict_vec[i + 1][j + 1]) / 4  # convolution step; the kernel is effectively [[1,1],[1,1]] / 4
            result[i][j] = re
    return result
# Pooling layer using max-pooling; the argument is the input matrix
def cnn_pooling(dict_pooling):
    c = len(dict_pooling[0])  # was dict_pooling[1]
    r = len(dict_pooling)
    result = [[0 for col in range(c - 1)] for row in range(r - 1)]  # 2-D list built in pure Python
    for i in range(r - 1):
        for j in range(c - 1):
            re = max(dict_pooling[i][j], dict_pooling[i][j + 1], dict_pooling[i + 1][j],
                     dict_pooling[i + 1][j + 1])  # max over a 2*2 window
            result[i][j] = re
    return result
# Connection layer that alternates the convolution and pooling layers
def pooling_folding(matrix):
    res = []
    data_list = matrix
    while 1:  # alternate pooling and convolution until the matrix collapses
        c = len(data_list[0])
        r = len(data_list)
        if c == 1 or r == 1:  # exit before pooling: flatten what is left and stop
            for i in range(len(data_list)):
                for j in data_list[i]:
                    res.append(j)
            break
        pool = cnn_pooling(data_list)  # pooling layer
        if len(pool) == 1 or len(pool[0]) == 1:  # exit before convolution (was pool[1], which fails on one-row input)
            data_list = pool
            for i in range(len(data_list)):
                for j in data_list[i]:
                    res.append(j)
            break
        else:
            fold = cnn_folding(pool)  # convolution layer
            data_list = fold
            # the original also reset `pool` and `fold` to zero matrices here, which had no effect
    return res
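To make the alternation concrete: a 3*3 input is max-pooled to 2*2, average-convolved to 1*1, and finally flattened into the result vector. A minimal sketch with made-up values:

demo = [[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]
print(cnn_pooling(demo))               # [[5, 6], [8, 9]] -- 2*2 max-pooling
print(cnn_folding([[5, 6], [8, 9]]))   # [[7.0]] -- 2*2 averaging convolution
print(pooling_folding(demo))           # [7.0] -- the two alternated until one row/column remains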
jd_path = r'D:\thesis\0811\jd_graph\graph_embeddings'
user_path = r'D:\thesis\0811\user_graph\graph_embeddings'
jd_em_paths = find_each(jd_path)  # embedding file paths under the directory
user_em_paths = find_each(user_path)
job_list = []
sim_lists = []
for jd_file in jd_em_paths:
    sim_dict = {}
    jd_file_name = os.path.basename(jd_file)
    jd_name = jd_file_name.split('.')[0]  # JD sub-type name
    job_list.append(jd_name)
    for user_file in user_em_paths:
        sim_matrix = get_sim_matrix(jd_file, user_file)  # rows: job embeddings; columns: user embeddings; values: pairwise similarities
        sim_res = pooling_folding(sim_matrix)  # pass through the convolution and pooling layers, then flatten
        sim_score = sum(sim_res) / len(sim_res)  # average of the flattened values
        user_file_name = os.path.basename(user_file)
        user_name = user_file_name.split('.')[0]  # user id
        sim_dict.update({user_name: sim_score})  # similarity between this job and each user
    sim_list = sorted(zip(sim_dict.values(), sim_dict.keys()), reverse=True)  # descending by score
    sim_list = sim_list[:100]  # keep the top 100
    sim_lists.append(sim_list)
df = pd.DataFrame()
df['jd_sub_type'] = job_list
df['sim_users'] = sim_lists
df.to_csv("../data/jd_user_sim_2.csv", encoding="utf8", index=None, header=True)  # write each job's similarity to every user
# df = pd.read_csv("../data/jd_user_sim.csv", encoding='utf8', header=0)  # read the file back
print('end')
3. Notes
The code computes pairwise similarities between the text embeddings stored in two folders. The test data provides only two embedding files, so adjust the paths accordingly before running.
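For just the two provided test files, the pipeline reduces to a few lines; a minimal sketch (the paths are placeholders for wherever the downloaded files are saved):

jd_file = r'./test_data/jd_embedding.txt'      # placeholder: first test embedding file
user_file = r'./test_data/user_embedding.txt'  # placeholder: second test embedding file
sim_matrix = get_sim_matrix(jd_file, user_file)
sim_res = pooling_folding(sim_matrix)
print(sum(sim_res) / len(sim_res))  # similarity score between the two texts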
Reference: https://blog.csdn.net/Mr_carry/article/details/80996454 (detailed walkthrough of the core code).