cs224n笔记05-探索词向量
程序员文章站
2022-07-07 19:40:41
1. 加载词向量：def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"): """ Load Word2Vec Vectors Param: embeddings_fp (string) - path to .bin file of pretrained word vectors Return: wv_from_bin: All...
1. 加载词向量
def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"):
    """Load pretrained word2vec vectors from a binary file.

    Param:
        embeddings_fp (string) - path to .bin file of pretrained word vectors
    Return:
        wv_from_bin: the loaded vectors as a gensim KeyedVectors object.
            For the default GoogleNews file the original notes describe
            this as 3 million embeddings, each of length 300.
            KeyedVectors reference:
            https://radimrehurek.com/gensim/models/keyedvectors.html
    """
    print("Loading 3 million word vectors from file...")
    # The .bin file has to be downloaded manually beforehand.
    wv_from_bin = KeyedVectors.load_word2vec_format(embeddings_fp, binary=True)
    # gensim 4.0 removed `.vocab` in favour of `.key_to_index`; support both
    # so the loader works regardless of the installed gensim version.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    # len() of the mapping is enough; no need to copy every key into a list.
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
# Load the pretrained vectors once at module level, using the default path.
wv_from_bin = load_word2vec()
# Print an empty line to separate the loader's output from what follows.
print()
2. 降维（准备工作：先把词向量整理成矩阵 M）
def get_matrix_of_vectors(wv_from_bin, required_words=('barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela'), sample_size=10000):
    """Put a random sample of word2vec vectors, plus required words, into a matrix M.

    Param:
        wv_from_bin: KeyedVectors-like object holding the loaded word2vec
            vectors. Both the gensim < 4 API (``vocab`` / ``word_vec``) and
            the gensim >= 4 API (``key_to_index`` / ``get_vector``) work.
        required_words: iterable of words that are added after the random
            sample; words for which the lookup raises KeyError are skipped.
        sample_size: maximum number of randomly chosen vocabulary words
            (default 10000).
    Return:
        M: numpy array of shape (num words, embedding size), one row per word
        word2Ind: dictionary mapping each word to its row number in M
    Raises:
        ValueError: raised by ``np.stack`` when no vector could be collected.
    """
    import random

    # gensim 4.0 replaced `.vocab` with `.key_to_index` and deprecated
    # `word_vec` in favour of `get_vector`; prefer the new names and fall
    # back to the old ones so either version works.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    get_vector = getattr(wv_from_bin, "get_vector", None)
    if get_vector is None:
        get_vector = wv_from_bin.word_vec

    words = list(vocab)
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:sample_size]
    print("Putting %i words into word2Ind and matrix M..." % len(words))

    word2Ind = {}
    M = []
    # Sampled words first, then the required ones, so required words that
    # were not sampled still end up in the matrix.
    for w in words + list(required_words):
        # A required word may already have been sampled; adding it again
        # would leave a duplicate row in M that no index points to.
        if w in word2Ind:
            continue
        try:
            vec = get_vector(w)
        except KeyError:
            continue
        word2Ind[w] = len(M)
        M.append(vec)
    M = np.stack(M)
    print("Done.")
    return M, word2Ind
3. 单词类比测试
# Analogy test: "man" is to "king" as "woman" is to ? (expected answer: "queen").
# Queries the words most similar to woman + king - man and pretty-prints them.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))
本文地址:https://blog.csdn.net/z1103757047/article/details/107167967