欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

cs224n笔记05-探索词向量

程序员文章站 2022-07-07 19:40:41
1. 加载词向量def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"): """ Load Word2Vec Vectors Param: embeddings_fp (string) - path to .bin file of pretrained word vectors Return: wv_from_bin: All...

1. 加载词向量

def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"):
    """ Load Word2Vec Vectors
        Param:
            embeddings_fp (string) - path to .bin file of pretrained word vectors
                (word2vec binary format; the file must be downloaded separately)
        Return:
            wv_from_bin: the loaded gensim KeyedVectors object. For the default
                GoogleNews file this is described as 3 million embeddings,
                each of length 300.
    """
    print("Loading 3 million word vectors from file...")
    wv_from_bin = KeyedVectors.load_word2vec_format(embeddings_fp, binary=True)
    # gensim >= 4.0 removed ``KeyedVectors.vocab`` (accessing it raises
    # AttributeError) in favour of ``key_to_index``; fall back to ``vocab``
    # so the function keeps working on gensim 3.x as well.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    # Only the size is reported, so count the mapping directly instead of
    # copying every key into a list first.
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
# Load the vectors once at module level (using the default file path) so the
# later analogy query can reuse the same object, then emit a blank line.
wv_from_bin = load_word2vec()
print()

2. 降维

def get_matrix_of_vectors(wv_from_bin, required_words=('barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela')):
    """ Put the word2vec vectors into a matrix M.
        A random sample of at most 10000 vocabulary words is taken, and the
        vectors of `required_words` are added after it.
        Param:
            wv_from_bin: KeyedVectors object; the word2vec vectors loaded from file
            required_words: iterable of words whose vectors must be included in M
                in addition to the random sample. Words that have no vector
                (lookup raises KeyError) are skipped silently.
        Return:
            M: numpy array with one row per collected word, holding that
                word's vector (300 columns for the GoogleNews vectors)
            word2Ind: dictionary mapping each word to its row number in M
        Raises:
            ValueError: raised by np.stack when no vector at all was collected.
    """
    import random

    # gensim >= 4.0 removed ``vocab`` in favour of ``key_to_index`` and
    # deprecated ``word_vec`` in favour of ``get_vector``; prefer the new
    # names and fall back to the old ones so gensim 3.x keeps working.
    vocab = getattr(wv_from_bin, "key_to_index", None)
    if vocab is None:
        vocab = wv_from_bin.vocab
    lookup = getattr(wv_from_bin, "get_vector", None)
    if lookup is None:
        lookup = wv_from_bin.word_vec

    words = list(vocab)
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    for w in [*words, *required_words]:
        # A required word may already be part of the random sample; adding it
        # again would leave an orphan duplicate row in M that word2Ind no
        # longer points to.
        if w in word2Ind:
            continue
        try:
            vec = lookup(w)
        except KeyError:
            continue
        # The next free row index is always the current length of M.
        word2Ind[w] = len(M)
        M.append(vec)
    M = np.stack(M)
    print("Done.")
    return M, word2Ind

3. 单词类比测试

# Analogy test: "man" is to "woman" as "king" is to ? (the expected answer is "queen").
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))

 

 

本文地址:https://blog.csdn.net/z1103757047/article/details/107167967