
Annotated word2vec implementation

import tensorflow as tf
import numpy as np
import collections
import math
import random
import zipfile
import urllib.request
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import os

url = "http://mattmahoney.net/dc/"


# download the data
def maybe_download(filename, expected_bytes):
    # 1. `is False` can be used here to test for falsity (though `if not os.path.exists(...)` is more idiomatic)
    if os.path.exists(filename) is False:
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    # 2. os.stat() performs a stat system call on the given path
    statinfo = os.stat(filename)  # Perform a stat system call on the given path.
    # 3. st_size is the file size in bytes
    if statinfo.st_size == expected_bytes:
        print("Found and verified: " + filename)
    else:
        print(statinfo.st_size)
        raise Exception("Failed to verify: " + filename)
    return filename


filename = maybe_download("text8.zip", 31344016)


def read_data(filename):
    # 13. zipfile handles compression and decompression of the zip format
    # (Class with methods to open, read, write, close, list zip files.)
    with zipfile.ZipFile(filename, 'r') as f:
        # 14. namelist() returns a list of the file names in the archive
        # 15. read(name) returns the contents of that file as bytes
        # 16. tf.compat.as_str() converts any string-like Python input to unicode
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data
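# Quick sanity check of read_data (illustrative; the exact tokens come from the text8 corpus):
#   words = read_data('text8.zip')
#   print(len(words), words[:5])   # roughly 17 million tokens, e.g. ['anarchism', 'originated', 'as', 'a', 'term']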


# data is the list of word IDs for every word in the corpus, count is the list of (word, occurrence count) pairs,
# index_dict maps each word to its ID (the ID is the word's frequency rank)
def build_dataset(words, vocabulary_size):
    count = [['UNK', -1]]
    # 17. Counter: dict subclass for counting hashable items
    #     (a multiset / bag); most_common(n) returns the n highest-frequency items.
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
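    # Minimal sketch of most_common() on toy data (not the real corpus):
    #   collections.Counter("a a a b b c".split()).most_common(2)  ->  [('a', 3), ('b', 2)]
    # so count ends up as [['UNK', -1], (word, freq), ...] ordered by descending frequency.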
    index_dict = {}
    # rank each word by its frequency
    k = 0
    # 18. `for word in count:` would also run; the difference is that word would then be a tuple.
    #     Here the tuple is unpacked directly in the for statement.
    for word, _ in count:
        index_dict[word] = k
        k = k + 1
    # the data list records, for each word in words, that word's rank
    data = []
    unk_count = 0
    # 20. When reading a loop like this, look first at the part that always executes,
    #     here data.append(index), because that is the main effect.
    for word in words:
        # 19. `in` on a dict tests whether the key exists
        if word in index_dict:
            index = index_dict[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    # invert index_dict
    # 21. dict() can be initialized from (key, value) pairs, i.e. a mapping (as its docstring notes),
    #     using dict.values(), dict.keys() and zip()
    reverse_index_dict = dict(zip(index_dict.values(), index_dict.keys()))
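    # Small illustration of the inversion with hypothetical entries:
    #   d = {'UNK': 0, 'the': 1};  dict(zip(d.values(), d.keys()))  ->  {0: 'UNK', 1: 'the'}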
    return data, count, index_dict, reverse_index_dict


# Generate one batch of training samples: batch_size entries of batch[] with the corresponding label[] entries
def generate_batch(batch_size, skip_window, num_skips):
    global data_index
    # 22. assert expression is equivalent to:
    #     if not expression:
    #         raise AssertionError
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # 23. np.ndarray() allocates an (uninitialized) array of the given shape
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # @TODO 0.1 here span is 3, skip_window is 1 and num_skips is 2; not entirely clear yet, see the worked example below
    span = 2 * skip_window + 1
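    # Worked example for the values used below (skip_window=1, num_skips=2): span = 2*1 + 1 = 3,
    # so the sliding window holds [left, center, right]; num_skips is how many (input, label)
    # pairs are produced per center word, here both neighbours. Note that this code stores the
    # context word in batch[] and the center word in labels[], which appears to be the reverse
    # of the standard TF skip-gram tutorial, but each pair feeds the NCE loss just the same.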
    # 24. deque is a double-ended queue; with maxlen set it discards the oldest item when full
    buffer = collections.deque(maxlen=span)
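    # Tiny illustration of maxlen (toy values):
    #   d = collections.deque(maxlen=3); d.extend([1, 2, 3]); d.append(4)  ->  deque([2, 3, 4])
    # which is exactly the sliding-window behaviour relied on below.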

    # fill buffer with the initial window
    # 25. this initial fill is what the loop below slides over; note the % len(data) wrap-around
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # generate samples for each target (center) word
    # 26. This was unclear on a first reading; the comment above is the key point:
    #     num_skips is the number of context words sampled around each target word
    for i in range(batch_size // num_skips):
        target_to_avoid = [skip_window]
        # just an initial value for target
        target = skip_window
        # generate sample pairs for the fixed target word buffer[skip_window]
        # 26.1 the comment above already says this; it was missed on the first pass
        for j in range(num_skips):
            #target = random.randint(0, span - 1)
            while target in target_to_avoid:
                # 27. random.randint(a, b):
                #     Return random integer in range [a, b], including both end points
                #     note that this is a closed interval
                target = random.randint(0, span - 1)
            batch[i * num_skips + j] = buffer[target]
            labels[i * num_skips + j] = buffer[skip_window]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
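# Illustrative call (hyperparameters assumed; the order of the two context words is random):
#   batch, labels = generate_batch(batch_size=8, skip_window=1, num_skips=2)
# For the first center word data[1], batch receives its neighbours data[0] and data[2]
# and both of the corresponding labels are data[1]; the window then slides one word right.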


vocabulary_size = 50000

batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2

valid_size = 16
valid_window = 100
# 4. np.random.choice() generates a random sample from a given 1-D array (or from range(n) when
#    given an int); replace controls whether sampling is done with replacement.
#    replace=False means sampling without replacement, so all picks are distinct.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
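# Small sketch of the sampling above (output values are illustrative):
#   np.random.choice(100, 16, replace=False)  ->  16 distinct integers drawn from 0..99,
# i.e. the validation set is 16 random word IDs among the 100 most frequent words.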
num_sampled = 64

graph = tf.Graph()
with graph.as_default():
    # 6. the shape can simply be [batch_size], i.e. a 1-D tensor
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # 5. tf.constant() accepts a NumPy array as input
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    with tf.device('/cpu:0'):
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # 7. embeddings uses a uniform init while nce_weights uses truncated_normal; the reason
        #    is unclear here, perhaps for better propagation through the output layer
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        # @TODO 1. This is the most instructive part of the project: the negative-sampling (NCE) loss.
        # weights acts as the output layer, inputs is the embedded batch, labels are the true classes,
        # num_sampled is the number of negative samples,
        # num_classes is the total number of classes in the full softmax
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                             biases=nce_biases,
                                             labels=train_labels,
                                             inputs=embed,
                                             num_sampled=num_sampled,
                                             num_classes=vocabulary_size))
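        # Rough sketch of what nce_loss computes per example (simplified; the real op also applies
        # a log-expected-count correction for the sampler):
        #   logit(c) = dot(nce_weights[c], embed) + nce_biases[c]
        #   loss ~= sigmoid_xent(logit(true_label), 1) + sum over num_sampled random classes c of
        #           sigmoid_xent(logit(c), 0)
        # so the 50000-way softmax is replaced by 1 + 64 binary decisions per training example.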
        # NB: plain gradient descent with a fixed learning rate; crude, but it does the job here
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # 9. keep_dims is very useful here: each row is L2-normalized via an element-wise division.
        #    Without keep_dims the reduce_sum() output has shape [vocabulary_size], one dimension short;
        #    with it the shape is [vocabulary_size, 1], which can divide the original matrix by broadcasting.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1,
                                     keep_dims=True))
        # 10. newer TF versions have dedicated normalization ops, but these two lines are simple enough
        normalized_embeddings = embeddings / norm
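        # Shape sketch (sizes assumed from the settings above): embeddings is [50000, 128];
        # tf.reduce_sum(..., 1) would give shape [50000], but with keep_dims=True it is [50000, 1],
        # so embeddings / norm broadcasts row-wise and every row of normalized_embeddings has unit L2 norm.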
        # 11. look up the embeddings of the validation words
        valid_embeddings = \
            tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        # @TODO 2: 12. This line is also key: valid_embeddings matmul the transpose of the embedding
        # matrix directly yields vocabulary_size similarity logits per validation word
        similarity = tf.matmul(
            valid_embeddings, normalized_embeddings, transpose_b=True)
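        # Shape/meaning sketch: [16, 128] matmul [128, 50000] -> [16, 50000]; since both operands are
        # row-normalized, each entry is the cosine similarity between a validation word and a vocabulary word.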

        init = tf.global_variables_initializer()


def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # annotate() attaches a text note to the point (x, y), offset (5, 2) points from the plotted
        # point; it also supports arrowed call-outs for placing descriptions wherever convenient
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)


if __name__ == '__main__':
    words = read_data('text8.zip')
    # print('Data size:', len(words))
    # keep only the top 50,000 words
    data, count, index_dict, reverse_index_dict = build_dataset(words, 50000)
    # delete the raw word list to save memory
    # del words

    data_index = 0

    with tf.Session(graph=graph) as sess:
        sess.run(init)

        average_loss = 0.0
        #for step in range(100001):
        for step in range(1001):
            batch_inputs, batch_labels = \
                generate_batch(batch_size, skip_window, num_skips)
            _, loss_val = sess.run([optimizer, loss],
                                   feed_dict={train_inputs: batch_inputs,
                                              train_labels: batch_labels})
            average_loss += loss_val
            # if step % 2000 == 0 :
            if step % 2000 == 0 and step != 0:
                print("Average loss at step:",
                      step, ":", average_loss / 2000)
                average_loss = 0.0

            if step % 10000 == 0:
                # 28. eval() here is equivalent to sess.run(similarity)
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_index_dict[valid_examples[i]]
                    # show the 8 nearest words; index 0 is the word itself
                    # 29. argsort() sorts the elements in ascending order and returns their indices
                    # @TODO 3 the minus sign reverses the order: max x = min -x, a nice trick
                    nearest = (-sim[i, :]).argsort()[1: 9]
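                    # Toy illustration of the line above (hypothetical scores):
                    #   (-np.array([0.1, 0.9, 0.5])).argsort()  ->  array([1, 2, 0])
                    # i.e. indices ordered by descending similarity; [1:9] skips position 0,
                    # which is the validation word itself.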
                    print(sim[i, :])
                    log_str = "Nearest to " + valid_word + " :"
                    for k in range(8):
                        close_word = reverse_index_dict[nearest[k]]
                        log_str = log_str + close_word + ','
                    print(log_str)

        final_embeddings = normalized_embeddings.eval()
    # 30. t-SNE is a visualization tool that reduces high-dimensional data to 2-3 dimensions for plotting
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 100
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
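    # low_dim_embs has shape (plot_only, 2): one 2-D point per word, ready for plot_with_labels below.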
    labels = [reverse_index_dict[i] for i in range(plot_only)]
    plot_with_labels(low_dim_embs, labels)

Original post: https://blog.csdn.net/weixin_43963453/article/details/110929120

Tags: NLP