word2vec implementation, annotated
import tensorflow as tf
import numpy as np
import collections
import math
import random
import zipfile
import urllib.request
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import os
url = "http://mattmahoney.net/dc/"
# Download the data set.
def maybe_download(filename, expected_bytes):
    # 1. os.path.exists() returns a bool, so `is False` would also work here;
    # `not` is the idiomatic test.
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    # 2. os.stat() performs a stat system call on the given path.
    statinfo = os.stat(filename)
    # 3. st_size is the file size in bytes.
    if statinfo.st_size == expected_bytes:
        print("Found and verified: " + filename)
    else:
        print(statinfo.st_size)
        raise Exception("Failed to verify: " + filename)
    return filename
filename = maybe_download("text8.zip", 31344016)
def read_data(filename):
    # 13. The zipfile module handles zip-format compression and decompression:
    # a class with methods to open, read, write, close and list zip archives.
    with zipfile.ZipFile(filename, 'r') as f:
        # 14. namelist() returns the list of file names inside the archive.
        name = f.namelist()[0]
        # 15. read(name) returns the raw bytes of that archive member.
        raw = f.read(name)
        # 16. tf.compat.as_str() converts any string-like input to unicode,
        # so the decompressed bytes become one long string, then a word list.
        data = tf.compat.as_str(raw).split()
    return data
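# For orientation, a hedged example of what read_data() returns (values from a
# typical text8 run, not reproduced from the original post): a flat list of
# roughly 17 million lowercase tokens, starting with something like
# ['anarchism', 'originated', 'as', 'a', 'term', 'of', ...].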
# data: every word of the corpus replaced by its id; count: list of
# (word, frequency) pairs; index_dict: word -> id, where the id is simply the
# word's frequency rank (0 is reserved for 'UNK').
def build_dataset(words, vocabulary_size):
    count = [['UNK', -1]]
    # 17. Counter is a dict subclass for counting hashable items (a "bag").
    # most_common(n) returns the n most frequent items.
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    index_dict = {}
    # Assign each word its frequency rank as its id.
    k = 0
    # 18. `for word in count:` would also iterate, but then `word` would be the
    # whole (word, frequency) pair; the unpacking below pulls out just the word.
    for word, _ in count:
        index_dict[word] = k
        k = k + 1
    # data records, for every word of the corpus, that word's rank/id.
    data = []
    unk_count = 0
    # 20. When reading a loop like this, look first at what runs on every
    # iteration -- here data.append(index); the if/else only chooses the index.
    for word in words:
        # 19. `in` on a dict tests whether the key exists.
        if word in index_dict:
            index = index_dict[word]
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    # Invert index_dict (id -> word).
    # 21. dict() accepts an iterable of (key, value) pairs (the mapping form in
    # its docstring), so zipping values() with keys() inverts the mapping.
    reverse_index_dict = dict(zip(index_dict.values(), index_dict.keys()))
    return data, count, index_dict, reverse_index_dict
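# A worked toy example of build_dataset (illustrative only, not an output of
# the original script): with words = ['the', 'cat', 'sat', 'on', 'the', 'mat']
# and vocabulary_size = 3, count becomes [['UNK', 3], ('the', 2), ('cat', 1)]
# (ties among equally frequent words may break either way), index_dict is
# {'UNK': 0, 'the': 1, 'cat': 2}, and data is [1, 2, 0, 0, 1, 0]: every word
# outside the kept vocabulary is mapped to id 0, i.e. 'UNK'.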
# Generate a training batch: returns batch_size inputs in batch[] with the
# matching targets in labels[].
def generate_batch(batch_size, skip_window, num_skips):
    global data_index
    # 22. `assert expression` is equivalent to:
    # if not expression:
    #     raise AssertionError
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # 23. np.ndarray() allocates an (uninitialized) array of the given shape.
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # span is the full window: skip_window words on each side plus the target
    # word itself, so with skip_window=1 the span is 3 and num_skips can be 2.
    span = 2 * skip_window + 1
    # 24. deque is a double-ended queue; with maxlen=span it acts as a sliding
    # window over the data.
    buffer = collections.deque(maxlen=span)
    # Fill the buffer with the initial window.
    # 25. Note the % len(data) wrap-around, so data_index cycles over the corpus.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Generate samples for each target word in turn.
    # 26. num_skips is the number of samples drawn around each target word.
    for i in range(batch_size // num_skips):
        target_to_avoid = [skip_window]
        # Only to give target an initial value.
        target = skip_window
        # Generate the samples for the current target word buffer[skip_window].
        for j in range(num_skips):
            # target = random.randint(0, span - 1)
            while target in target_to_avoid:
                # 27. random.randint(a, b) returns a random integer in [a, b],
                # including both end points -- a closed interval.
                target = random.randint(0, span - 1)
            # Remember this context position so it is not sampled again for
            # the same target word.
            target_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[target]
            labels[i * num_skips + j] = buffer[skip_window]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
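# Worked example (illustrative, for the settings used below: skip_window=1,
# num_skips=2, hence span=3): if data begins with ids [w0, w1, w2, w3, ...],
# the buffer first holds [w0, w1, w2]; for the centre word w1 the two context
# positions w0 and w2 are chosen in random order, yielding the pairs
# (batch=w0, label=w1) and (batch=w2, label=w1); the buffer then slides to
# [w1, w2, w3] and the same happens for centre word w2, until batch_size
# pairs have been produced.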
vocabulary_size = 50000
batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2
valid_size = 16
valid_window = 100
# 4. np.random.choice() generates a random sample from a given 1-D array (or,
# when given an int n, from np.arange(n)); replace=False means sampling
# without replacement, so the 16 validation ids are all distinct.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64
graph = tf.Graph()
with graph.as_default():
    # 6. A shape of just [batch_size] (a 1-D placeholder) is fine here.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    # 5. tf.constant() accepts a NumPy array as its value.
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        # 7. embeddings is initialized from a uniform distribution while
        # nce_weights uses truncated_normal; presumably the scaled normal
        # initialization helps the output layer train more smoothly.
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # The most instructive part of this project: the negative-sampling (NCE)
    # loss. nce_weights and nce_biases act as the output layer, embed is the
    # input, train_labels holds the true target ids, num_sampled is the number
    # of negative classes sampled per batch, and num_classes is the total
    # number of classes, i.e. the vocabulary size.
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sampled,
                                         num_classes=vocabulary_size))
    # Plain gradient descent with a fixed learning rate of 1.0 -- crude, but
    # sufficient for this example.
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # 9. keep_dims matters here: each row is to be normalized, so the division
    # must be element-wise against a [vocabulary_size, 1] tensor. Without
    # keep_dims, reduce_sum() would return shape [vocabulary_size], one
    # dimension short, and the row-wise division would not broadcast this way.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1,
                                 keep_dims=True))
    # 10. tf.nn.l2_normalize() would do this in one call; here it is two lines.
    normalized_embeddings = embeddings / norm
    # 11. Look up the (normalized) embeddings of the validation words.
    valid_embeddings = \
        tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    # 12. Key step: valid_embeddings matmul the transpose of
    # normalized_embeddings yields, for each validation word, a
    # vocabulary_size-long row of cosine similarities.
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    init = tf.global_variables_initializer()
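# A minimal numeric sketch of why the matmul above gives cosine similarity
# (illustrative only, not part of the original graph): once every row has unit
# L2 norm, the dot product of two rows is the cosine of the angle between them.
# e = np.array([[3.0, 4.0], [1.0, 0.0]])
# e = e / np.sqrt(np.sum(np.square(e), 1, keepdims=True))  # rows -> unit length
# e.dot(e.T)  # [[1.0, 0.6], [0.6, 1.0]]; the off-diagonal 0.6 is the cosine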
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # annotate() adds a text note (optionally with an arrow) at a chosen
        # spot; here the label is offset (5, 2) points from the data point,
        # right-aligned horizontally and bottom-aligned vertically.
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)
if __name__ == '__main__':
    words = read_data('text8.zip')
    # print('Data size:', len(words))
    # Keep only the top-50000 words.
    data, count, index_dict, reverse_index_dict = build_dataset(words, 50000)
    # Delete the raw word list to save memory.
    # del words
    data_index = 0
    with tf.Session(graph=graph) as sess:
        sess.run(init)
        average_loss = 0.0
        # for step in range(100001):
        for step in range(1001):
            batch_inputs, batch_labels = \
                generate_batch(batch_size, skip_window, num_skips)
            _, loss_val = sess.run([optimizer, loss],
                                   feed_dict={train_inputs: batch_inputs,
                                              train_labels: batch_labels})
            average_loss += loss_val
            # if step % 2000 == 0:
            if step % 2000 == 0 and step != 0:
                print("Average loss at step:",
                      step, ":", average_loss / 2000)
                average_loss = 0.0
            if step % 10000 == 0:
                # 28. Tensor.eval() here is equivalent to sess.run(similarity).
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_index_dict[valid_examples[i]]
                    # Show the 8 nearest words; position 0 of the sorted order
                    # is the word itself, hence the [1:9] slice below.
                    # 29. argsort() sorts ascending and returns the indices of
                    # the sorted order; the minus sign reverses the order
                    # (max x = min -x).
                    nearest = (-sim[i, :]).argsort()[1: 9]
                    print(sim[i, :])
                    log_str = "Nearest to " + valid_word + " :"
                    for k in range(8):
                        close_word = reverse_index_dict[nearest[k]]
                        log_str = log_str + close_word + ','
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()
        # 30. t-SNE is a visualization tool that reduces high-dimensional data
        # to 2-3 dimensions so it can be plotted.
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
        plot_only = 100
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_index_dict[i] for i in range(plot_only)]
        plot_with_labels(low_dim_embs, labels)
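        # A minimal extra sketch (not in the original post): query the nearest
        # neighbours of an arbitrary word using the trained NumPy embeddings;
        # 'three' is just a hypothetical example word assumed to be in the
        # vocabulary.
        query = 'three'
        if query in index_dict:
            # Rows of final_embeddings have unit length, so a dot product with
            # the query vector gives cosine similarities to every word.
            sims = final_embeddings.dot(final_embeddings[index_dict[query]])
            neighbours = (-sims).argsort()[1:9]
            print("Nearest to", query, ":",
                  ", ".join(reverse_index_dict[idx] for idx in neighbours))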