NNLM语言模型python实现（基于中文语料）

程序员文章站 2022-07-08 17:21:43

原理图。运用场景：知道句子的前 N-1 个词，来预测第 N 个词。网络的流程：1. 词语 one-hot 编码 → 2. projection_layer 层 → 3. hidden_layer 层 → 4. SoftMax 层。1 准备工作：这个代码是我在 GitHub 上看到的，原代码是针对英文的，英文比较简单；我把它修改成针对中文的版本，并给出 Keras 版代码。import numpy as np；import tensorflow as tf；import re；sentences = [ "我爱你", "余登武",...

原理图

运用场景：

知道句子的前N-1个词，来预测第N个词。

网络的流程：

1.词语one-hot编码—————2.projection_layer层————3.hidden_layer层——————4.SoftMax层

1准备工作

这个代码是我在 GitHub 上看到的，原代码是针对英文的（英文按空格分词即可，比较简单）。我把它修改成针对中文的版本，并给出 Keras 版代码。

 import numpy as np import tensorflow as tf import re
sentences = ["我爱你", "余登武", "范冰冰"]

# Matches a single CJK character in the range U+4E00-U+9FA5.  The capturing
# group makes ``split`` keep the matched characters in its result.
_CJK_CHAR = re.compile(r'([\u4e00-\u9fa5])')


def seg_char(sent):
    """Split a sentence into single Chinese characters.

    Each character in U+4E00-U+9FA5 becomes its own token.  Any run of other
    text between two such characters is kept as one token (unstripped), and
    empty or whitespace-only pieces are dropped.

    Args:
        sent: The sentence to segment.

    Returns:
        The list of tokens in their original order.
    """
    return [piece for piece in _CJK_CHAR.split(sent) if piece.strip()]

# Per-sentence token lists, e.g. [['我', '爱', '你'], ['余', '登', '武'], ...]
chars = [seg_char(sent) for sent in sentences]

# Flatten into one list of characters.  A plain comprehension (rather than a
# 2-D NumPy array that is reshaped and squeezed) also works when the
# sentences do not all have the same length.
word_list = [ch for tokens in chars for ch in tokens]
# word_list: ['我', '爱', '你', '余', '登', '武', '范', '冰', '冰']

# Sort the unique characters so the index assignment is the same on every
# run; iterating a bare set of strings gives a different order per process.
word_list = sorted(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}  # character -> index
# word_dict: {'余': 0, '你': 1, '冰': 2, '我': 3, '武': 4, '爱': 5, '登': 6, '范': 7}
number_dict = {i: w for i, w in enumerate(word_list)}  # index -> character
# number_dict: {0: '余', 1: '你', 2: '冰', 3: '我', 4: '武', 5: '爱', 6: '登', 7: '范'}
n_class = len(word_dict)  # vocabulary size

2输入输出one-hot编码

# NNLM parameters
n_step = 2  # number of context characters per sample, e.g. '我 爱', '范 冰', '余 登'
n_hidden = 2  # number of hidden units


def make_batch(sentences):
    """One-hot encode the training samples.

    For every sentence, all tokens except the last form the input context and
    the last token is the prediction target.

    Args:
        sentences: Iterable of sentences; each is segmented with ``seg_char``.

    Returns:
        A tuple ``(input_batch, target_batch)`` of lists with one entry per
        sentence.  Each input entry is an array of shape
        ``(len(tokens) - 1, n_class)``; each target entry is an array of
        shape ``(n_class,)``.

    Raises:
        KeyError: If a token is not present in ``word_dict``.
        IndexError: If a sentence produces no tokens.
    """
    one_hot = np.eye(n_class)  # row i is the one-hot vector for index i
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = seg_char(sen)  # split into characters
        context_ids = [word_dict[n] for n in word[:-1]]
        target_id = word_dict[word[-1]]

        input_batch.append(one_hot[context_ids])
        target_batch.append(one_hot[target_id])

    return input_batch, target_batch


input_batch, target_batch = make_batch(sentences)

3模型

# Model
# NOTE: this listing uses the TensorFlow 1.x graph API (tf.placeholder,
# tf.Session, ...); TensorFlow 2.x exposes these under tf.compat.v1.
X = tf.placeholder(tf.float32, [None, n_step, n_class])  # [batch_size, n_step, n_class]
Y = tf.placeholder(tf.float32, [None, n_class])  # [batch_size, n_class]

# Concatenate the one-hot vectors of the n_step context characters.
# (Named ``flat_input`` so the builtin ``input`` is not rebound.)
flat_input = tf.reshape(X, shape=[-1, n_step * n_class])  # [batch_size, n_step * n_class]

H = tf.Variable(tf.random_normal([n_step * n_class, n_hidden]))  # input -> hidden weights
d = tf.Variable(tf.random_normal([n_hidden]))  # hidden bias
U = tf.Variable(tf.random_normal([n_hidden, n_class]))  # hidden -> output weights
b = tf.Variable(tf.random_normal([n_class]))  # output bias

tanh = tf.nn.tanh(d + tf.matmul(flat_input, H))  # [batch_size, n_hidden]
model = tf.matmul(tanh, U) + b  # logits, [batch_size, n_class]

# The softmax is applied inside the loss, so ``model`` stays as raw logits.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
prediction = tf.argmax(model, 1)  # index of the highest-scoring character

4训练

# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

for epoch in range(5000):
    # One optimisation step on the full (tiny) data set.
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

# Predict
# ``prediction`` is fetched inside a list, so ``predict`` is a one-element
# list whose first item holds the predicted indices.
predict = sess.run([prediction], feed_dict={X: input_batch})

5测试

# Test
# Show the first n_step characters of every sentence next to the character
# the model predicts for them.
contexts = [seg_char(sen)[:n_step] for sen in sentences]
print(contexts, '预测得到->', [number_dict[n] for n in predict[0]])

全文代码

import numpy as np import tensorflow as tf import re
sentences = ["我爱你", "余登武", "范冰冰"]

# Matches a single CJK character in the range U+4E00-U+9FA5.  The capturing
# group makes ``split`` keep the matched characters in its result.
_CJK_CHAR = re.compile(r'([\u4e00-\u9fa5])')


def seg_char(sent):
    """Split a sentence into single Chinese characters.

    Each character in U+4E00-U+9FA5 becomes its own token.  Any run of other
    text between two such characters is kept as one token (unstripped), and
    empty or whitespace-only pieces are dropped.

    Args:
        sent: The sentence to segment.

    Returns:
        The list of tokens in their original order.
    """
    return [piece for piece in _CJK_CHAR.split(sent) if piece.strip()]

# Per-sentence token lists, e.g. [['我', '爱', '你'], ['余', '登', '武'], ...]
chars = [seg_char(sent) for sent in sentences]

# Flatten into one list of characters.  A plain comprehension (rather than a
# 2-D NumPy array that is reshaped and squeezed) also works when the
# sentences do not all have the same length.
word_list = [ch for tokens in chars for ch in tokens]
# word_list: ['我', '爱', '你', '余', '登', '武', '范', '冰', '冰']

# Sort the unique characters so the index assignment is the same on every
# run; iterating a bare set of strings gives a different order per process.
word_list = sorted(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}  # character -> index
# word_dict: {'余': 0, '你': 1, '冰': 2, '我': 3, '武': 4, '爱': 5, '登': 6, '范': 7}
number_dict = {i: w for i, w in enumerate(word_list)}  # index -> character
# number_dict: {0: '余', 1: '你', 2: '冰', 3: '我', 4: '武', 5: '爱', 6: '登', 7: '范'}
n_class = len(word_dict)  # vocabulary size

# NNLM parameters
n_step = 2  # number of context characters per sample, e.g. '我 爱', '范 冰', '余 登'
n_hidden = 2  # number of hidden units


def make_batch(sentences):
    """One-hot encode the training samples.

    For every sentence, all tokens except the last form the input context and
    the last token is the prediction target.

    Args:
        sentences: Iterable of sentences; each is segmented with ``seg_char``.

    Returns:
        A tuple ``(input_batch, target_batch)`` of lists with one entry per
        sentence.  Each input entry is an array of shape
        ``(len(tokens) - 1, n_class)``; each target entry is an array of
        shape ``(n_class,)``.

    Raises:
        KeyError: If a token is not present in ``word_dict``.
        IndexError: If a sentence produces no tokens.
    """
    one_hot = np.eye(n_class)  # row i is the one-hot vector for index i
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = seg_char(sen)  # split into characters
        context_ids = [word_dict[n] for n in word[:-1]]
        target_id = word_dict[word[-1]]

        input_batch.append(one_hot[context_ids])
        target_batch.append(one_hot[target_id])

    return input_batch, target_batch

input_batch, target_batch = make_batch(sentences)

# Model
# NOTE: this listing uses the TensorFlow 1.x graph API (tf.placeholder,
# tf.Session, ...); TensorFlow 2.x exposes these under tf.compat.v1.
X = tf.placeholder(tf.float32, [None, n_step, n_class])  # [batch_size, n_step, n_class]
Y = tf.placeholder(tf.float32, [None, n_class])  # [batch_size, n_class]

# Concatenate the one-hot vectors of the n_step context characters.
# (Named ``flat_input`` so the builtin ``input`` is not rebound.)
flat_input = tf.reshape(X, shape=[-1, n_step * n_class])  # [batch_size, n_step * n_class]

H = tf.Variable(tf.random_normal([n_step * n_class, n_hidden]))  # input -> hidden weights
d = tf.Variable(tf.random_normal([n_hidden]))  # hidden bias
U = tf.Variable(tf.random_normal([n_hidden, n_class]))  # hidden -> output weights
b = tf.Variable(tf.random_normal([n_class]))  # output bias

tanh = tf.nn.tanh(d + tf.matmul(flat_input, H))  # [batch_size, n_hidden]
model = tf.matmul(tanh, U) + b  # logits, [batch_size, n_class]

# The softmax is applied inside the loss, so ``model`` stays as raw logits.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
prediction = tf.argmax(model, 1)  # index of the highest-scoring character

# Training
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

for epoch in range(5000):
    # One optimisation step on the full (tiny) data set.
    _, loss = sess.run([optimizer, cost], feed_dict={X: input_batch, Y: target_batch})
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

# Predict
# ``prediction`` is fetched inside a list, so ``predict`` is a one-element
# list whose first item holds the predicted indices.
predict = sess.run([prediction], feed_dict={X: input_batch})

# Test
# Show the first n_step characters of every sentence next to the character
# the model predicts for them.
contexts = [seg_char(sen)[:n_step] for sen in sentences]
print(contexts, '预测得到->', [number_dict[n] for n in predict[0]])

Keras 版代码

#!/usr/bin/env python3 # -*- coding: utf-8 -*- # @Author: yudengwu # @Date  : 2020/8/26 from keras.models import Sequential import numpy as np import tensorflow as tf import re
sentences = ["我爱你", "余登武", "范冰冰"]

# Matches a single CJK character in the range U+4E00-U+9FA5.  The capturing
# group makes ``split`` keep the matched characters in its result.
_CJK_CHAR = re.compile(r'([\u4e00-\u9fa5])')


def seg_char(sent):
    """Split a sentence into single Chinese characters.

    Each character in U+4E00-U+9FA5 becomes its own token.  Any run of other
    text between two such characters is kept as one token (unstripped), and
    empty or whitespace-only pieces are dropped.

    Args:
        sent: The sentence to segment.

    Returns:
        The list of tokens in their original order.
    """
    return [piece for piece in _CJK_CHAR.split(sent) if piece.strip()]

# Per-sentence token lists, e.g. [['我', '爱', '你'], ['余', '登', '武'], ...]
chars = [seg_char(sent) for sent in sentences]

# Flatten into one list of characters.  A plain comprehension (rather than a
# 2-D NumPy array that is reshaped and squeezed) also works when the
# sentences do not all have the same length.
word_list = [ch for tokens in chars for ch in tokens]
# word_list: ['我', '爱', '你', '余', '登', '武', '范', '冰', '冰']

# Sort the unique characters so the index assignment is the same on every
# run; iterating a bare set of strings gives a different order per process.
word_list = sorted(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}  # character -> index
# word_dict: {'余': 0, '你': 1, '冰': 2, '我': 3, '武': 4, '爱': 5, '登': 6, '范': 7}
number_dict = {i: w for i, w in enumerate(word_list)}  # index -> character
# number_dict: {0: '余', 1: '你', 2: '冰', 3: '我', 4: '武', 5: '爱', 6: '登', 7: '范'}
n_class = len(word_dict)  # vocabulary size

# NNLM parameters
n_step = 2  # number of context characters per sample, e.g. '我 爱', '范 冰', '余 登'


# One-hot encode the inputs and the targets
def make_batch(sentences):
    """One-hot encode the training samples.

    For every sentence, all tokens except the last form the input context and
    the last token is the prediction target.

    Args:
        sentences: Iterable of sentences; each is segmented with ``seg_char``.

    Returns:
        A tuple ``(input_batch, target_batch)`` of lists with one entry per
        sentence.  Each input entry is an array of shape
        ``(len(tokens) - 1, n_class)``; each target entry is an array of
        shape ``(n_class,)``.

    Raises:
        KeyError: If a token is not present in ``word_dict``.
        IndexError: If a sentence produces no tokens.
    """
    one_hot = np.eye(n_class)  # row i is the one-hot vector for index i
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = seg_char(sen)  # split into characters
        context_ids = [word_dict[n] for n in word[:-1]]
        target_id = word_dict[word[-1]]

        input_batch.append(one_hot[context_ids])
        target_batch.append(one_hot[target_id])

    return input_batch, target_batch

input_batch, target_batch = make_batch(sentences)

# Flatten each sample's n_step one-hot vectors into a single row so the data
# can be fed to a Dense layer.
input_batch = np.array(input_batch)
input_batch = input_batch.reshape(-1, n_step * n_class)  # [n_samples, n_step * n_class]
target_batch = np.array(target_batch)
target_batch = target_batch.reshape(-1, n_class)  # [n_samples, n_class]

from keras.layers import Dense
import keras


# Model
def define_model(n_hidden=2):
    """Build and compile the Keras NNLM.

    The network is a tanh hidden layer followed by a softmax output layer
    over the vocabulary.  The model summary is printed as a side effect.

    Args:
        n_hidden: Number of units in the hidden layer (defaults to 2, the
            value that was previously hard-coded).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(n_hidden, activation='tanh', input_shape=(n_step * n_class,)))
    model.add(Dense(n_class, activation='softmax'))  # output layer
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model


model = define_model()
# Train for 5000 epochs: with this little data one or two epochs have no
# visible effect.
model.fit(input_batch, target_batch, epochs=5000)

# Prediction test
predict = model.predict(input_batch)
predict = np.argmax(predict, 1)  # index of the most probable character per sample
print('输入的是:', [seg_char(sen)[:n_step] for sen in sentences])
print('预测得到：', [number_dict[n] for n in predict])

NNLM 缺点：输入采用 one-hot 编码，如果词表太大，会造成输入维度过大的问题。
训练完成后可以取出中间层的权重作为词向量，这也是 word2vec 方法的思路。
NNLM 也可以用来做文本生成：用 for 循环逐字预测下一个字，并对输出的概率分布做随机采样。
可以看下另一篇博客文本生成。

古诗文本自动生成唐诗文本生成（算例代码）

本文地址：https://blog.csdn.net/kobeyu652453/article/details/108238642

上一篇：去除中国菜刀密码的方法

下一篇： Git报错解决：OpenSSL SSL_read: Connection was reset, errno 10054 错误解决

NNLM语言模型python实现（基于中文语料）

原理图

1准备工作

2输入输出one-hot编码

3模型

4训练

5测试

Keras 版代码

python基于隐马尔可夫模型实现中文拼音输入