自然语言处理-深度学习
程序员文章站
2022-07-13 10:09:26
...
学习目标:
了解深度学习在NLP(自然语言处理)中的应用
完整代码:
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import sys
import time
import pandas as pd
class SentimentNetwork(object):
    """Two-layer bag-of-words sentiment classifier trained one review at a time.

    The input layer is never materialised: the hidden layer is computed by
    summing the rows of ``weights_0_1`` that belong to the distinct words of a
    review. The hidden layer has no activation function; the single output
    unit uses a sigmoid. Labels are the strings ``'positive'`` and
    ``'negative'``.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        """Build the vocabulary and initialise the weights.

        Args:
            reviews: DataFrame whose first column holds the review text; used
                to build the vocabulary.
            labels: DataFrame whose first column holds the label of each
                review.
            hidden_nodes: Number of units in the hidden layer.
            learning_rate: Step size used for the weight updates in ``train``.
        """
        # Fixed seed so the random weight initialisation is repeatable.
        np.random.seed(1)
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        """Collect the word and label vocabularies and build ``word2index``.

        Words are lower-cased before they enter the vocabulary because
        ``train`` and ``run`` lower-case every token before looking it up;
        without this, any word containing an upper-case letter would get a
        weight row that can never be reached.
        """
        review_vocab = set()
        for review in reviews.values:
            review_vocab.update(review[0].lower().split(' '))
        self.review_vocab = list(review_vocab)

        # Distinct labels seen in the data (kept for inspection only).
        self.label_vocab = list({label[0] for label in labels.values})

        # Map every word to the row of weights_0_1 that represents it.
        self.word2index = {word: idx for idx, word in enumerate(self.review_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and draw the initial weights.

        Each weight matrix is drawn from a normal distribution with mean 0 and
        standard deviation ``fan_in ** -0.5``.
        """
        self.learning_rate = learning_rate
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.weights_0_1 = np.random.normal(
            0.0, self.input_nodes ** -0.5, (self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(
            0.0, self.hidden_nodes ** -0.5, (self.hidden_nodes, self.output_nodes))
        # Hidden-layer buffer, reused (zeroed and refilled) for every review.
        self.layer_1 = np.zeros((1, self.hidden_nodes))

    def sigmoid(self, x):
        """Return the logistic function of ``x``."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Return the sigmoid derivative expressed through its output value."""
        return output * (1 - output)

    def get_target_for_label(self, label):
        """Return 1 for ``'positive'`` and 0 for any other label."""
        return 1 if label == 'positive' else 0

    def _review_to_indices(self, review):
        """Return the distinct vocabulary indices of the words in ``review``.

        The text is lower-cased and split on single spaces; words that are not
        in the vocabulary are ignored.
        """
        word2index = self.word2index
        return list({word2index[word]
                     for word in review.lower().split(' ')
                     if word in word2index})

    def _forward(self, indices):
        """Fill ``self.layer_1`` for ``indices`` and return the output layer.

        The returned array has shape ``(1, output_nodes)``.
        """
        self.layer_1 *= 0
        for index in indices:
            self.layer_1 += self.weights_0_1[index]
        return self.sigmoid(np.dot(self.layer_1, self.weights_1_2))

    def _write_progress(self, i, total, start, correct, counted, phase):
        """Overwrite the current stdout line with progress statistics.

        ``counted`` and ``phase`` are the words that differ between the
        training and testing messages ("Trained"/"Training" versus
        "Tested"/"Testing").
        """
        elapsed_time = float(time.time() - start)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
        sys.stdout.write(
            "\rProgress:" + str(100 * i / float(total))[:4]
            + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5]
            + " #Correct:" + str(correct)
            + " #" + counted + ":" + str(i + 1)
            + " " + phase + " Accuracy:" + str(correct * 100 / float(i + 1))[:4] + "%")

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-review gradient descent over the given data.

        Args:
            training_reviews_raw: DataFrame whose first column holds the raw
                review text.
            training_labels: DataFrame whose first column holds the label of
                each review, in the same order.
        """
        assert (len(training_reviews_raw) == len(training_labels))

        # Convert every review to the list of distinct word indices it
        # contains, e.g. "Mary is a beautiful girl" -> [30, 450, 200, 12, 50].
        training_reviews = [self._review_to_indices(review[0])
                            for review in training_reviews_raw.values]

        correct_so_far = 0
        start = time.time()
        total = len(training_reviews)
        for i, review in enumerate(training_reviews):
            label = training_labels.iloc[i, 0]

            # Forward pass; self.layer_1 holds the hidden activations.
            layer_2_o = self._forward(review)
            layer_1_o = self.layer_1

            # Backward pass. The hidden layer is linear, so its delta equals
            # its error. This must be computed before weights_1_2 is updated.
            layer_2_error = layer_2_o - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2_o)
            layer_1_delta = np.dot(layer_2_delta, self.weights_1_2.T)

            # Weight updates; only the rows of the words present are touched.
            self.weights_1_2 -= np.dot(layer_1_o.T, layer_2_delta) * self.learning_rate
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate

            if layer_2_o >= 0.5 and label == 'positive':
                correct_so_far += 1
            elif layer_2_o < 0.5 and label == 'negative':
                correct_so_far += 1

            self._write_progress(i, total, start, correct_so_far, "Trained", "Training")
            if i % 2500 == 0:
                print("")

    def test(self, testing_reviews, testing_labels):
        """Predict every review and report the running accuracy on stdout.

        Args:
            testing_reviews: DataFrame whose first column holds the raw review
                text.
            testing_labels: DataFrame whose first column holds the expected
                label of each review, in the same order.
        """
        assert (len(testing_reviews) == len(testing_labels))
        correct = 0
        start = time.time()
        total = len(testing_reviews)
        for i in range(total):
            if self.run(testing_reviews.iloc[i, 0]) == testing_labels.iloc[i, 0]:
                correct += 1
            self._write_progress(i, total, start, correct, "Tested", "Testing")

    def run(self, review):
        """Return ``'positive'`` or ``'negative'`` for one raw review string.

        An output of exactly 0.5 (for example when no word of the review is in
        the vocabulary) is reported as ``'positive'``.
        """
        layer_2_o = self._forward(self._review_to_indices(review))
        if layer_2_o >= 0.5:
            return 'positive'
        return 'negative'
def main():
    """Train on all but the last 1000 reviews, then test on those 1000.

    Reads ``./data/reviews.txt`` and ``./data/labels.txt`` (no header row).
    The vocabulary is built from the full review set, including the held-out
    part.
    """
    holdout = 1000

    # Load the data.
    all_reviews = pd.read_csv('./data/reviews.txt', header=None)
    all_labels = pd.read_csv('./data/labels.txt', header=None)

    mlp = SentimentNetwork(all_reviews, all_labels, hidden_nodes=12, learning_rate=0.1)
    mlp.train(all_reviews[:-holdout], all_labels[:-holdout])
    mlp.test(all_reviews[-holdout:], all_labels[-holdout:])
# Run the training/testing demo only when executed as a script.
if __name__ == "__main__":
    main()
数据文件
链接:https://pan.baidu.com/s/1TPq8xSkZRXdrnr3U6CyDlQ
提取码:tcch
上一篇: 自然语言处理
下一篇: docker安装单机版tidb
推荐阅读
-
PHP 面向对象程序设计(oop)学习笔记 (四) - 异常处理类Exception
-
Python中的异常处理学习笔记
-
Python学习笔记之读取文件、OS模块、异常处理、with as语法示例
-
Python中基本的日期时间处理的学习教程
-
iOS学习笔记-139.RunLoop07——Runloop处理流程
-
Dlib+OpenCV深度学习人脸识别的方法示例
-
AMD Ryzen 7 1700值得买吗?锐龙AMD Ryzen 7 1700处理器性价比深度评测
-
js学习笔记之事件处理模型
-
.NET/Dot Net学习笔记---.net理解,基本类型,字符串转义字符处理
-
100本Python机器学习、深度学习电子书,免费送!