A Neural Network Example in Python
This article trains a neural network on a synthetic dataset. It covers data preprocessing, batch-normalization (BN) layers, the network model, gradient backpropagation, gradient checking, monitoring the training process, and random hyperparameter search, walking the reader through a complete machine-learning workflow.
1. Generating the Data
Generate a linearly non-separable dataset: oscillating curves that grow over time.
import numpy as np
import matplotlib.pyplot as plt

# display the data
def show_data(X, labels):
    plt.scatter(X[:, 0], X[:, 1], c=labels, s=10, cmap=plt.cm.Spectral)
    plt.show()

# purely random data and labels (also used by the gradient check in Section 4)
def gen_random_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class*N_class
    X = np.random.randn(num_examples, dim)  # data matrix (each row = single example)
    labels = np.random.randint(N_class, size=num_examples)  # class labels
    show_data(X, labels)
    return (X, labels)
dim = 2 # dimensionality
N_class = 4 # number of classes
layer_param = [dim, 10, 20, N_class]
(X, labels) = gen_random_data(dim, N_class, num_samp_per_class=20)
# generate the toy dataset
def gen_toy_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class * N_class
    X = np.zeros((num_examples, dim))  # data matrix
    labels = np.zeros(num_examples, dtype='uint8')  # class labels
    for j in range(N_class):
        ix = range(num_samp_per_class * j, num_samp_per_class * (j + 1))
        x = np.linspace(-np.pi, np.pi, num_samp_per_class) + 5  # x axis
        y = np.sin(x + j * np.pi / (0.5 * N_class))              # base oscillation, phase-shifted per class
        y += 0.2 * np.sin(10 * x + j * np.pi / (0.5 * N_class))  # higher-frequency ripple
        y += 0.25 * x + 10                                       # linear growth over "time"
        y += np.random.randn(num_samp_per_class) * 0.1           # noise
        X[ix] = np.c_[x, y]
        labels[ix] = j
    show_data(X, labels)
    return (X, labels)
2. Data Preprocessing
Centering and normalization
def normalize(X):
    # (x - mean) / std, per feature
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    std = np.std(X_norm, axis=0)
    X_norm /= std + 10**(-5)  # the small constant avoids division by zero
    return (X_norm, mean, std)
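A quick sanity check (a minimal sketch, not part of the original pipeline; X_demo is hypothetical data): after normalization every feature should have roughly zero mean and unit standard deviation.
X_demo = np.random.randn(100, 2) * 3.0 + 7.0  # hypothetical raw data with mean 7, std 3
(X_demo_norm, mean, std) = normalize(X_demo)
print(np.mean(X_demo_norm, axis=0))  # approximately [0, 0]
print(np.std(X_demo_norm, axis=0))   # approximately [1, 1]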
PCA whitening
def PCA_white(X):
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    cov = np.dot(X_norm.T, X_norm)/X_norm.shape[0]  # covariance matrix
    U, S, V = np.linalg.svd(cov)
    X_norm = np.dot(X_norm, U)         # rotate into the eigenbasis
    X_norm /= np.sqrt(S + 10**(-5))    # scale each component to unit variance
    return (X_norm, mean, U, S)
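Likewise, a hedged check for the whitening (X_demo is again hypothetical): the covariance matrix of the whitened data should be close to the identity.
A = np.array([[2.0, 0.5], [0.5, 1.0]])        # hypothetical mixing matrix
X_demo = np.dot(np.random.randn(500, 2), A)   # correlated data
(X_demo_white, mean, U, S) = PCA_white(X_demo)
cov = np.dot(X_demo_white.T, X_demo_white) / X_demo_white.shape[0]
print(cov)  # approximately the 2x2 identity matrix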
The dataset is split at random into training, validation, and test sets in a 2:1:1 ratio.
def split_data(X, labels):
    split1 = 2  # 1/2 for training
    split2 = 4  # 1/4 for validation, remaining 1/4 for test
    num_examples = X.shape[0]
    shuffle_no = list(range(num_examples))
    np.random.shuffle(shuffle_no)
    X_train = X[shuffle_no[:num_examples//split1]]
    labels_train = labels[shuffle_no[:num_examples//split1]]
    X_val = X[shuffle_no[num_examples//split1:num_examples//split1 + num_examples//split2]]
    labels_val = labels[shuffle_no[num_examples//split1:num_examples//split1 + num_examples//split2]]
    X_test = X[shuffle_no[-num_examples//split2:]]
    labels_test = labels[shuffle_no[-num_examples//split2:]]
    return (X_train, labels_train, X_val, labels_val, X_test, labels_test)
Apply the preprocessing (the whitening statistics are estimated on the training set only and reused for the validation and test sets):
def data_preprocess(X_train, X_val, X_test):
    (X_train_pca, mean, U, S) = PCA_white(X_train)
    X_val_pca = np.dot(X_val - mean, U)
    X_val_pca /= np.sqrt(S + 10**(-5))
    X_test_pca = np.dot(X_test - mean, U)
    X_test_pca /= np.sqrt(S + 10**(-5))
    return (X_train_pca, X_val_pca, X_test_pca)
3. Network Model
The architectural hyperparameters are the network depth (number of hidden layers) and the number of neurons per layer. A list can store the neuron count of every layer, including the input and output layers.
Initialization
# layer_param = [dim, 100, 100, N_class]
def initialize_parameters(layer_param):
    weights = []
    biases = []
    vweights = []  # momentum buffers for the weights
    vbiases = []   # momentum buffers for the biases
    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)*0.5
        weights.append(std*np.random.randn(in_depth, out_depth))
        biases.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vbiases.append(np.zeros((1, out_depth)))
    return (weights, biases, vweights, vbiases)
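The factor np.sqrt(2/in_depth) is the He initialization commonly recommended for ReLU networks; the extra 0.5 multiplier further shrinks the initial weights and is a deliberate scaling choice here rather than part of the standard scheme.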
Forward pass
def forward(X, layer_param, weights, biases):
    hiddens = [X]
    for i in range(len(layer_param)-2):
        hiddens.append(np.maximum(0, np.dot(hiddens[i], weights[i]) + biases[i]))  # ReLU
    # the last layer needs no nonlinear activation
    scores = np.dot(hiddens[-1], weights[-1]) + biases[-1]
    return (hiddens, scores)
Compute the softmax data loss
def data_loss_softmax(scores, labels):
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    exp_scores_sum = np.sum(exp_scores, axis=1)
    correct_probs = exp_scores[range(num_examples), labels]/exp_scores_sum
    correct_logprobs = -np.log(correct_probs)
    data_loss = np.sum(correct_logprobs)/num_examples
    return data_loss
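One caveat: np.exp(scores) can overflow for large score values. A common remedy (a hedged variant, not in the original code) is to subtract each row's maximum before exponentiating, which leaves the softmax probabilities unchanged:
def data_loss_softmax_stable(scores, labels):
    num_examples = scores.shape[0]
    shifted = scores - np.max(scores, axis=1, keepdims=True)  # guards against overflow
    exp_scores = np.exp(shifted)
    exp_scores_sum = np.sum(exp_scores, axis=1)
    correct_probs = exp_scores[range(num_examples), labels]/exp_scores_sum
    return np.sum(-np.log(correct_probs))/num_examples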
Compute the L2 regularization loss
def reg_L2_loss(weights, reg):
    reg_loss = 0
    for weight in weights:
        reg_loss += 0.5*reg*np.sum(weight*weight)
    return reg_loss
Compute the gradient of the score matrix
def dscores_softmax(scores, labels):
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    probs = exp_scores/np.sum(exp_scores, axis=1, keepdims=True)
    dscores = probs
    dscores[range(num_examples), labels] -= 1  # softmax gradient: p - 1 at the true class
    dscores /= num_examples
    return dscores
Accuracy prediction. This is almost identical to the forward function, except that the hidden activations need not be stored.
def predict(X, labels, layer_param, weights, biases):
    hidden = X
    for i in range(len(layer_param)-2):
        hidden = np.maximum(0, np.dot(hidden, weights[i]) + biases[i])
    scores = np.dot(hidden, weights[-1]) + biases[-1]
    predicted_class = np.argmax(scores, axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)
Backpropagation
def gradient_backprop(dscores, hiddens, weights, biases, reg):
    dweights = []
    dbiases = []
    dhidden = dscores
    for i in range(len(hiddens)-1, -1, -1):  # walk the layers from last to first
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i])
        dbiases.append(np.sum(dhidden, axis=0, keepdims=True))
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0  # backprop through ReLU
    return (dweights, dbiases)
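Note that dweights and dbiases are accumulated from the last layer to the first, so they come back in reverse order; the Nesterov update in Section 5 compensates for this with its dparams[-1-i] indexing.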
4. Gradient Checking
Check the gradient of one randomly chosen weight and bias per layer by comparing the analytic gradient against a centered numerical difference.
def check_gradient(X, labels, layer_param, check_weight_or_bias):
    # note: the X and labels arguments are overridden with fresh random data
    (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=200)
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    reg = 10**(-9)
    step = 10**(-5)
    for layer in range(len(weights)):
        # pick one random entry of this layer's weight matrix or bias vector
        if check_weight_or_bias:
            row = np.random.randint(weights[layer].shape[0])
            col = np.random.randint(weights[layer].shape[1])
            param = weights[layer][row][col]
        else:
            row = np.random.randint(weights[layer].shape[1])
            param = biases[layer][0][row]
        # forward pass
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        dscores = dscores_softmax(scores, labels)
        # backward pass; gradients are stored last layer first, hence the -1-layer index
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        if check_weight_or_bias:
            danalytic = dweights[-1-layer][row][col]
        else:
            danalytic = dbiases[-1-layer][0][row]
        # loss at param - step
        if check_weight_or_bias:
            weights[layer][row][col] = param - step
        else:
            biases[layer][0][row] = param - step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss1 = data_loss_softmax(scores, labels)
        reg_loss1 = reg_L2_loss(weights, reg)
        loss1 = data_loss1 + reg_loss1
        # loss at param + step
        if check_weight_or_bias:
            weights[layer][row][col] = param + step
        else:
            biases[layer][0][row] = param + step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss2 = data_loss_softmax(scores, labels)
        reg_loss2 = reg_L2_loss(weights, reg)
        loss2 = data_loss2 + reg_loss2
        # restore the perturbed parameter before checking the next layer
        if check_weight_or_bias:
            weights[layer][row][col] = param
        else:
            biases[layer][0][row] = param
        dnumeric = (loss2 - loss1)/(2*step)
        print(layer, data_loss1, data_loss2)
        error_relative = np.abs(danalytic - dnumeric)/np.maximum(np.abs(danalytic), np.abs(dnumeric))
        print(danalytic, dnumeric, error_relative)
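A usage sketch (the X and labels arguments are overridden inside the function, as noted above): run the check once for the weights and once for the biases. As a common rule of thumb, relative errors below roughly 10**(-7) indicate that the analytic and numerical gradients agree.
(X, labels) = gen_random_data(dim, N_class, num_samp_per_class=200)
check_gradient(X, labels, layer_param, check_weight_or_bias=True)   # check one random weight per layer
check_gradient(X, labels, layer_param, check_weight_or_bias=False)  # check one random bias per layer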
5. Parameter Optimization
The parameters are updated with Nesterov momentum; the function also returns the per-layer update ratio (update magnitude relative to parameter magnitude) for monitoring.
def nesterov_momentumSGD(vparams, params, dparams, lr, mu):
    update_ratio = []
    for i in range(len(params)):
        pre_vparam = vparams[i]
        # dparams is ordered last layer first, hence the -1-i index
        vparams[i] = mu*vparams[i] - lr*dparams[-1-i]
        update_param = vparams[i] + mu*(vparams[i] - pre_vparam)
        params[i] += update_param
        update_ratio.append(np.sum(np.abs(update_param))/np.sum(np.abs(params[i])))
    return update_ratio
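A minimal sketch of the optimizer in isolation (a hypothetical 1-D quadratic, not from the original): minimizing f(w) = w**2, whose gradient is 2w, should drive w toward 0.
w = [np.array([5.0])]   # parameter list with a single entry
v = [np.zeros(1)]       # matching momentum buffer
for t in range(100):
    grads = [2*w[0]]    # analytic gradient of f(w) = w**2
    nesterov_momentumSGD(v, w, grads, lr=0.1, mu=0.9)
print(w[0])  # close to 0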
6. Training the Network
The training loop initializes the parameters, runs the forward pass, computes training and validation accuracy, computes the data and regularization losses, backpropagates the gradients, applies exponential learning-rate decay, and visualizes the results.
def train_net(X_train, labels_train, layer_param, lr,
              lr_decay, reg, mu, max_epoch, X_val, labels_val):
    lr0 = lr
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    biases_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores) = forward(X_train, layer_param, weights, biases)
        val_accuracy.append(predict(X_val, labels_val, layer_param, weights, biases))
        train_accuracy.append(predict(X_train, labels_train, layer_param, weights, biases))
        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(weights, reg)
        dscores = dscores_softmax(scores, labels_train)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        weights_update_ratio.append(nesterov_momentumSGD(vweights, weights, dweights, lr, mu))
        biases_update_ratio.append(nesterov_momentumSGD(vbiases, biases, dbiases, lr, mu))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr *= lr_decay
    # visualize the data loss and the training/validation accuracy
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr)=' + str(round(np.log10(lr0), 2)) + ' '
               + 'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
    plt.ylabel(' accuracy log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(train_accuracy)), train_accuracy, c='g', marker='.')
    plt.show()
    # log-scale plots of each layer's weight and bias update ratios; a reasonable value is around 10**(-3)
    for layer in range(len(weights)):
        wur = [weights_update_ratio[i][layer] for i in range(len(weights_update_ratio))]
        bur = [biases_update_ratio[i][layer] for i in range(len(biases_update_ratio))]
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr)=' + str(round(np.log10(lr0), 2)) + ' '
                   + 'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()
    return (data_losses, reg_losses, weights, biases, val_accuracy)
7. Overfitting a Tiny Dataset
def overfit_tinydata(lr=10**(-0.0), lr_decay=1, mu=0.9, reg=0, max_epoch=100):
    # 2 samples per class: 8 points in total, trained and validated on the same data
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    X, _, _, _ = PCA_white(X)
    layer_param = [dim, 100, 100, N_class]
    (data_losses, reg_losses, weights, biases, accuracy) = train_net(
        X, labels, layer_param, lr, lr_decay, reg, mu, max_epoch, X, labels)
    return (data_losses, reg_losses, accuracy)
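Calling it with the defaults (a sketch; exact numbers depend on the random seed) should drive the data loss toward zero and the training accuracy to 1.0 within the 100 epochs, confirming that the model and gradients can memorize the 8 points:
(data_losses, reg_losses, accuracy) = overfit_tinydata()
print(accuracy[-1])  # expected to reach 1.0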
8. Random Hyperparameter Search
def hyperparam_random_search(X_train, labels_train,
                             X_val, labels_val,
                             layer_param, num_try=10,
                             lr=[-1, -5], lr_decay=1,
                             mu=0.9, reg=[-2.0, -5.0],
                             max_epoch=500):
    # lr and reg give ranges of exponents; sample uniformly in log space
    minlr = min(lr)
    maxlr = max(lr)
    rand_vals = np.random.rand(num_try*2)  # uniform samples in [0, 1)
    lr_array = 10**(minlr + (maxlr - minlr)*rand_vals[0:num_try])
    minreg = min(reg)
    maxreg = max(reg)
    reg_array = 10**(minreg + (maxreg - minreg)*rand_vals[num_try:2*num_try])
    lr_regs = zip(lr_array, reg_array)
    for lr_reg in lr_regs:
        (data_loss, reg_loss, weights, biases, val_accuracy) = train_net(
            X_train, labels_train, layer_param,
            lr_reg[0], lr_decay, lr_reg[1], mu, max_epoch,
            X_val, labels_val)
    # note: only the parameters of the last trial are returned
    return (weights, biases)
dim = 2 # dimensionality
N_class = 4 # number of classes
layer_param = [dim, 100, 100, N_class]
(X,labels) = gen_toy_data(dim, N_class, num_samp_per_class = 200)
(X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
(X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
(weights, biases) = hyperparam_random_search(X_train_pca, labels_train, X_val_pca, labels_val, layer_param,
num_try = 2, lr=[-0.1, -0.2], lr_decay=1, mu=0.9, reg=[-2, -5], max_epoch=2000)
With lr=[-0.1, -0.2] as above, the data loss oscillates heavily even though it drops quickly, and the parameter update ratios are too high.
Changing to lr=[-1.5, -2.5], the data loss is essentially free of oscillation, the training and validation accuracies saturate, and overfitting is not pronounced.
9. Evaluating the Model
accuracy = predict(X_test_pca, labels_test, layer_param, weights, biases)
print(accuracy) # 0.785
10. Adding Batch Normalization (BN) Layers
Adding BN layers does not change the overall flow of the program. Note, however, that the bias parameters are no longer needed; instead each layer gains a scale parameter gamma and a shift parameter beta, and some intermediate BN results (the batch mean and variance) must be kept for the backward pass. Because every training step uses the full sample set, no moving averages of the mean and variance are required.
BN_EPSILON = 10**(-5)

def initialize_parameters_BN(layer_param):
    weights = []
    gammas = []  # BN scale parameters
    betas = []   # BN shift parameters
    vweights = []
    vgammas = []
    vbetas = []
    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)
        weights.append(std*np.random.randn(in_depth, out_depth))
        gammas.append(np.ones((1, out_depth)))
        betas.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vgammas.append(np.zeros((1, out_depth)))
        vbetas.append(np.zeros((1, out_depth)))
    params = (weights, gammas, betas)
    vparams = (vweights, vgammas, vbetas)
    return (params, vparams)
def BN_forward(X, gamma, beta):
    mu = np.mean(X, axis=0)
    var = np.var(X, axis=0)
    std = np.sqrt(var + BN_EPSILON)
    X_hat = (X - mu)/std     # normalize each feature
    Y = gamma*X_hat + beta   # scale and shift
    bn_cache = (mu, var, std)
    return (Y, bn_cache, X_hat)
def BN_backprop(dY, X, X_hat, bn_cache, gamma, beta):
    (mu, var, std) = bn_cache
    batch = X.shape[0]
    dX_hat = gamma*dY
    dbeta = np.sum(dY, axis=0, keepdims=True)
    dgamma = np.sum(X_hat*dY, axis=0, keepdims=True)
    dX = dX_hat/std
    dmu = -np.sum(dX_hat, axis=0, keepdims=True)/std
    dstd = -1/(std*std) * np.sum((X - mu)*dX_hat, axis=0)
    dvar = 0.5*dstd/std  # d(std)/d(var) = 1/(2*std), with std = sqrt(var + BN_EPSILON)
    dX += 2*(X - mu)/batch * dvar
    dX += dmu/batch
    return (dX, dgamma, dbeta)
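BN_backprop can itself be gradient-checked numerically (a hedged sketch with hypothetical shapes, independent of the network above). Using the scalar loss sum(Y*dY) makes the gradient of the loss with respect to Y exactly dY:
Xc = np.random.randn(8, 5)   # hypothetical batch: 8 examples, 5 features
gamma = np.random.randn(1, 5)
beta = np.random.randn(1, 5)
dY = np.random.randn(8, 5)   # arbitrary upstream gradient
(Y, bn_cache, X_hat) = BN_forward(Xc, gamma, beta)
(dX, dgamma, dbeta) = BN_backprop(dY, Xc, X_hat, bn_cache, gamma, beta)
step = 10**(-5)
i, j = 2, 3                  # check one arbitrary entry of X
Xp = Xc.copy(); Xp[i, j] += step
Xm = Xc.copy(); Xm[i, j] -= step
loss_p = np.sum(BN_forward(Xp, gamma, beta)[0]*dY)
loss_m = np.sum(BN_forward(Xm, gamma, beta)[0]*dY)
dnumeric = (loss_p - loss_m)/(2*step)
print(dX[i, j], dnumeric)    # the two values should agree closely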
def forward_BN(X, layer_param, params):
    hiddens = []
    bn_ins = []     # inputs to each BN layer (pre-normalization)
    bn_caches = []  # per-layer (mu, var, std)
    bn_hats = []    # normalized activations X_hat
    (weights, gammas, betas) = params
    hiddens.append(X)
    for i in range(len(layer_param)-1):
        hidden = np.dot(hiddens[i], weights[i])  # 1: linear (no bias; BN's beta replaces it)
        bn_ins.append(hidden)
        (hidden, bn_cache, bn_hat) = BN_forward(hidden, gammas[i], betas[i])  # 2: batch norm
        bn_hats.append(bn_hat)
        bn_caches.append(bn_cache)
        if i < len(layer_param)-2:
            hiddens.append(np.maximum(0, hidden))  # 3: ReLU
        else:
            scores = hidden  # 4: output scores, no activation
    return (hiddens, scores, bn_ins, bn_hats, bn_caches)
def predict_BN(X, labels, layer_param, params):
    (weights, gammas, betas) = params
    hidden = X
    for i in range(len(layer_param)-1):
        hidden = np.dot(hidden, weights[i])
        (hidden, bn_cache, bn_hat) = BN_forward(hidden, gammas[i], betas[i])
        if i < len(layer_param)-2:
            hidden = np.maximum(0, hidden)
        else:
            scores = hidden
    predicted_class = np.argmax(scores, axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)
def gradient_backprop_BN(dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg):
    (weights, gammas, betas) = params
    dweights = []
    dgammas = []
    dbetas = []
    dhidden = dscores
    for i in range(len(hiddens)-1, -1, -1):  # last layer first, as in gradient_backprop
        (dhidden, dgamma, dbeta) = BN_backprop(dhidden, bn_ins[i], bn_hats[i], bn_caches[i], gammas[i], betas[i])
        dgammas.append(dgamma)
        dbetas.append(dbeta)
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i])
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0  # backprop through ReLU
    return (dweights, dgammas, dbetas)
#%%
def train_net_BN(data, layer_param, super_params, max_epoch):
    (X_train, labels_train, X_val, labels_val) = data
    (lr_w, lr_decay, mu_w, lr_bn, mu_bn, reg) = super_params
    (params, vparams) = initialize_parameters_BN(layer_param)
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    gammas_update_ratio = []
    betas_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores, bn_ins, bn_hats, bn_caches) = forward_BN(X_train, layer_param, params)
        val_accuracy.append(predict_BN(X_val, labels_val, layer_param, params))
        train_accuracy.append(predict_BN(X_train, labels_train, layer_param, params))
        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(params[0], reg)
        dscores = dscores_softmax(scores, labels_train)
        (dweights, dgammas, dbetas) = gradient_backprop_BN(dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg)
        weights_update_ratio.append(nesterov_momentumSGD(vparams[0], params[0], dweights, lr_w, mu_w))
        gammas_update_ratio.append(nesterov_momentumSGD(vparams[1], params[1], dgammas, lr_bn, mu_bn))
        betas_update_ratio.append(nesterov_momentumSGD(vparams[2], params[2], dbetas, lr_bn, mu_bn))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr_w *= lr_decay
    # visualize the data loss and the training/validation accuracy (same pattern as train_net)
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr_w)=' + str(round(np.log10(lr_w), 2)) + ' '
               + 'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
    plt.ylabel(' accuracy log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(train_accuracy)), train_accuracy, c='g', marker='.')
    plt.show()
    #%% per-layer weight and beta update ratios on a log scale
    for layer in range(len(params[0])):
        wur = [weights_update_ratio[i][layer] for i in range(len(weights_update_ratio))]
        bur = [betas_update_ratio[i][layer] for i in range(len(betas_update_ratio))]
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr_w)=' + str(round(np.log10(lr_w), 2)) + ' '
                   + 'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()
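A usage sketch tying it together (the hyperparameter values here are illustrative assumptions, not tuned results; the function plots its diagnostics rather than returning them):
(X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=200)
(X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
(X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
data = (X_train_pca, labels_train, X_val_pca, labels_val)
# (lr_w, lr_decay, mu_w, lr_bn, mu_bn, reg) -- illustrative values
super_params = (10**(-2), 1, 0.9, 10**(-2), 0.9, 10**(-4))
train_net_BN(data, [dim, 100, 100, N_class], super_params, max_epoch=500)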