TensorFlow: fixing test-time classification accuracy that depends on batch_size
While fine-tuning ResNet-50 with TensorFlow's slim.nets module, I ran into a problem: accuracy and loss looked fine during training, but accuracy was very low during validation and testing. Specifically, the smaller the test batch size, the lower the test accuracy. In theory the batch size at test time should only affect running time, not accuracy, so this post records how I tracked the problem down and fixed it.
1. The problem with the Batch Normalization layer:
The network I fine-tuned comes from slim.nets.resnet_v2.resnet_v2_50(), whose BN layers are implemented with layers.batch_norm(). The source code contains this note:
Note: when training, the moving_mean and moving_variance need to be updated.
By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
need to be added as a dependency to the `train_op`. For example:
```python
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = optimizer.minimize(loss)
```
Translation: during training, moving_mean and moving_variance need to be updated. By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they have to be added as a dependency of the `train_op`.
We know that mean and variance are the two key statistics of batch_norm: during training they are the mean and variance of the current batch. The note is telling us to make the train op depend on update_ops; otherwise moving_mean and moving_variance are never updated, so the values loaded at test time may still be their initial defaults, which produces very poor test results.
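As a side note, slim also provides a helper that wires up this dependency automatically; the full script at the end of this post shows it as a commented-out alternative. A minimal sketch (the toy placeholder and loss below are only for illustration):
```python
import tensorflow as tf
slim = tf.contrib.slim

# Toy graph: a BN layer followed by a scalar loss, just to show the wiring.
x = tf.placeholder(tf.float32, [None, 8])
net = slim.batch_norm(x, is_training=True)   # registers its update ops in UPDATE_OPS
loss = tf.reduce_mean(tf.square(net))

optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
# create_train_op makes the returned op depend on tf.GraphKeys.UPDATE_OPS by default,
# so moving_mean/moving_variance are refreshed on every training step without an
# explicit tf.control_dependencies(update_ops) block.
train_op = slim.learning.create_train_op(loss, optimizer)
```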
I modified my code as shown in the example from the source, but the results did not improve: test accuracy was still affected by the batch size, so I kept debugging.
2. The is_training parameter:
batch_norm() has an important parameter, is_training, documented in the source as follows:
is_training: Whether or not the layer is in training mode. In training mode
it would accumulate the statistics of the moments into `moving_mean` and
`moving_variance` using an exponential moving average with the given
`decay`. When it is not in training mode then it would use the values of
the `moving_mean` and the `moving_variance`.
In other words, when is_training is True the layer accumulates moving_mean and moving_variance with an exponential moving average; when is_training is False it uses the stored values of moving_mean and moving_variance.
So is_training should be set to True during training, so that the moving mean and moving variance are accumulated, and set to False during testing, so that the values saved after training are used instead of statistics computed from the current batch. This is exactly why small test batches gave low accuracy: with is_training left as True at test time, each tiny batch was normalized with its own noisy statistics.
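To make the behaviour concrete, here is a minimal sketch (a toy standalone layers.batch_norm call, not the ResNet itself; the decay value is just an example of the exponential-moving-average factor):
```python
import tensorflow as tf

# Toy input; in the real script the tensor comes from the ResNet backbone.
features = tf.placeholder(tf.float32, [None, 64])
is_training = tf.placeholder(tf.bool, name='is_training')

# decay is the exponential-moving-average factor mentioned in the docstring.
net = tf.contrib.layers.batch_norm(features, is_training=is_training, decay=0.997)

# Conceptually, when is_training is True the layer updates (via ops in UPDATE_OPS):
#   moving_mean     = decay * moving_mean     + (1 - decay) * batch_mean
#   moving_variance = decay * moving_variance + (1 - decay) * batch_variance
# When is_training is False it normalizes with the stored moving_mean and
# moving_variance, so the output no longer depends on the test batch size.
```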
With that change the problem was finally solved. Here is the full training (and validation) script:
# -*- coding: utf-8 -*-
import tensorflow as tf
from preprocess import preprocess_for_train, preprocess_for_test
from utils.parse_functions import parser_tfrecords, parse_list
from utils.generate_list import images_and_labels_list
from tensorflow.contrib.slim import nets
import os
import numpy as np
import time
slim = tf.contrib.slim
# train on the first GPU only
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
"""
""net parameters
"""
num_classes = 10
image_size = 32
batch_size = 128
valid_batch_size = 128
shuffle_buffer = 50000
NUM_EPOCHS = 100
starter_learning_rate = 0.001
boundaries = [15000, 30000, 45000, 60000]
values = [0.001, 0.0005, 0.0001, 0.00005, 0.00001]
"""
""path parameters
"""
resnet_model_path = './resnet_v2_50.ckpt'
models_path = './models/temporary/train-model.ckpt'  # where the Saver writes checkpoints
logs_dir = "./logs/temporary"
logs_train_dir = os.path.join(logs_dir, "train")
logs_valid_dir = os.path.join(logs_dir, "valid")
train_tfrecords = ["./tfrecords/cifar10/train.tfrecords"]
test_tfrecords = ["./tfrecords/cifar10/test.tfrecords"]
for dir_name in [logs_dir, logs_train_dir, logs_valid_dir]:
    if not os.path.exists(dir_name):
        os.mkdir(dir_name)
"""
""Datasets
"""
dataset = tf.data.TFRecordDataset(train_tfrecords)
dataset = dataset.map(parser_tfrecords)
dataset = dataset.shuffle(shuffle_buffer)
dataset = dataset.batch(batch_size)
dataset = dataset.repeat(NUM_EPOCHS)
iterator = dataset.make_initializable_iterator()
image_batch, label_batch = iterator.get_next()
label_batch = tf.one_hot(indices=tf.cast(label_batch, tf.int32), depth=num_classes)
val_dataset = tf.data.TFRecordDataset(test_tfrecords)
val_dataset = val_dataset.map(parser_tfrecords)
val_dataset = val_dataset.batch(valid_batch_size)
val_iterator = val_dataset.make_initializable_iterator()
val_image_batch, val_label_batch = val_iterator.get_next()
val_label_batch = tf.one_hot(indices=tf.cast(val_label_batch, tf.int32), depth=num_classes)
"""
""placeholder
"""
inputs = tf.placeholder(tf.float32, [None, image_size, image_size, 3])
labels = tf.placeholder(tf.int32, [None, num_classes])
keep_prob = tf.placeholder(tf.float32)
is_training = tf.placeholder(tf.bool)
"""
""inference
"""
with slim.arg_scope(nets.resnet_v2.resnet_arg_scope()):
    net, end_points = nets.resnet_v2.resnet_v2_50(inputs, num_classes=None, is_training=is_training)
with tf.variable_scope('Logits'):
    net = tf.squeeze(net, axis=[1, 2])
    net = slim.dropout(net, keep_prob=keep_prob, scope='scope')
    logits = slim.fully_connected(net, num_outputs=num_classes,
                                  activation_fn=None, scope='fc')
"""
""Learning rate
"""
with tf.variable_scope("learning_rate") as scope:
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
    tf.summary.scalar(scope.name, learning_rate)
"""
""Loss
"""
with tf.variable_scope("loss") as scope:
    losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=labels)
    loss = tf.reduce_mean(losses)
    tf.summary.scalar(scope.name, loss)
"""
""Accuracy
"""
with tf.variable_scope("accuracy") as scope:
    correct_prediction = tf.equal(tf.cast(tf.argmax(logits, 1), tf.int32), tf.cast(tf.argmax(labels, 1), tf.int32))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar(scope.name, accuracy)
"""
""Optimizer
"""
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_step = optimizer.minimize(loss, global_step)
    # train_step = slim.learning.create_train_op(loss, optimizer, global_step=global_step)
"""
""Summary
"""
summary_op = tf.summary.merge_all()  # merge all summary logs
"""
" Restore resnet50
"""
checkpoint_exclude_scopes = 'Logits'
exclusions = None
if checkpoint_exclude_scopes:
    exclusions = [
        scope.strip() for scope in checkpoint_exclude_scopes.split(',')]
variables_to_restore = []
for var in slim.get_model_variables():
    excluded = False
    for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
            excluded = True
    if not excluded:
        variables_to_restore.append(var)
saver_restore = tf.train.Saver(var_list=variables_to_restore)
saver = tf.train.Saver()
"""
" Open Session
"""
init = tf.global_variables_initializer()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
    sess.run(init)
    # saver_restore.restore(sess, resnet_model_path)
    train_writer = tf.summary.FileWriter(os.path.join(logs_dir, 'train'), sess.graph)  # training logs
    valid_writer = tf.summary.FileWriter(os.path.join(logs_dir, 'valid'), sess.graph)  # validation logs
    # initialize the iterator over the training data
    sess.run(iterator.initializer)
    step = 0
    best_acc = 0.0
    start_time = time.time()
    while True:
        try:
            step += 1
            batch_x, batch_y = sess.run([image_batch, label_batch])
            train_dict = {inputs: batch_x,
                          labels: batch_y,
                          is_training: True,   # use batch statistics and update the moving averages
                          keep_prob: 1.0}
            summary_str, _, loss_t, acc_t = sess.run([summary_op, train_step, loss, accuracy],
                                                     feed_dict=train_dict)
            if step % 10 == 0:
                print('Step: {}, Train_Acc: {:.4f}, Loss: {:.8f}'.format(step, acc_t, loss_t))
                train_writer.add_summary(summary_str, step)
            if step % 1000 == 0:  # evaluate on the validation set
                sess.run(val_iterator.initializer)
                acc_reg = []
                loss_reg = []
                while True:
                    try:
                        batch_x, batch_y = sess.run([val_image_batch, val_label_batch])
                        valid_dict = {inputs: batch_x,
                                      labels: batch_y,
                                      is_training: False,  # use the stored moving_mean/moving_variance
                                      keep_prob: 1.0}
                        loss_v, acc_v, summary_str = sess.run([loss, accuracy, summary_op],
                                                              feed_dict=valid_dict)
                        valid_writer.add_summary(summary_str, step)
                        acc_reg.append(acc_v)
                        loss_reg.append(loss_v)
                    except tf.errors.OutOfRangeError:
                        break
                avg_acc = np.mean(np.array(acc_reg))
                avg_loss = np.mean(np.array(loss_reg))
                print('------------------------------------------------------')
                print('Valid-----> Valid_Acc: {:.4f}, Valid_Loss: {:.7f}'.format(avg_acc, avg_loss))
                print('------------------------------------------------------')
                """
                " Save the best model
                """
                if avg_acc > best_acc:
                    best_acc = avg_acc
                    saver.save(sess=sess, save_path=models_path, global_step=step)
                    print("Model saved successfully")
                    print("Saved the best model with val_acc %0.4f" % best_acc)
                else:
                    print("Val_acc stays at %0.4f" % best_acc)
        except tf.errors.OutOfRangeError:
            train_writer.close()
            valid_writer.close()
            saver.save(sess=sess, save_path=models_path, global_step=step)
            break
    end_time = time.time()
    print("Total time:", end_time - start_time)
    print('Ended......')
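For completeness, here is a rough evaluation-only sketch; it assumes the graph, placeholders and saver defined in the script above are available, and that a checkpoint has been written under ./models/temporary. With is_training fed as False, the reported accuracy should no longer depend on valid_batch_size:
```python
# Hypothetical stand-alone evaluation pass, reusing the graph built above.
with tf.Session(config=config) as sess:
    # restore the best checkpoint saved by the training loop
    saver.restore(sess, tf.train.latest_checkpoint('./models/temporary'))
    sess.run(val_iterator.initializer)
    acc_reg = []
    while True:
        try:
            batch_x, batch_y = sess.run([val_image_batch, val_label_batch])
            acc_reg.append(sess.run(accuracy, feed_dict={inputs: batch_x,
                                                         labels: batch_y,
                                                         is_training: False,  # use stored statistics
                                                         keep_prob: 1.0}))
        except tf.errors.OutOfRangeError:
            break
    print('Test accuracy: {:.4f}'.format(np.mean(acc_reg)))
```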