【Tensorflow2.0】10、端到端的自定义模型训练custom training

程序员文章站 2022-05-30 21:01:28

...

文章目录

1、导入必备的包
2、配置模型的参数
3、准备开始训练

3.1、模型配置
3.2、准备数据集
3.3、定义模型
3.4、准备优化及相关训练所用函数
3.5、开始训练并验证模型
总结

1、导入必备的包

import tensorflow as tf
import shutil,os,sys,io,copy,time,itertools,argparse,matplotlib
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import pandas as pd
from functools import partial
from collections import namedtuple
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

2、配置模型的参数

class ObjectDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``obj.key`` is equivalent to ``obj['key']`` and ``obj.key = value`` stores
    ``value`` under ``'key'``.

    :raises AttributeError: when a missing key is read as an attribute
    """

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails, so the regular dict
        # methods are never shadowed by stored keys.
        try:
            return self[name]
        except KeyError:
            # AttributeError (still an Exception subclass) is what hasattr()
            # and getattr(obj, name, default) rely on to detect absence.
            raise AttributeError(name) from None

    def __setattr__(self, key, value):
        # Attribute assignment is stored as a dictionary item.
        self[key] = value

# Run settings and hyper-parameters, gathered in one attribute-style dict.
args = ObjectDict(
    output_folder='/tmp/out/model_out',
    save_format='hdf5',
    which_gpu=0,
    batch_size=200,
    epochs=10,
    regularizer=1e-4,
    num_classes=10,
    initial_learning_rate=1e-3,
    learning_rate_decay_factor=0.9,  # periodic multiplicative decay of the learning rate
    num_epochs_per_decay=1,  # number of training epochs between two decays
)

3、准备开始训练

# 0 = all messages are logged (default behavior)
# 1 = INFO messages are not printed
# 2 = INFO and WARNING messages are not printed
# 3 = INFO, WARNING, and ERROR messages are not printed
# Level 2: do not print ordinary INFO and WARNING messages.
# NOTE(review): tensorflow is already imported at this point; confirm the
# variable still takes effect when it is set after the import.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Reset the global Keras state before the model is built.
tf.keras.backend.clear_session()

3.1、模型配置

# Container for the per-epoch training/validation accuracy and loss curves.
History = namedtuple('History', ['train_epoch_acc', 'train_epoch_loss', 'val_epoch_acc', 'val_epoch_loss'])

def configs(args = None):
    """Prepare the output directories and the GPU settings for a training run.

    If ``args.output_folder`` already exists the user is asked on stdin whether
    it may be deleted and recreated; answering ``n`` terminates the process.

    :param args: settings object providing ``output_folder``, ``save_format``
        ("hdf5" or "saved_model") and ``which_gpu``; required despite the
        ``None`` default
    :return: tuple ``(output_folder, save_path, log_dir)``; ``save_path`` is a
        ``str.format`` template taking the epoch number and validation accuracy
    :raises ValueError: if ``args.save_format`` is not a supported format
    :raises SystemExit: if the user declines to delete an existing folder
    """
    # Directory-name pattern and checkpoint file-name template per format.
    checkpoint_layouts = {
        "hdf5": ("hdf5_models_{}", "ckpt_epoch{:02d}_val_acc{:.2f}.hdf5"),
        "saved_model": ("saved_models_{}", "ckpt_epoch{:03d}_val_acc{:.4f}.ckpt"),
    }
    # Validate before touching the disk: an unsupported format must not wipe
    # an existing output folder and only then fail.
    if args.save_format not in checkpoint_layouts:
        raise ValueError("Unsupported save_format %r, expected one of %s"
                         % (args.save_format, sorted(checkpoint_layouts)))

    # One timestamp shared by the model and log directory names of this run.
    t = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_folder = args.output_folder
    if os.path.exists(output_folder):
        inc = input("The model saved path(%s) has exist,Do you want to delete and remake it?(y/n)" % output_folder)
        while (inc.lower() not in ['y', 'n']):
            inc = input("The model saved path has exist,Do you want to delete and remake it?(y/n)")
        if inc.lower() == 'y':
            shutil.rmtree(output_folder)
            os.makedirs(output_folder)
        else:
            print("Exit and chechk the path!")
            # sys.exit is always available, unlike the site-provided exit().
            sys.exit(-1)
    else:
        print("The model saved path (%s) does not exist,make it!" % output_folder)
        os.makedirs(output_folder)

    folder_pattern, file_pattern = checkpoint_layouts[args.save_format]
    save_path_models = os.path.join(output_folder, folder_pattern.format(t))
    os.makedirs(save_path_models, exist_ok = True)
    save_path = os.path.join(save_path_models, file_pattern)

    # Directory that receives the logs of this run.
    log_dir = os.path.join(output_folder, 'logs_{}'.format(t))
    os.makedirs(log_dir, exist_ok = True)

    physical_devices = tf.config.experimental.list_physical_devices('GPU')  # all GPUs TensorFlow can see
    if physical_devices:
        gpu = physical_devices[args.which_gpu]  # the GPU selected by args.which_gpu
        tf.config.experimental.set_memory_growth(gpu, True)  # allocate GPU memory on demand
        tf.config.experimental.set_visible_devices(gpu, 'GPU')  # expose only the selected GPU
    return output_folder, save_path, log_dir

# Create the run directories; prompts on stdin when the output folder already exists.
output_folder, save_path, log_dir = configs(args)

The model saved path(/tmp/out/model_out) has exist,Do you want to delete and remake it?(y/n)y

使用Tensorboard和日志监控模型训练过程

# Separate summary writers (and sub-directories of log_dir) for the
# training and the validation logs.
train_log_dir = os.path.join(log_dir, 'train')
validation_log_dir = os.path.join(log_dir, 'validation')
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
validation_summary_writer = tf.summary.create_file_writer(validation_log_dir)

3.2、准备数据集

# Load Fashion-MNIST; the second split returned by load_data() is used as the validation set.
fashion_mnist=tf.keras.datasets.fashion_mnist
(train_x,train_y),(validation_x,validation_y)=fashion_mnist.load_data()

# Append a trailing channel axis, divide the pixel values by 255 and store them as float32.
train_x,validation_x = train_x[...,np.newaxis]/255.0,validation_x[...,np.newaxis]/255.0
train_x = np.array(train_x).astype(np.float32)
validation_x=np.array(validation_x).astype(np.float32)
total_train_sample = train_x.shape[0]
total_validation_sample=validation_x.shape[0]
# Class names, indexed by the integer label.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
train_ds = tf.data.Dataset.from_tensor_slices((train_x,train_y))
validation_ds = tf.data.Dataset.from_tensor_slices((validation_x,validation_y))
 
# Training data is shuffled with a buffer of 10 batches; both pipelines are batched and prefetched.
train_ds=train_ds.shuffle(buffer_size=args.batch_size*10).batch(args.batch_size).prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
validation_ds = validation_ds.batch(args.batch_size).prefetch(buffer_size = tf.data.experimental.AUTOTUNE)# no repeat(): a single pass is enough
# Number of batches per epoch, rounded up so a final partial batch is counted.
train_steps_per_epoch = np.ceil(total_train_sample / args.batch_size).astype(np.int32)
validation_steps_per_epoch = np.ceil(total_validation_sample / args.batch_size).astype(np.int32)
print("train_steps_per_epoch:", train_steps_per_epoch)
print("validation_steps_per_epoch:", validation_steps_per_epoch)

train_steps_per_epoch: 300
validation_steps_per_epoch: 50

3.3、定义模型

def mymodel(num_classes,regularizer):
    """Build the convolutional classifier for 28x28x1 images.

    Two 5x5 convolution + 2x2 max-pooling stages are followed by a 128-unit
    dense layer and a linear output layer.

    :param num_classes: number of units of the output layer (one logit per class)
    :param regularizer: L2 factor applied to the kernel and bias of every
        convolution and dense layer
    :return: uncompiled ``tf.keras`` model whose output is raw logits
        (the output layer has no activation)
    """
    l2 = tf.keras.regularizers.l2(regularizer)  # shared L2 regularizer
    # Layer factories with the common arguments pre-filled. No initializer is
    # passed, so every layer keeps the Keras default initialization.
    conv2d = partial(tf.keras.layers.Conv2D,activation='relu',padding='same',kernel_regularizer=l2,bias_regularizer=l2)
    fc = partial(tf.keras.layers.Dense,activation='relu',kernel_regularizer=l2,bias_regularizer=l2)
    maxpool=tf.keras.layers.MaxPooling2D
    x_input = tf.keras.layers.Input(shape=(28,28,1),name='input_node')
    x = conv2d(128,(5,5))(x_input)
    x = maxpool((2,2))(x)
    x = conv2d(256,(5,5))(x)
    x = maxpool((2,2))(x)
    x = tf.keras.layers.Flatten()(x)
    x = fc(128)(x)
    # The output width follows num_classes instead of a hard-coded 10.
    x_output=fc(num_classes,activation=None,name='output_node')(x)
    model = tf.keras.models.Model(inputs=x_input,outputs=x_output)
    return model

model = mymodel(args.num_classes,args.regularizer)
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()
# Save a diagram of the architecture (with tensor shapes) next to the logs.
tf.keras.utils.plot_model(model, to_file = os.path.join(log_dir, 'model_arch.png'), show_shapes = True)
# Save the architecture as JSON next to the logs.
model_json = model.to_json()
with open(os.path.join(log_dir, 'model_json.json'), 'w') as json_file:
    json_file.write(model_json)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_node (InputLayer)      [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 28, 28, 128)       3328      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 14, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 14, 256)       819456    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 7, 7, 256)         0         
_________________________________________________________________
flatten (Flatten)            (None, 12544)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               1605760   
_________________________________________________________________
output_node (Dense)          (None, 10)                1290      
=================================================================
Total params: 2,429,834
Trainable params: 2,429,834
Non-trainable params: 0
_________________________________________________________________
None

3.4、准备优化及相关训练所用函数

# Optimization
# Number of optimizer steps between two learning-rate decays.
decay_steps = int(train_steps_per_epoch * args.num_epochs_per_decay)

# Exponential decay applied in discrete steps (staircase) every decay_steps steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    args.initial_learning_rate,
    decay_steps = decay_steps,
    decay_rate = args.learning_rate_decay_factor,
    staircase = True)
# optimizer = tf.keras.optimizers.SGD(learning_rate = lr_schedule, momentum = 0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate = lr_schedule)

# The model, optimizer and every metric object are passed in by the caller.
@tf.function
def train_on_batch(model,optimizer,datas,labels,
                   train_batch_acc,
                   train_batch_total_loss,
                   train_batch_celoss,
                   train_batch_regloss,
                   train_epoch_acc,
                   train_epoch_ce_loss):
    """
    Run one optimization step on a batch and update the training metrics.

    :param model: model to train
    :param optimizer: optimizer that applies the gradients
    :param datas: one batch of input data
    :param labels: labels matching ``datas``
    :param train_batch_acc: accuracy metric of the current batch
    :param train_batch_total_loss: metric for cross-entropy plus regularization loss
    :param train_batch_celoss: metric for the cross-entropy loss
    :param train_batch_regloss: metric for the regularization loss
    :param train_epoch_acc: accuracy metric accumulated by the caller over an epoch
    :param train_epoch_ce_loss: cross-entropy metric accumulated by the caller over an epoch
    :return: gradients: gradient of the total loss for every trainable variable
    """
    with tf.GradientTape() as tape:
        # training=True so that layers with train-time behavior run in training mode.
        logits = model(datas, training=True)
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(labels, logits)
        # tf.add_n rejects an empty list, so fall back to zero when the model
        # defines no regularization losses.
        reg_terms = model.losses
        reg_loss = tf.add_n(reg_terms) if reg_terms else tf.zeros_like(ce_loss)
        total_loss = ce_loss + reg_loss  # cross-entropy plus regularization
    gradients = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_batch_acc(labels, logits)
    train_batch_total_loss(total_loss)
    train_batch_celoss(ce_loss)
    train_batch_regloss(reg_loss)
    train_epoch_acc(labels, logits)
    train_epoch_ce_loss(ce_loss)
    return gradients

@tf.function
def test_on_batch(model, datas, labels, val_epoch_acc, test_epoch_loss):
    """
    Evaluate one batch and update the validation metrics.

    :param model: model to evaluate
    :param datas: one batch of input data
    :param labels: labels matching ``datas``
    :param val_epoch_acc: accuracy metric updated with this batch
    :param test_epoch_loss: loss metric updated with this batch
    :return preds: predicted class index for every sample of the batch
    """
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    scores = model(datas)
    # Update both running metrics with the results of this batch.
    test_epoch_loss(cross_entropy(labels, scores))
    val_epoch_acc(labels, scores)
    # The predicted class is the index of the largest logit.
    return tf.argmax(scores, axis = -1)

3.5、开始训练并验证模型

# Summarizes all results once training has finished.
def plot_acc_loss(history=None,log_dir=None):
    """
    Plot the per-epoch accuracy and loss curves and save them as ``training.png``.

    :param history: object exposing ``train_epoch_acc``, ``val_epoch_acc``,
        ``train_epoch_loss`` and ``val_epoch_loss``, one value per epoch
    :param log_dir: directory in which ``training.png`` is written
    :return: None
    """
    plt.figure(figsize=(8,8))
    epoch_axis = np.arange(len(history.train_epoch_acc))

    # Upper panel: accuracy, drawn as a line with a marker on every epoch.
    plt.subplot(2,1,1)
    for caption, values in (('Training Accuracy', history.train_epoch_acc),
                            ('Validation Accuracy', history.val_epoch_acc)):
        plt.plot(epoch_axis, values, label = caption)
        plt.scatter(epoch_axis, values)
    plt.legend(loc = 'lower right')
    plt.ylabel('Accuracy')
    # Keep the automatic lower limit but pin the upper limit to 1.
    lowest = min(plt.ylim())
    plt.ylim([lowest, 1])
    plt.title('Training and Validation Accuracy')

    # Lower panel: cross-entropy loss, drawn the same way.
    plt.subplot(2, 1, 2)
    for caption, values in (('Training Loss', history.train_epoch_loss),
                            ('Validation Loss', history.val_epoch_loss)):
        plt.plot(epoch_axis, values, label = caption)
        plt.scatter(epoch_axis, values)
    plt.legend(loc = 'upper right')
    plt.ylabel('Cross Entropy')
    plt.title('Training and Validation Loss')
    plt.xlabel('epoch')

    target_file = os.path.join(log_dir,'training.png')
    plt.savefig(target_file)

# Records and prints the evaluation results after every epoch.
def print_metrics(labels, predictions, target_names, save = False, save_path = None, epoch = None, train_time = None,
                  test_time = None):
    """
    Print (and optionally append to a file) the confusion matrix and the
    classification report of one evaluation pass.

    :param labels: true integer class ids
    :param predictions: predicted integer class ids, same length as ``labels``
    :param target_names: class names indexed by class id
    :param save: whether the report is appended to ``validation_result.txt``
    :param save_path: directory of ``validation_result.txt`` (needed when ``save`` is true)
    :param epoch: epoch number shown in the report
    :param train_time: duration of the training epoch in seconds
    :param test_time: duration of the evaluation pass in seconds
    :return: confusion matrix as a pandas DataFrame labelled with the class names
    """
    assert len(predictions) == len(labels)
    # Sorted explicitly: set iteration order is arbitrary (list({1, 8}) is
    # [8, 1]), whereas the confusion matrix rows/columns follow ascending class
    # id, so unsorted names could be attached to the wrong rows.
    class_ids = sorted(set(predictions) | set(labels))
    confusion_result = confusion_matrix(labels, predictions, labels = class_ids)

    # Widen the pandas display limits so the whole matrix is printed.
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1500)

    target_names = [target_names[i] for i in class_ids]
    confusion_result = pd.DataFrame(confusion_result, index = target_names, columns = target_names)
    # classification report
    report = classification_report(labels, predictions, labels = class_ids, target_names = target_names,
                                   digits = 4)
    # Durations arrive in seconds and are reported in minutes.
    result_report = 'Epoch:{} with train_time:{:2f}min and test_time:{:2f}min\n' \
                    'Confuse_matrix:\n{}\n\nClassification_report:\n{} \n'.format(epoch,
                                                                                  train_time / 60,
                                                                                  test_time / 60,
                                                                                  confusion_result,
                                                                                  report)
    print(result_report)
    if save:
        savepath = os.path.join(save_path, "validation_result.txt")
        # Opened in append mode: every call with the same save_path adds to the same file.
        print('the result saved in %s' % savepath)
        with open(savepath, 'a') as f:
            f.write(result_report)
    return confusion_result

把confusion matrix画成图保存，并且把图存到tensorboard中一份

def plot_to_image(figure, log_dir, epoch):
    """Converts the matplotlib plot specified by 'figure' to a PNG image and
    returns it. The supplied figure is closed and inaccessible after this call.

    :param figure: matplotlib figure to convert
    :param log_dir: directory that also receives ``confusion_matrix_epoch<epoch>.png``
    :param epoch: epoch number used in the saved file name
    :return: decoded 4-channel image tensor with a leading batch dimension
    """
    # Save the plot to a PNG in memory. The figure passed in is rendered,
    # not whichever figure pyplot currently considers active.
    buf = io.BytesIO()
    figure.savefig(buf, format = 'png')
    # Keep a copy of the same figure on disk.
    figure.savefig(os.path.join(log_dir, 'confusion_matrix_epoch%d.png' % epoch))
    # Closing the figure prevents it from being displayed directly inside
    # the notebook.
    plt.close(figure)
    buf.seek(0)
    # Convert PNG buffer to TF image
    image = tf.image.decode_png(buf.getvalue(), channels = 4)
    # Add the batch dimension
    image = tf.expand_dims(image, 0)
    return image


def plot_confusion_matrix(cm, class_names):
    """
    Returns a matplotlib figure containing the plotted confusion matrix.

    The cells are colored by raw count and annotated with the row-normalized
    value (share of the true class), rounded to two decimals.

    Args:
    cm (array or DataFrame, shape = [n, n]): a confusion matrix of integer classes
    class_names (array, shape = [n]): String names of the integer classes
    """
    # Work on a plain float array so ndarray and DataFrame input behave the same.
    counts = np.asarray(cm, dtype = float)

    figure = plt.figure(figsize = (8, 8))
    plt.imshow(counts, interpolation = 'nearest', cmap = plt.cm.Blues)
    plt.title("Confusion matrix")
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation = 45)
    plt.yticks(tick_marks, class_names)

    # Normalize the confusion matrix row by row; rows without any sample stay
    # at 0 instead of producing NaN through a division by zero.
    row_totals = counts.sum(axis = 1, keepdims = True)
    shares = np.zeros_like(counts)
    np.divide(counts, row_totals, out = shares, where = row_totals != 0)
    shares = np.around(shares, decimals = 2)

    # Use white text if squares are dark; otherwise black. The darkness of a
    # square follows the raw counts drawn by imshow, so one global threshold
    # on the counts is used.
    threshold = counts.max() / 2.
    for i, j in itertools.product(range(counts.shape[0]), range(counts.shape[1])):
        color = "white" if counts[i, j] > threshold else "black"
        plt.text(j, i, shares[i, j], horizontalalignment = "center", color = color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure

# Training metrics: the "batch" ones are reset by the training loop after
# every batch, the "epoch" ones after every epoch.
train_batch_acc = tf.keras.metrics.SparseCategoricalAccuracy()
train_batch_total_loss = tf.keras.metrics.Mean()  # sum of the cross-entropy loss and the regularization loss
train_batch_celoss = tf.keras.metrics.Mean()  # cross-entropy loss
train_batch_regloss = tf.keras.metrics.Mean()  # regularization loss
train_epoch_acc = tf.keras.metrics.SparseCategoricalAccuracy()
train_epoch_ce_loss = tf.keras.metrics.Mean()

# Validation metrics, accumulated over one pass of the validation set.
val_epoch_acc = tf.keras.metrics.SparseCategoricalAccuracy()
val_epoch_loss = tf.keras.metrics.Mean()

# One value per epoch, later bundled into a History tuple.
train_acc = []
train_loss = []
val_acc = []
val_loss = []
# Template of the progress line printed during training.
format_str = (
    '%s: step:%-6d epoch:%-6.3f/%d celoss:%-5.3f regloss:%-7.4f total_loss:%-6.3f '
    'batch_acc:%-5.2f%% epoch_acc:%-5.2f%% epoch_loss:%-6.3f (%.1f examples/sec; %-3.2f sec/batch)')
# format_str=('%s:step:%d, epoch:%.4f/%d loss:%.2f lr:%-7.5f train_batch_acc:%5.2f (%.1f examples/sec; %.3f sec/batch)')

# Histograms are written every tenth of an epoch; at least 1 so that very short
# epochs do not produce a zero interval (ZeroDivisionError in the modulo).
histogram_interval = max(1, int(train_steps_per_epoch * 0.1))

for epoch in range(args.epochs):

    print("Do training Epoch=%d/%d on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:" % (epoch + 1, args.epochs))

    start = time.time()
    for i, (data,label) in enumerate(train_ds):
        # Trace/profile only the second batch of the first epoch (optional):
        # graph=True also writes the graph structure to the log.
        if (epoch==0 and i==1):
            tf.summary.trace_on(graph = True, profiler = True)

        start_time = time.time()
        # One optimization step; the metrics are updated inside train_on_batch.
        grads = train_on_batch(model, optimizer, data, label, train_batch_acc,
                               train_batch_total_loss,
                               train_batch_celoss,
                               train_batch_regloss,
                               train_epoch_acc,
                               train_epoch_ce_loss)

        duration = time.time() - start_time
        if (epoch==0 and i==1):
            with train_summary_writer.as_default():
                # Write the trace to the training log directory, then stop tracing.
                tf.summary.trace_export(name = "model_trace", step = 1,
                                        profiler_outdir = train_log_dir)
                tf.summary.trace_off()

        # Console line and scalar summaries every 100 steps; logging more often
        # would slow training down.
        if (i + 1) % 100 == 0:
            examples_per_sec = args.batch_size / duration
            # Completed epochs plus the fraction of the current epoch.
            current_epoch = epoch + (i + 1) / train_steps_per_epoch
            print(format_str % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), i + 1,
                                current_epoch, args.epochs,
                                train_batch_celoss.result(),
                                train_batch_regloss.result(),
                                train_batch_total_loss.result(),
                                100 * train_batch_acc.result(),
                                100*train_epoch_acc.result(),
                                train_epoch_ce_loss.result(),
                                examples_per_sec, duration))
            # Global step counted over all epochs.
            step = tf.constant(epoch * train_steps_per_epoch + i + 1)
            with train_summary_writer.as_default():
                tf.summary.scalar('train_batch_accuracy', train_batch_acc.result(), step = step)
                tf.summary.scalar('train_batch_celoss', train_batch_celoss.result(), step = step)
                tf.summary.scalar('train_batch_regloss', train_batch_regloss.result(), step = step)
                tf.summary.scalar('train_batch_total_loss', train_batch_total_loss.result(), step = step)
                tf.summary.scalar('train_epoch_acc', train_epoch_acc.result(), step = step)
                tf.summary.scalar('train_epoch_total_loss', train_epoch_ce_loss.result(), step = step)
                train_summary_writer.flush()

        if ((i + 1) % histogram_interval)==0:
            step = tf.constant(epoch * train_steps_per_epoch + i + 1)
            # Histograms of every trainable variable and of its gradient; written
            # only every 0.1 epoch because the log files grow quickly.
            with train_summary_writer.as_default():
                for grad, variable in zip(grads, model.trainable_variables):
                    v_name = variable.name.replace(':', '_')
                    tf.summary.histogram(v_name, variable, step = step)
                    tf.summary.histogram('{}_grad'.format(v_name), grad, step = step)

                train_summary_writer.flush()

        # The batch metrics describe a single batch only.
        train_batch_acc.reset_states()
        train_batch_celoss.reset_states()
        train_batch_regloss.reset_states()
        train_batch_total_loss.reset_states()

    end = time.time() - start
    print("Training Epoch:{}/{} loss:{:.2f} acc:{:.2f} fineshed usetime:{:.1f} sec".format(epoch + 1,
                                                                                           args.epochs,
                                                                                           train_epoch_ce_loss.result().numpy(),
                                                                                           train_epoch_acc.result().numpy(),
                                                                                           end))

    # Store the epoch results, then reset the epoch metrics for the next epoch.
    train_acc.append(train_epoch_acc.result().numpy())
    train_loss.append(train_epoch_ce_loss.result().numpy())
    train_epoch_acc.reset_states()
    train_epoch_ce_loss.reset_states()
    # One training epoch is finished.
    train_summary_writer.flush()

    # Evaluate on the validation set.
    print("Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:")
    all_labels = []
    all_preds = []
    start_time = time.time()
    for i,(data,label) in enumerate(validation_ds):
        preds = test_on_batch(model, data, label, val_epoch_acc, val_epoch_loss)
        all_preds.extend(preds.numpy().flatten().tolist())
        all_labels.extend(label.numpy().flatten().tolist())
        sys.stdout.write('\r %d / %d finished !' %(i+1,validation_steps_per_epoch))
    duration = time.time() - start_time
    print("Epoch %d: test_acc:%.3f test_loss:%.3f  total_time:%d sec " % ((epoch + 1),
                                                                         val_epoch_acc.result(),
                                                                         val_epoch_loss.result(),
                                                                         int(duration)))

    acc = val_epoch_acc.result().numpy()
    val_acc.append(acc)
    val_loss.append(val_epoch_loss.result().numpy())

    cm = print_metrics(all_labels, all_preds, class_names, True, validation_log_dir, (epoch + 1), train_time = end,
                       test_time = duration)
    figure = plot_confusion_matrix(cm, class_names = class_names)
    cm_image = plot_to_image(figure, validation_log_dir, epoch + 1)  # also saves the picture to the folder
    # The validation summaries are written exactly once per epoch, at the
    # 1-based epoch number.
    with validation_summary_writer.as_default():
        tf.summary.scalar('val_loss', val_epoch_loss.result(), step = epoch + 1)
        tf.summary.scalar('val_accuracy', val_epoch_acc.result(), step = epoch + 1)
        tf.summary.image("Confusion Matrix", cm_image, step = epoch + 1)  # confusion matrix shown in TensorBoard
        validation_summary_writer.flush()
    # Save the model at the end of every epoch.
    print("Model saved at Epoch %d end ." % (epoch + 1,))
    model.save(save_path.format((epoch + 1), acc))
    val_epoch_acc.reset_states()
    val_epoch_loss.reset_states()
    # Evaluation is finished.
    validation_summary_writer.flush()

# Bundle the per-epoch curves, plot them into log_dir, then flush and close both summary writers.
history = History(train_epoch_acc = train_acc, train_epoch_loss = train_loss, val_epoch_acc = val_acc,
                  val_epoch_loss = val_loss)
plot_acc_loss(history = history, log_dir = log_dir)
train_summary_writer.flush()
validation_summary_writer.flush()
train_summary_writer.close()
validation_summary_writer.close()

Do training Epoch=1/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:09:27: step:100    epoch:0.333 /10 celoss:0.372 regloss:0.0437  total_loss:0.416  batch_acc:88.00% epoch_acc:75.10% epoch_loss:0.664  (38154.3 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:29: step:200    epoch:0.667 /10 celoss:0.333 regloss:0.0458  total_loss:0.379  batch_acc:89.50% epoch_acc:80.93% epoch_loss:0.521  (35259.6 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:31: step:300    epoch:1.000 /10 celoss:0.360 regloss:0.0473  total_loss:0.407  batch_acc:88.00% epoch_acc:83.34% epoch_loss:0.457  (33926.3 examples/sec; 0.01 sec/batch)
Training Epoch:1/10 loss:0.46 acc:0.83 fineshed usetime:8.0 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 1: test_acc:0.877 test_loss:0.335  total_time:0 sec 
Epoch:1 with train_time:0.133957min and test_time:0.005964min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          746        0        19     28    12       2    184        0    9           0
Trouser                0      970         0     18     7       0      3        0    2           0
Pullover               6        1       698      4   191       0     99        0    1           0
Dress                 12        1         7    874    59       0     47        0    0           0
Coat                   1        1        17     20   898       0     62        0    1           0
Sandal                 0        0         0      1     0     980      0        8    0          11
Shirt                 82        1        43     27   101       0    736        0   10           0
Sneaker                0        0         0      0     0      35      0      952    0          13
Bag                    1        1         8      3     7       2      6        4  968           0
Ankle boot             0        0         0      0     0       4      1       48    0         947

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8797    0.7460    0.8074      1000
     Trouser     0.9949    0.9700    0.9823      1000
    Pullover     0.8813    0.6980    0.7790      1000
       Dress     0.8964    0.8740    0.8851      1000
        Coat     0.7043    0.8980    0.7895      1000
      Sandal     0.9580    0.9800    0.9689      1000
       Shirt     0.6467    0.7360    0.6885      1000
     Sneaker     0.9407    0.9520    0.9463      1000
         Bag     0.9768    0.9680    0.9724      1000
  Ankle boot     0.9753    0.9470    0.9609      1000

    accuracy                         0.8769     10000
   macro avg     0.8854    0.8769    0.8780     10000
weighted avg     0.8854    0.8769    0.8780     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 1 end .
Do training Epoch=2/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:09:34: step:100    epoch:1.167 /10 celoss:0.257 regloss:0.0484  total_loss:0.305  batch_acc:91.50% epoch_acc:89.25% epoch_loss:0.293  (33302.7 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:36: step:200    epoch:1.333 /10 celoss:0.229 regloss:0.0490  total_loss:0.278  batch_acc:93.00% epoch_acc:89.62% epoch_loss:0.288  (33259.1 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:37: step:300    epoch:1.500 /10 celoss:0.210 regloss:0.0499  total_loss:0.260  batch_acc:92.50% epoch_acc:89.82% epoch_loss:0.281  (34271.4 examples/sec; 0.01 sec/batch)
Training Epoch:2/10 loss:0.28 acc:0.90 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 2: test_acc:0.899 test_loss:0.274  total_time:0 sec 
Epoch:2 with train_time:0.093418min and test_time:0.003458min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          875        0        29     12     6       1     69        0    8           0
Trouser                4      973         0     16     3       0      2        0    2           0
Pullover              13        0       839      8    85       0     54        0    1           0
Dress                 32        0        10    913    22       0     20        0    3           0
Coat                   1        1        56     38   848       0     56        0    0           0
Sandal                 0        0         0      0     0     972      0       16    0          12
Shirt                157        0        68     23    78       0    662        0   12           0
Sneaker                0        0         0      0     0       7      0      979    0          14
Bag                    3        1         2      2     4       4      1        5  978           0
Ankle boot             0        0         0      0     0       3      1       43    0         953

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8065    0.8750    0.8393      1000
     Trouser     0.9979    0.9730    0.9853      1000
    Pullover     0.8357    0.8390    0.8373      1000
       Dress     0.9022    0.9130    0.9076      1000
        Coat     0.8107    0.8480    0.8289      1000
      Sandal     0.9848    0.9720    0.9784      1000
       Shirt     0.7653    0.6620    0.7099      1000
     Sneaker     0.9386    0.9790    0.9584      1000
         Bag     0.9741    0.9780    0.9760      1000
  Ankle boot     0.9734    0.9530    0.9631      1000

    accuracy                         0.8992     10000
   macro avg     0.8989    0.8992    0.8984     10000
weighted avg     0.8989    0.8992    0.8984     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 2 end .
Do training Epoch=3/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:09:40: step:100    epoch:2.111 /10 celoss:0.238 regloss:0.0506  total_loss:0.288  batch_acc:91.50% epoch_acc:90.84% epoch_loss:0.252  (32027.4 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:42: step:200    epoch:2.222 /10 celoss:0.239 regloss:0.0512  total_loss:0.290  batch_acc:89.50% epoch_acc:90.93% epoch_loss:0.248  (33647.3 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:44: step:300    epoch:2.333 /10 celoss:0.297 regloss:0.0522  total_loss:0.349  batch_acc:89.00% epoch_acc:91.04% epoch_loss:0.244  (32278.8 examples/sec; 0.01 sec/batch)
Training Epoch:3/10 loss:0.24 acc:0.91 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 3: test_acc:0.908 test_loss:0.255  total_time:0 sec 
Epoch:3 with train_time:0.093433min and test_time:0.003803min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          892        0        11     13     1       2     78        0    3           0
Trouser                3      973         0     20     0       0      2        0    2           0
Pullover              13        1       869     11    45       0     60        0    1           0
Dress                 22        0         8    931     6       0     32        0    1           0
Coat                   2        1        84     44   787       0     82        0    0           0
Sandal                 0        0         0      0     0     989      0        7    0           4
Shirt                124        1        57     25    47       0    740        0    6           0
Sneaker                0        0         0      0     0      20      0      959    0          21
Bag                    4        1         3      4     2       3      3        3  977           0
Ankle boot             0        0         0      0     0       6      1       28    0         965

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8415    0.8920    0.8660      1000
     Trouser     0.9959    0.9730    0.9843      1000
    Pullover     0.8421    0.8690    0.8553      1000
       Dress     0.8884    0.9310    0.9092      1000
        Coat     0.8863    0.7870    0.8337      1000
      Sandal     0.9696    0.9890    0.9792      1000
       Shirt     0.7415    0.7400    0.7407      1000
     Sneaker     0.9619    0.9590    0.9604      1000
         Bag     0.9869    0.9770    0.9819      1000
  Ankle boot     0.9747    0.9650    0.9698      1000

    accuracy                         0.9082     10000
   macro avg     0.9089    0.9082    0.9081     10000
weighted avg     0.9089    0.9082    0.9081     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 3 end .
Do training Epoch=4/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:09:46: step:100    epoch:3.083 /10 celoss:0.159 regloss:0.0525  total_loss:0.212  batch_acc:94.00% epoch_acc:92.10% epoch_loss:0.214  (35415.9 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:48: step:200    epoch:3.167 /10 celoss:0.220 regloss:0.0530  total_loss:0.273  batch_acc:92.00% epoch_acc:92.02% epoch_loss:0.216  (31278.6 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:50: step:300    epoch:3.250 /10 celoss:0.315 regloss:0.0539  total_loss:0.369  batch_acc:88.50% epoch_acc:92.11% epoch_loss:0.215  (33379.5 examples/sec; 0.01 sec/batch)
Training Epoch:4/10 loss:0.21 acc:0.92 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 4: test_acc:0.899 test_loss:0.279  total_time:0 sec 
Epoch:4 with train_time:0.093070min and test_time:0.003471min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          938        0        17      8     2       2     30        0    3           0
Trouser                6      976         0     14     0       0      2        0    2           0
Pullover              21        1       922      8    31       0     17        0    0           0
Dress                 50        0        11    909    14       0     16        0    0           0
Coat                   2        1       135     39   789       0     33        0    1           0
Sandal                 0        0         0      0     0     966      0       24    0          10
Shirt                228        0       117     16    61       0    572        0    6           0
Sneaker                0        0         0      0     0       3      0      987    0          10
Bag                    7        1         6      3     2       2      0        5  974           0
Ankle boot             0        0         0      0     0       4      1       38    0         957

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.7492    0.9380    0.8330      1000
     Trouser     0.9969    0.9760    0.9864      1000
    Pullover     0.7632    0.9220    0.8351      1000
       Dress     0.9117    0.9090    0.9104      1000
        Coat     0.8776    0.7890    0.8310      1000
      Sandal     0.9887    0.9660    0.9772      1000
       Shirt     0.8525    0.5720    0.6846      1000
     Sneaker     0.9364    0.9870    0.9611      1000
         Bag     0.9878    0.9740    0.9809      1000
  Ankle boot     0.9795    0.9570    0.9681      1000

    accuracy                         0.8990     10000
   macro avg     0.9044    0.8990    0.8968     10000
weighted avg     0.9044    0.8990    0.8968     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 4 end .
Do training Epoch=5/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:09:53: step:100    epoch:4.067 /10 celoss:0.194 regloss:0.0543  total_loss:0.248  batch_acc:93.00% epoch_acc:92.46% epoch_loss:0.201  (35447.3 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:54: step:200    epoch:4.133 /10 celoss:0.170 regloss:0.0546  total_loss:0.224  batch_acc:95.50% epoch_acc:92.81% epoch_loss:0.197  (31472.2 examples/sec; 0.01 sec/batch)
2019-12-19 19:09:56: step:300    epoch:4.200 /10 celoss:0.204 regloss:0.0552  total_loss:0.259  batch_acc:93.50% epoch_acc:92.96% epoch_loss:0.193  (32910.7 examples/sec; 0.01 sec/batch)
Training Epoch:5/10 loss:0.19 acc:0.93 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 5: test_acc:0.910 test_loss:0.249  total_time:0 sec 
Epoch:5 with train_time:0.093259min and test_time:0.003784min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          947        0         9      6     3       1     31        0    3           0
Trouser                5      980         0     11     1       0      1        0    2           0
Pullover              27        2       877      5    47       0     41        0    1           0
Dress                 53        1         9    891    32       0     14        0    0           0
Coat                   4        1        60     16   855       0     63        0    1           0
Sandal                 0        0         0      0     0     982      0       12    0           6
Shirt                212        0        67     13    51       0    652        0    5           0
Sneaker                0        0         0      0     0       4      0      989    0           7
Bag                    8        1         3      3     3       3      0        4  975           0
Ankle boot             0        0         0      0     0       7      1       44    0         948

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.7540    0.9470    0.8395      1000
     Trouser     0.9949    0.9800    0.9874      1000
    Pullover     0.8556    0.8770    0.8662      1000
       Dress     0.9429    0.8910    0.9162      1000
        Coat     0.8619    0.8550    0.8584      1000
      Sandal     0.9850    0.9820    0.9835      1000
       Shirt     0.8120    0.6520    0.7232      1000
     Sneaker     0.9428    0.9890    0.9653      1000
         Bag     0.9878    0.9750    0.9814      1000
  Ankle boot     0.9865    0.9480    0.9669      1000

    accuracy                         0.9096     10000
   macro avg     0.9123    0.9096    0.9088     10000
weighted avg     0.9123    0.9096    0.9088     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 5 end .
Do training Epoch=6/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:09:59: step:100    epoch:5.056 /10 celoss:0.152 regloss:0.0560  total_loss:0.209  batch_acc:93.50% epoch_acc:93.50% epoch_loss:0.178  (35207.8 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:01: step:200    epoch:5.111 /10 celoss:0.148 regloss:0.0563  total_loss:0.204  batch_acc:95.00% epoch_acc:93.56% epoch_loss:0.179  (33584.0 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:02: step:300    epoch:5.167 /10 celoss:0.224 regloss:0.0569  total_loss:0.281  batch_acc:92.00% epoch_acc:93.69% epoch_loss:0.175  (33723.0 examples/sec; 0.01 sec/batch)
Training Epoch:6/10 loss:0.17 acc:0.94 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 6: test_acc:0.910 test_loss:0.247  total_time:0 sec 
Epoch:6 with train_time:0.094112min and test_time:0.003806min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          949        0         8      8     1       1     27        0    6           0
Trouser                3      988         0      6     0       0      1        0    2           0
Pullover              23        2       914      8    32       0     20        0    1           0
Dress                 50        2        11    913    16       0      8        0    0           0
Coat                   3        1       102     32   820       0     40        0    2           0
Sandal                 0        0         0      0     0     989      0        6    0           5
Shirt                220        1        90     16    57       0    604        0   12           0
Sneaker                0        0         0      0     0      12      0      974    0          14
Bag                    4        1         1      4     0       2      0        1  987           0
Ankle boot             0        0         0      0     0       8      1       26    0         965

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.7580    0.9490    0.8428      1000
     Trouser     0.9930    0.9880    0.9905      1000
    Pullover     0.8117    0.9140    0.8598      1000
       Dress     0.9250    0.9130    0.9190      1000
        Coat     0.8855    0.8200    0.8515      1000
      Sandal     0.9773    0.9890    0.9831      1000
       Shirt     0.8616    0.6040    0.7102      1000
     Sneaker     0.9672    0.9740    0.9706      1000
         Bag     0.9772    0.9870    0.9821      1000
  Ankle boot     0.9807    0.9650    0.9728      1000

    accuracy                         0.9103     10000
   macro avg     0.9137    0.9103    0.9082     10000
weighted avg     0.9137    0.9103    0.9082     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 6 end .
Do training Epoch=7/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:10:05: step:100    epoch:6.048 /10 celoss:0.157 regloss:0.0577  total_loss:0.214  batch_acc:93.50% epoch_acc:93.94% epoch_loss:0.165  (34723.9 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:07: step:200    epoch:6.095 /10 celoss:0.105 regloss:0.0578  total_loss:0.163  batch_acc:95.00% epoch_acc:93.98% epoch_loss:0.165  (33531.6 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:09: step:300    epoch:6.143 /10 celoss:0.181 regloss:0.0587  total_loss:0.240  batch_acc:92.50% epoch_acc:94.07% epoch_loss:0.162  (37954.1 examples/sec; 0.01 sec/batch)
Training Epoch:7/10 loss:0.16 acc:0.94 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 7: test_acc:0.916 test_loss:0.230  total_time:0 sec 
Epoch:7 with train_time:0.093442min and test_time:0.003851min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          914        0        16      7     2       1     55        0    5           0
Trouser                2      991         0      4     1       0      1        0    1           0
Pullover              17        1       887      7    33       0     54        0    1           0
Dress                 33        1         8    926    21       0     11        0    0           0
Coat                   2        1        72     23   827       0     75        0    0           0
Sandal                 0        0         0      0     0     964      0       23    0          13
Shirt                149        1        56     22    53       0    713        0    6           0
Sneaker                0        0         0      0     0       1      0      986    1          12
Bag                    5        2         2      4     1       1      1        0  984           0
Ankle boot             0        0         0      0     0       3      1       30    0         966

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8146    0.9140    0.8615      1000
     Trouser     0.9940    0.9910    0.9925      1000
    Pullover     0.8521    0.8870    0.8692      1000
       Dress     0.9325    0.9260    0.9293      1000
        Coat     0.8817    0.8270    0.8535      1000
      Sandal     0.9938    0.9640    0.9787      1000
       Shirt     0.7827    0.7130    0.7462      1000
     Sneaker     0.9490    0.9860    0.9671      1000
         Bag     0.9860    0.9840    0.9850      1000
  Ankle boot     0.9748    0.9660    0.9704      1000

    accuracy                         0.9158     10000
   macro avg     0.9161    0.9158    0.9153     10000
weighted avg     0.9161    0.9158    0.9153     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 7 end .
Do training Epoch=8/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:10:11: step:100    epoch:7.042 /10 celoss:0.140 regloss:0.0589  total_loss:0.199  batch_acc:93.50% epoch_acc:94.71% epoch_loss:0.146  (32723.3 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:13: step:200    epoch:7.083 /10 celoss:0.171 regloss:0.0591  total_loss:0.230  batch_acc:96.00% epoch_acc:94.65% epoch_loss:0.148  (33557.1 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:15: step:300    epoch:7.125 /10 celoss:0.123 regloss:0.0597  total_loss:0.183  batch_acc:96.00% epoch_acc:94.78% epoch_loss:0.145  (34612.2 examples/sec; 0.01 sec/batch)
Training Epoch:8/10 loss:0.14 acc:0.95 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 8: test_acc:0.921 test_loss:0.225  total_time:0 sec 
Epoch:8 with train_time:0.092996min and test_time:0.003475min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          870        0        16     12     5       1     90        0    6           0
Trouser                0      984         1     10     2       0      1        0    2           0
Pullover              14        1       862      7    78       0     37        0    1           0
Dress                 14        1         9    933    28       0     15        0    0           0
Coat                   2        0        21     24   928       0     25        0    0           0
Sandal                 1        0         0      0     0     966      0       22    0          11
Shirt                 85        1        53     21    95       0    739        0    6           0
Sneaker                0        0         0      0     0       3      0      978    0          19
Bag                    3        2         2      3     2       1      1        2  984           0
Ankle boot             0        0         0      0     0       4      2       24    0         970

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8797    0.8700    0.8748      1000
     Trouser     0.9949    0.9840    0.9894      1000
    Pullover     0.8942    0.8620    0.8778      1000
       Dress     0.9238    0.9330    0.9284      1000
        Coat     0.8155    0.9280    0.8681      1000
      Sandal     0.9908    0.9660    0.9782      1000
       Shirt     0.8121    0.7390    0.7738      1000
     Sneaker     0.9532    0.9780    0.9654      1000
         Bag     0.9850    0.9840    0.9845      1000
  Ankle boot     0.9700    0.9700    0.9700      1000

    accuracy                         0.9214     10000
   macro avg     0.9219    0.9214    0.9211     10000
weighted avg     0.9219    0.9214    0.9211     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 8 end .
Do training Epoch=9/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:10:17: step:100    epoch:8.037 /10 celoss:0.122 regloss:0.0600  total_loss:0.182  batch_acc:95.50% epoch_acc:95.35% epoch_loss:0.130  (33997.8 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:19: step:200    epoch:8.074 /10 celoss:0.142 regloss:0.0602  total_loss:0.203  batch_acc:95.00% epoch_acc:95.23% epoch_loss:0.132  (33875.6 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:21: step:300    epoch:8.111 /10 celoss:0.152 regloss:0.0607  total_loss:0.212  batch_acc:94.50% epoch_acc:95.32% epoch_loss:0.130  (34940.9 examples/sec; 0.01 sec/batch)
Training Epoch:9/10 loss:0.13 acc:0.95 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 9: test_acc:0.917 test_loss:0.236  total_time:0 sec 
Epoch:9 with train_time:0.093057min and test_time:0.003536min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          896        0        18      5     3       1     68        0    9           0
Trouser                4      982         1      7     3       0      1        0    2           0
Pullover              12        1       887      4    66       0     29        0    1           0
Dress                 33        2        13    870    54       0     27        0    1           0
Coat                   2        0        32     10   938       0     18        0    0           0
Sandal                 1        0         0      0     0     981      0       11    0           7
Shirt                123        0        62     13   112       0    682        0    8           0
Sneaker                0        0         0      0     0       4      0      986    0          10
Bag                    2        0         1      1     3       1      1        1  990           0
Ankle boot             0        0         0      0     0       4      2       33    0         961

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8350    0.8960    0.8644      1000
     Trouser     0.9970    0.9820    0.9894      1000
    Pullover     0.8748    0.8870    0.8808      1000
       Dress     0.9560    0.8700    0.9110      1000
        Coat     0.7956    0.9380    0.8609      1000
      Sandal     0.9899    0.9810    0.9854      1000
       Shirt     0.8237    0.6820    0.7462      1000
     Sneaker     0.9564    0.9860    0.9710      1000
         Bag     0.9792    0.9900    0.9846      1000
  Ankle boot     0.9826    0.9610    0.9717      1000

    accuracy                         0.9173     10000
   macro avg     0.9190    0.9173    0.9165     10000
weighted avg     0.9190    0.9173    0.9165     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 9 end .
Do training Epoch=10/10 on train dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
2019-12-19 19:10:24: step:100    epoch:9.033 /10 celoss:0.120 regloss:0.0610  total_loss:0.181  batch_acc:95.50% epoch_acc:95.62% epoch_loss:0.121  (32010.3 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:26: step:200    epoch:9.067 /10 celoss:0.123 regloss:0.0611  total_loss:0.185  batch_acc:93.50% epoch_acc:95.70% epoch_loss:0.120  (35502.8 examples/sec; 0.01 sec/batch)
2019-12-19 19:10:27: step:300    epoch:9.100 /10 celoss:0.094 regloss:0.0615  total_loss:0.156  batch_acc:97.50% epoch_acc:95.79% epoch_loss:0.118  (31961.5 examples/sec; 0.01 sec/batch)
Training Epoch:10/10 loss:0.12 acc:0.96 fineshed usetime:5.6 sec
Do testing on validation dataset>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>:
 50 / 50 finished !Epoch 10: test_acc:0.921 test_loss:0.221  total_time:0 sec 
Epoch:10 with train_time:0.093466min and test_time:0.003928min
Confuse_matrix:
             T-shirt/top  Trouser  Pullover  Dress  Coat  Sandal  Shirt  Sneaker  Bag  Ankle boot
T-shirt/top          880        0        21     12     4       1     76        0    6           0
Trouser                1      981         1     12     2       0      1        0    2           0
Pullover              11        1       917      7    34       0     29        0    1           0
Dress                 13        0         9    939    25       0     14        0    0           0
Coat                   2        0        70     20   873       0     35        0    0           0
Sandal                 1        0         0      0     0     968      0       19    0          12
Shirt                100        1        73     25    71       0    725        0    5           0
Sneaker                0        0         0      0     0       4      0      973    0          23
Bag                    3        1         2      3     3       1      2        3  982           0
Ankle boot             0        0         0      0     0       4      1       22    0         973

Classification_report:
              precision    recall  f1-score   support

 T-shirt/top     0.8704    0.8800    0.8752      1000
     Trouser     0.9970    0.9810    0.9889      1000
    Pullover     0.8390    0.9170    0.8763      1000
       Dress     0.9224    0.9390    0.9306      1000
        Coat     0.8626    0.8730    0.8678      1000
      Sandal     0.9898    0.9680    0.9788      1000
       Shirt     0.8211    0.7250    0.7700      1000
     Sneaker     0.9567    0.9730    0.9648      1000
         Bag     0.9859    0.9820    0.9840      1000
  Ankle boot     0.9653    0.9730    0.9691      1000

    accuracy                         0.9211     10000
   macro avg     0.9210    0.9211    0.9205     10000
weighted avg     0.9210    0.9211    0.9205     10000
 

the result saved in /tmp/out/model_out/logs_20191219_190839/validation/validation_result.txt
Model saved at Epoch 10 end .

【Tensorflow2.0】10、端到端的自定义模型训练custom training

文件结构

!tree /tmp/out/model_out

/tmp/out/model_out
├── hdf5_models_20191219_190839
│   ├── ckpt_epoch01_val_acc0.88.hdf5
│   ├── ckpt_epoch02_val_acc0.90.hdf5
│   ├── ckpt_epoch03_val_acc0.91.hdf5
│   ├── ckpt_epoch04_val_acc0.90.hdf5
│   ├── ckpt_epoch05_val_acc0.91.hdf5
│   ├── ckpt_epoch06_val_acc0.91.hdf5
│   ├── ckpt_epoch07_val_acc0.92.hdf5
│   ├── ckpt_epoch08_val_acc0.92.hdf5
│   ├── ckpt_epoch09_val_acc0.92.hdf5
│   └── ckpt_epoch10_val_acc0.92.hdf5
└── logs_20191219_190839
    ├── model_arch.png
    ├── model_json.json
    ├── train
    │   ├── events.out.tfevents.1576753723.cuda10.6152.8.v2
    │   ├── events.out.tfevents.1576753766.cuda10.profile-empty
    │   └── plugins
    │       └── profile
    │           └── 2019-12-19_19-09-26
    │               └── local.trace
    ├── training.png
    └── validation
        ├── confusion_matrix_epoch10.png
        ├── confusion_matrix_epoch1.png
        ├── confusion_matrix_epoch2.png
        ├── confusion_matrix_epoch3.png
        ├── confusion_matrix_epoch4.png
        ├── confusion_matrix_epoch5.png
        ├── confusion_matrix_epoch6.png
        ├── confusion_matrix_epoch7.png
        ├── confusion_matrix_epoch8.png
        ├── confusion_matrix_epoch9.png
        ├── events.out.tfevents.1576753723.cuda10.6152.16.v2
        └── validation_result.txt

7 directories, 28 files

总结

查看tensorboard只要输入：
tensorboard --logdir=/tmp/out/model_out/logs_20191219_190839 --bind_all

需要说明的是：本文的代码是在 Jupyter 中执行的，可能是由于 eager 模式的原因，TensorBoard 中没有记录计算图（graph）。

目前暂时还没有找到解决这个问题的方法。
后续会补充如何在 Jupyter 中显示 TensorBoard 的代码。