欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

PyTorch:Kaggle房价预测

程序员文章站 2022-06-26 20:01:13
...
#-*- coding:utf-8 -*-

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython import display
import sys
import matplotlib.pyplot as plt

# Data preparation.
# Load the training and test CSV files with pandas.
train_data = pd.read_csv('../data/kaggle_house_pred_train.csv')
test_data = pd.read_csv('../data/kaggle_house_pred_test.csv')

# Stack the feature columns of both sets so they share one preprocessing
# pass (same standardization statistics, same one-hot columns). The first
# column of each frame is skipped, and the last column of the training frame
# is skipped as well (presumably the SalePrice label read below -- confirm
# against the CSV layout).
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Standardize every non-object (numeric) column to zero mean / unit std.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization the column mean is 0, so filling NaN with 0 imputes
# missing values with the mean (and also neutralizes zero-variance columns,
# whose division by std == 0 produced NaN).
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the remaining columns; dummy_na=True adds an indicator
# column for missing values. dtype=float keeps every column numeric:
# newer pandas versions emit bool dummies by default, which would turn
# `.values` into an object array that torch.tensor() cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
# view((-1, 1)) reshapes the labels into a single column.
train_labels = torch.tensor(train_data.SalePrice.values).view((-1, 1))
# Not used by the functions below (train() builds its own dataset); kept so
# the module-level name stays available.
dataset = torch.utils.data.TensorDataset(train_features, train_labels)

# Training criterion: mean squared error, averaged over all elements
# (no 1/2 factor).
loss = nn.MSELoss(reduction='mean')
def get_net(feature_num):
    """Build a single linear layer mapping `feature_num` inputs to 1 output.

    Both the weight and the bias are re-initialized from a normal
    distribution with mean 0 and standard deviation 0.01.
    """
    linear = nn.Linear(feature_num, 1)
    # Weight first, then bias: the same order nn.Linear.parameters() yields,
    # so the random draws are consumed identically.
    for tensor in (linear.weight, linear.bias):
        nn.init.normal_(tensor, mean=0, std=0.01)
    return linear

# Root-mean-squared error between log-predictions and log-labels, used to
# evaluate the model.
def log_rmse(net, features, labels):
    """Return sqrt(mean((log(pred) - log(label)) ** 2)) as a Python float.

    Args:
        net: callable mapping `features` to a tensor of predictions.
        features: input tensor passed to `net`.
        labels: target tensor; must be positive for the logarithm to be
            finite. Integer tensors are accepted (cast to float).

    Predictions below 1 are raised to 1 before taking the logarithm so that
    non-positive outputs do not produce NaN or -inf.

    The squared error is averaged directly, with no extra scaling. The
    earlier `2 * loss(...)` form only suits a loss defined as half the
    squared error; with a plain mean squared error it overstated the metric
    by a factor of sqrt(2).
    """
    # Evaluation only: skip autograd bookkeeping.
    with torch.no_grad():
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        log_diff = clipped_preds.float().log() - labels.float().log()
        rmse = torch.sqrt((log_diff ** 2).mean())
    return rmse.item()

# Train the model with the Adam optimizer.
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate,weight_decay, batch_size):
    """Fit `net` with mini-batch Adam and record the log-RMSE per epoch.

    Args:
        net: model to optimize (converted to float32).
        train_features, train_labels: training tensors, paired along the
            first dimension.
        test_features, test_labels: held-out tensors; when `test_labels` is
            None no held-out metric is recorded.
        num_epochs: number of passes over the training data.
        learning_rate, weight_decay: forwarded to torch.optim.Adam.
        batch_size: mini-batch size of the shuffled DataLoader.

    Returns:
        (train_ls, test_ls): lists with one log_rmse value per epoch;
        `test_ls` is empty when `test_labels` is None.
    """
    train_ls = []
    test_ls = []
    # Pair features and labels sample-wise, then iterate in shuffled batches.
    batches = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(train_features, train_labels),
        batch_size,
        shuffle=True,
    )
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)
    net = net.float()
    for _ in range(num_epochs):
        for batch_x, batch_y in batches:
            # Gradients accumulate across backward() calls, so clear them
            # before each step.
            optimizer.zero_grad()
            batch_loss = loss(net(batch_x.float()), batch_y.float())
            batch_loss.backward()
            optimizer.step()
        # Epoch-end metrics on the full training set (and held-out set).
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is None:
            continue
        test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

# K-fold cross-validation split.
def get_K_fold_data(k, i, X, y):
    """Return the training and validation tensors for fold `i` of `k`.

    The rows of `X` / `y` are cut into `k` contiguous folds of
    `X.shape[0] // k` rows; the last fold also takes the remainder so that
    no sample is silently dropped when the row count is not a multiple of
    `k`. Fold `i` is the validation set, the other folds are concatenated
    (in order) into the training set.

    Args:
        k: number of folds, must be greater than 1.
        i: index of the validation fold, 0 <= i < k.
        X: 2-D feature tensor, one row per sample.
        y: label tensor with the same number of rows as `X`.

    Returns:
        (x_train, y_train, X_valid, y_valid)

    Raises:
        ValueError: if `i` is not a valid fold index.
    """
    assert k > 1, "k-fold cross-validation needs k > 1"
    if not 0 <= i < k:
        raise ValueError('fold index i=%d is out of range for k=%d' % (i, k))
    n_samples = X.shape[0]
    fold_size = n_samples // k
    x_parts, y_parts = [], []
    X_valid, y_valid = None, None
    for j in range(k):
        start = j * fold_size
        # The final fold absorbs the n_samples % k leftover rows.
        stop = n_samples if j == k - 1 else start + fold_size
        x_part, y_part = X[start:stop, :], y[start:stop]
        if j == i:
            X_valid, y_valid = x_part, y_part
        else:
            x_parts.append(x_part)
            y_parts.append(y_part)
    # Concatenate once instead of growing the tensors fold by fold.
    x_train = torch.cat(x_parts, dim=0)
    y_train = torch.cat(y_parts, dim=0)
    return x_train, y_train, X_valid, y_valid

def use_svg_diplay():
    """Ask IPython to render matplotlib figures in SVG format."""
    figure_format = 'svg'
    display.set_matplotlib_formats(figure_format)
def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size.

    Args:
        figsize: value stored under the 'figure.figsize' rcParam.
    """
    use_svg_diplay()
    plt.rcParams.update({'figure.figsize': figsize})

def _has_points(values):
    """Return True when `values` is a non-empty series (None counts as empty)."""
    if values is None:
        return False
    try:
        return len(values) > 0
    except TypeError:
        # Objects without a length are passed through to matplotlib as-is.
        return True


def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    """Plot one or two curves with a logarithmic y axis and show the figure.

    Args:
        x_vals, y_vals: data of the first curve.
        x_label, y_label: axis labels.
        x2_vals, y2_vals: optional second curve, drawn dotted. It is plotted
            only when both are given and non-empty.
        legend: optional legend labels, applied only together with the
            second curve.
        figsize: figure size passed to set_figsize.
    """
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    # Explicit None/length checks instead of truthiness: `if array:` raises
    # for numpy arrays and multi-element tensors.
    if _has_points(x2_vals) and _has_points(y2_vals):
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        # plt.legend(None) is not a valid call; skip it when no labels given.
        if legend is not None:
            plt.legend(legend)
    plt.show()

def k_fold(k, X_train, y_train, num_epochs, learing_rate, weight_decay, batch_size):
    """Run k-fold cross-validation and return the averaged final metrics.

    For each fold a fresh network from get_net is trained with train(); the
    last-epoch training and validation values are averaged over the `k`
    folds. The learning curves of the first fold are plotted, and one
    summary line is printed per fold.

    Returns:
        (mean final training value, mean final validation value)
    """
    final_train, final_valid = [], []
    epochs = range(1, num_epochs + 1)
    n_inputs = X_train.shape[1]
    for fold in range(k):
        x_tr, y_tr, x_va, y_va = get_K_fold_data(k, fold, X_train, y_train)
        model = get_net(n_inputs)
        train_curve, valid_curve = train(model, x_tr, y_tr, x_va, y_va,
                                         num_epochs, learing_rate,
                                         weight_decay, batch_size)
        last_train, last_valid = train_curve[-1], valid_curve[-1]
        final_train.append(last_train)
        final_valid.append(last_valid)
        # Only the first fold's curves are drawn.
        if fold == 0:
            semilogy(epochs, train_curve, 'epochs', 'rmse',
                     epochs, valid_curve, ['train', 'valid'])
        print('flod %d, train rmse %f, valid rmse %f' % (fold, last_train, last_valid))
    return sum(final_train) / k, sum(final_valid) / k
# Hyper-parameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64
train_l, valid_l = k_fold(k, train_features, train_labels,
                          num_epochs, lr, weight_decay, batch_size)
# Report the metrics averaged over all folds.
summary = '%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l)
print(summary)