Pytorch极简入门教程(七)—— 划分训练集和测试集
程序员文章站
2022-05-26 19:19:39
...
# -*- coding: utf-8 -*-
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
# Load the HR dataset, inspect it, one-hot encode the categorical columns and
# build the label tensor Y and the feature tensor X used further down.
data = pd.read_csv("dataset/HR.csv")
print("data.head():\t", data.head())
data.info()
# NOTE(review): data.info() prints its report itself and returns None, so the
# next line prints the report a second time followed by "None".
print("data.info():\t", data.info())
data.part.unique()  # distinct values of the "part" column (result is discarded here)
print("data.part.unique()", data.part.unique())
data.salary.unique()  # distinct values of the "salary" column (result is discarded here)
print("data.salary.unique()", data.salary.unique())
# Row count for every (salary, part) group.
data.groupby(["salary", "part"]).size()
print("data.groupby(['salary', 'part']).size()", data.groupby(['salary', 'part']).size())
# Convert the categorical features to numbers (one-hot encoding).
pd.get_dummies(data.salary)
print("pd.get_dummies(data.salary):\t", pd.get_dummies(data.salary))
# Append the one-hot columns for "salary" to the original frame.
data = data.join(pd.get_dummies(data.salary))
print("data.head()):\t", data.head())
# Append the one-hot columns for "part" to the original frame.
data = data.join(pd.get_dummies(data.part))  # 10 departments -> 10 extra one-hot columns
# Drop the original "salary" and "part" columns now that they are encoded.
del data["salary"]
del data["part"]
data.head()
print("data.head():\t", data.head())
# Look at the label distribution ("left": whether the employee left).
data.left.value_counts()  # value_counts() counts how often each value occurs
print("data.left.value_counts():\n", data.left.value_counts())
# Labels as a single column (one row per sample).
Y_data = data.left.values.reshape(-1, 1)
print("Y_data.shape:\t", Y_data.shape)
# A tensor's dimensions can be read with .size() or the .shape attribute.
Y = torch.from_numpy(Y_data).type(torch.float32)
print("Y.shape", Y.shape)  # == print("Y.size()", Y.size())
# Collect every column that is not the label.
"""
M = [c for c in data.columns if c!= "left"]
print("M:\t", X_data)
"""
X_data = data[[c for c in data.columns if c != 'left']].values  # values of all non-label columns
# Two ways to convert the dtype: on the NumPy side with .astype(np.float32),
# or on the torch side with .type(torch.float32) (the string below says the same).
"""
两种方式进行数据类型转换
如果numpy上转换 则用.astype(np.float32)
如果torch上转换 则用.type(torch.float32)
"""
X = torch.from_numpy(X_data.astype(np.float32))
# Alternative: X =torch.from_numpy(X_data).type(torch.float32)
print("X:\t", X)
print("X.size():\t", X.shape)  # X.size() and X.shape are equivalent
"""""""""""""""""""""""""""""""""""""""""""""""
创建模型:
from torch import nn
自定义模型:
nn.Module: 继承这个类
__init__:初始化所有的层
forward: 定义模型的运算过程 (前向传播的过程)
"""""""""""""""""""""""""""""""""""""""""""""""
"""
# 自定义类 方法一
class Model(nn.Module):
def __init__(self):
super().__init__()
self.Liner_1 = nn.Linear(20, 64)
self.Liner_2 = nn.Linear(64, 64)
self.Liner_3 = nn.Linear(64, 1)
self.relu = nn.ReLU() # 初始化relu
self.sigmoid = nn.Sigmoid() # 初始化sigmoid
def forward(self, input):
x = self.Liner_1(input)
x = self.relu(x)
x = self.Liner_2(x)
x = self.relu(x)
x = self.Liner_3(x)
x = self.sigmoid(x)
return x
"""
"""""""""""""""""""""""""""""""""""
方法的改写: 方法二
import torch.nn.functional as F
class Model(nn.Module):
def __init__(self):
super().__init__()
self.Liner_1 = nn.Linear(20, 64)
self.Liner_2 = nn.Linear(64, 64)
self.Liner_3 = nn.Linear(64, 1)
def forward(self, input):
x = F.relu(self.Liner_1(input))
x = F.relu(self.Liner_2(x))
x = F.sigmoid(self.Liner_3(x))
return x
"""""""""""""""""""""""""""""""""""
import torch.nn.functional as F
class Model(nn.Module):
    """Small fully connected binary classifier.

    Two hidden linear layers with ReLU activations are followed by a single
    linear output unit passed through a sigmoid, so ``forward`` returns one
    value in (0, 1) per input row.

    Args:
        in_features: Number of input features per sample. Defaults to 20,
            the value that was previously hard-coded.
        hidden_features: Width of both hidden layers. Defaults to 64, the
            value that was previously hard-coded.
    """

    def __init__(self, in_features=20, hidden_features=64):
        super().__init__()
        # Attribute names (including the "Liner" spelling) are kept unchanged
        # so that parameter names / state_dict keys stay the same.
        self.Liner_1 = nn.Linear(in_features, hidden_features)
        self.Liner_2 = nn.Linear(hidden_features, hidden_features)
        self.Liner_3 = nn.Linear(hidden_features, 1)

    def forward(self, input):
        """Run the forward pass.

        Args:
            input: Tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``input`` and a last
            dimension of 1, holding sigmoid outputs.
        """
        x = F.relu(self.Liner_1(input))
        x = F.relu(self.Liner_2(x))
        # torch.sigmoid computes the same result as F.sigmoid, which emitted a
        # deprecation warning in several PyTorch 1.x releases.
        x = torch.sigmoid(self.Liner_3(x))
        return x
"""
model = Model() # 模型的实例化
print("model:\t", model)
"""
lr = 0.001


def get_model():
    """Create a fresh ``Model`` and an Adam optimizer over its parameters.

    The optimizer uses the module-level learning rate ``lr``.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    net = Model()
    adam = torch.optim.Adam(net.parameters(), lr=lr)
    return net, adam


model, optim = get_model()  # unpack the returned (model, optimizer) pair
"""
定义损失函数
"""
loss_fn = nn.BCELoss()  # binary cross-entropy; the model's output layer is a sigmoid
# Training hyper-parameters (the optimizer itself is created in get_model()).
batch = 64  # mini-batch size handed to the DataLoaders
no_of_batch = len(data) // batch  # number of full batches in the whole data frame
epochs = 100  # presumably the number of training passes; the loop is not in this section
"""
添加验证:
了解过拟合与欠拟合
过拟合:对于训练数据过度拟合,对于未知数据预测很差
欠拟合:对于训练数据拟合不足,对于未知数据预测很差
"""
"""
需要用到机器学习的库
pip install scikit-learn -i https://pypi.doubanio.com/simple
在Jupyter Notebook可以采用
!pip install scikit-learn -i https://pypi.doubanio.com/simple
"""
from sklearn.model_selection import train_test_split
# Split features and labels into a training and a test subset. No test_size or
# random_state is passed, so scikit-learn's defaults apply and the split is
# not reproducible between runs.
train_x, test_x, train_y, test_y = train_test_split(X, Y)
print("type(train_x):\t", type(train_x))
print("X_data.shape:\t", X_data.shape)
print("train_x.shape:\t{}, test_x.shape:\t{}".format(train_x.shape, test_x.shape))
# Wrap each subset in a TensorDataset and batch it with a DataLoader.
train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True)  # reshuffle the training data
test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch)  # no shuffling for the test set
data.head(): satisfaction_level last_evaluation ... part salary
0 0.38 0.53 ... sales low
1 0.80 0.86 ... sales medium
2 0.11 0.88 ... sales medium
3 0.72 0.87 ... sales low
4 0.37 0.52 ... sales low
[5 rows x 10 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 satisfaction_level 14999 non-null float64
1 last_evaluation 14999 non-null float64
2 number_project 14999 non-null int64
3 average_montly_hours 14999 non-null int64
4 time_spend_company 14999 non-null int64
5 Work_accident 14999 non-null int64
6 left 14999 non-null int64
7 promotion_last_5years 14999 non-null int64
8 part 14999 non-null object
9 salary 14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 satisfaction_level 14999 non-null float64
1 last_evaluation 14999 non-null float64
2 number_project 14999 non-null int64
3 average_montly_hours 14999 non-null int64
4 time_spend_company 14999 non-null int64
5 Work_accident 14999 non-null int64
6 left 14999 non-null int64
7 promotion_last_5years 14999 non-null int64
8 part 14999 non-null object
9 salary 14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB
data.info(): None
data.part.unique() ['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT'
'product_mng' 'marketing' 'RandD']
data.salary.unique() ['low' 'medium' 'high']
data.groupby(['salary', 'part']).size() salary part
high IT 83
RandD 51
accounting 74
hr 45
management 225
marketing 80
product_mng 68
sales 269
support 141
technical 201
low IT 609
RandD 364
accounting 358
hr 335
management 180
marketing 402
product_mng 451
sales 2099
support 1146
technical 1372
medium IT 535
RandD 372
accounting 335
hr 359
management 225
marketing 376
product_mng 383
sales 1772
support 942
technical 1147
dtype: int64
pd.get_dummies(data.salary): high low medium
0 0 1 0
1 0 0 1
2 0 0 1
3 0 1 0
4 0 1 0
... ... ... ...
14994 0 1 0
14995 0 1 0
14996 0 1 0
14997 0 1 0
14998 0 1 0
[14999 rows x 3 columns]
data.head()): satisfaction_level last_evaluation number_project ... high low medium
0 0.38 0.53 2 ... 0 1 0
1 0.80 0.86 5 ... 0 0 1
2 0.11 0.88 7 ... 0 0 1
3 0.72 0.87 5 ... 0 1 0
4 0.37 0.52 2 ... 0 1 0
[5 rows x 13 columns]
data.head(): satisfaction_level last_evaluation ... support technical
0 0.38 0.53 ... 0 0
1 0.80 0.86 ... 0 0
2 0.11 0.88 ... 0 0
3 0.72 0.87 ... 0 0
4 0.37 0.52 ... 0 0
[5 rows x 21 columns]
data.left.value_counts():
0 11428
1 3571
Name: left, dtype: int64
Y_data.shape: (14999, 1)
Y.shape torch.Size([14999, 1])
X: tensor([[0.3800, 0.5300, 2.0000, ..., 1.0000, 0.0000, 0.0000],
[0.8000, 0.8600, 5.0000, ..., 1.0000, 0.0000, 0.0000],
[0.1100, 0.8800, 7.0000, ..., 1.0000, 0.0000, 0.0000],
...,
[0.3700, 0.5300, 2.0000, ..., 0.0000, 1.0000, 0.0000],
[0.1100, 0.9600, 6.0000, ..., 0.0000, 1.0000, 0.0000],
[0.3700, 0.5200, 2.0000, ..., 0.0000, 1.0000, 0.0000]])
X.size(): torch.Size([14999, 20])
type(train_x): <class 'torch.Tensor'>
X_data.shape: (14999, 20)
train_x.shape: torch.Size([11249, 20]), test_x.shape: torch.Size([3750, 20])