Industrial Steam Prediction
1. Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, RepeatedKFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.svm import SVR
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")
2. Load the data
train_data_file = "E:\\蒸汽模型数据\\zhengqi_train.txt"
test_data_file = "E:\\蒸汽模型数据\\zhengqi_test.txt"
dtrain = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
dtest = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
3. Concatenate train and test
dfull = pd.concat([dtrain, dtest], ignore_index=True, sort=False)
4. Inspect the data
print('Training set shape: ', np.shape(dtrain))
print('Test set shape: ', np.shape(dtest))
# Check for missing values
print('Missing value counts:')
print(dfull.isnull().sum())
Training set shape:  (2888, 39)
Test set shape:  (1925, 38)
Missing value counts:
V0          0
V1          0
...
V37         0
target   1925
dtype: int64
Only target has missing values: the 1925 test rows carry no label by construction.
5. Visualize the data
plt.figure(figsize=(18,8),dpi=100)
dfull.boxplot(sym='r^', patch_artist=True, notch=True)
plt.title('DATA-FULL')
6. Summary statistics
print(dtrain.describe())
             V0           V1  ...          V37       target
count  2888.000000  2888.000000  ...  2888.000000  2888.000000
mean      0.123048     0.056068  ...    -0.130330     0.126353
std       0.928031     0.941515  ...     1.017196     0.983966
min      -4.335000    -5.122000  ...    -3.630000    -3.044000
25%      -0.297000    -0.226250  ...    -0.798250    -0.350250
50%       0.359000     0.272500  ...    -0.185500     0.313000
75%       0.726000     0.599000  ...     0.495250     0.793250
max       2.121000     1.918000  ...     3.000000     2.538000
[8 rows x 39 columns]
print(dtest.describe())
             V0           V1  ...          V36          V37
count  1925.000000  1925.000000  ...  1925.000000  1925.000000
mean     -0.184404    -0.083912  ...    -0.046270     0.195735
std       1.073333     1.076670  ...     1.040854     0.940599
min      -4.814000    -5.488000  ...    -2.608000    -3.346000
25%      -0.664000    -0.451000  ...    -0.593000    -0.432000
50%       0.065000     0.195000  ...     0.083000     0.152000
75%       0.549000     0.589000  ...     0.651000     0.797000
max       2.100000     2.120000  ...     2.861000     3.021000
[8 rows x 38 columns]
7. Examine feature correlations with a heatmap
def heatmap(df):
    plt.figure(figsize=(20,16), dpi=100)
    cols = df.columns.tolist()
    # Spearman rank correlation is robust to the non-normal features
    mcorr = df[cols].corr(method='spearman')
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # np.bool was removed in NumPy 1.24+; use the builtin bool
    mask = np.zeros_like(mcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
    plt.xticks(rotation=45)
    return mcorr
dtrain_mcorr = heatmap(dtrain)
8. Data preprocessing
# Drop the given columns from dfull
def drop_var(var_lst):
    dfull.drop(var_lst, axis=1, inplace=True)

# Re-split dfull into dtrain and dtest
def split_dfull():
    dtrain = dfull[dfull['target'].notnull()]
    dtest = dfull[dfull['target'].isnull()].copy()  # copy() avoids SettingWithCopyWarning
    dtest.drop('target', axis=1, inplace=True)
    return dtrain, dtest
Remove features whose distributions differ between the training and test sets
plt.figure(figsize=(20,50), dpi=100)
for i in range(38):
    plt.subplot(10,4,i+1)
    sns.distplot(dtrain.iloc[:,i], color='green')
    sns.distplot(dtest.iloc[:,i], color='red')
    plt.legend(['Train','Test'])
plt.tight_layout()
The distributions of V5, V9, V11, V17, V20, V21, V22, V27 and V28 differ sharply between the training and test sets. To reduce the effect of this distribution shift on the predictions, these features are dropped (a numerical cross-check is sketched after the drop).
drop_var(['V5','V9','V11','V17','V20','V21','V22','V27','V28'])
dtrain, dtest = split_dfull()
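The visual judgment above can be cross-checked with a two-sample Kolmogorov-Smirnov test on each remaining feature. A minimal sketch; the p < 0.05 cutoff is an assumption, not part of the original analysis:
from scipy.stats import ks_2samp  # two-sample KS test

# Flag remaining features whose train/test distributions still differ
for col in dtest.columns:
    stat, p = ks_2samp(dtrain[col], dtest[col])
    if p < 0.05:  # assumed significance cutoff
        print(f'{col}: KS statistic = {stat:.3f}, p = {p:.2e}')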
Check each feature's correlation with the target, and whether it is approximately normally distributed
plt.figure(figsize=(20,60), dpi=80)
i = 0
for col in dtrain.columns:
    # Left panel: feature vs. target scatter with a fitted regression line
    i += 1
    plt.subplot(15,4,i)
    sns.regplot(x=col, y='target', data=dtrain,
                scatter_kws={'marker': '.', 's': 5, 'alpha': .6},
                line_kws={'color': 'k'})
    coef = np.corrcoef(dtrain[col], dtrain['target'])[0][1]
    plt.title(f'coef = {coef:.3f}')
    plt.xlabel(col)
    plt.ylabel('target')

    # Right panel: feature distribution with a fitted normal curve
    i += 1
    plt.subplot(15,4,i)
    sns.distplot(dtrain[col], fit=stats.norm)
    plt.title(f'skew = {stats.skew(dtrain[col]):.3f}')
    plt.xlabel(col)
plt.tight_layout()
- V14, V25, V26, V32, V33, V34 and V35 correlate only weakly with the target, so they are dropped (a numerical check follows the drop below).
- V0, V1, V6, V7, V8, V12, V16 and V31 show clear left skew; an exponential transform (raising a base b to the power of the data, searching for the b that minimizes the skew) can reduce it.
drop_var(['V14','V25','V26','V32','V33','V34','V35'])
dtrain, dtest = split_dfull()
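For reference, the weak correlations can be listed numerically instead of read off the plots. A small sketch, using Spearman to match the step-7 heatmap (run here, it shows the weakest of the remaining features):
# Absolute Spearman correlation of each remaining feature with the target
corr_with_target = dtrain.corr(method='spearman')['target'].abs()
print(corr_with_target.drop('target').sort_values().head(10))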
# Features with clearly left-skewed distributions
piantai = ['V0','V1','V6','V7','V8','V12','V16','V31']
# Find the base of the exponential transform that minimizes the absolute skew
def find_min_skew(data):
    subs = list(np.arange(1.01, 2, 0.01))  # candidate bases
    skews = []
    for x in subs:
        skew = abs(stats.skew(np.power(x, data)))  # skew of x**data
        skews.append(skew)
    min_skew = min(skews)
    i = skews.index(min_skew)
    return subs[i], min_skew
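A quick illustrative check of what the search returns for one of the skewed features (V0 is in the list above; min_skew is the absolute skew after transforming):
base, min_skew = find_min_skew(dfull['V0'])
print('V0 skew: {:.3f} -> {:.3f} with base {:.2f}'.format(
    stats.skew(dfull['V0']), min_skew, base))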
Apply the exponential transform to the skewed features in the training and test sets together
for col in piantai:
    sub = find_min_skew(dfull[col])[0]
    dfull[col] = np.power(sub, dfull[col])
dtrain, dtest = split_dfull()
Standardize the remaining features using z-score normalization
dfull.iloc[:,:-1] = dfull.iloc[:,:-1].apply(lambda x: (x-x.mean())/x.std())
dtrain, dtest = split_dfull()
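Equivalently, sklearn's StandardScaler could replace the apply() above; a sketch, noting that StandardScaler divides by the population standard deviation (ddof=0) while pandas' .std() uses ddof=1, so the two differ negligibly at this sample size:
from sklearn.preprocessing import StandardScaler

# Alternative to the apply() line: scale all feature columns in place
dfull.iloc[:, :-1] = StandardScaler().fit_transform(dfull.iloc[:, :-1])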
Preprocessing complete
Re-check the feature distributions
plt.figure(figsize=(20,20), dpi=80)
for i in range(22):
    plt.subplot(6,4,i+1)
    sns.distplot(dtrain.iloc[:,i], color='green')
    sns.distplot(dtest.iloc[:,i], color='red')
    plt.legend(['Train','Test'])
plt.tight_layout()
9. Model training
Split the training data into training and validation sets
X = np.array(dtrain.iloc[:,:-1])
y = np.array(dtrain['target'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
Create a model-scoring function
def score(y, y_pred):
    # Mean squared error
    print('MSE = {0}'.format(mean_squared_error(y, y_pred)))
    # Coefficient of determination R^2
    print('R2 = {0}'.format(r2_score(y, y_pred)))

    # Residuals, used to flag outliers (|z| > 3)
    y = pd.Series(y)
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)

    # Plot 1: true vs. predicted values
    plt.figure(figsize=(18,5), dpi=80)
    plt.subplot(131)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))

    # Plot 2: residual scatter
    plt.subplot(132)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('resid')
    plt.ylim([-3,3])
    plt.title('std_resid = {:.3f}'.format(std_resid))

    # Plot 3: histogram of residual z-scores
    plt.subplot(133)
    sns.distplot(z, bins=50)
    plt.xlabel('z')
    plt.title('{:.0f} samples with |z| > 3'.format(n_outliers))
    plt.tight_layout()
The heatmap in step 7 shows substantial multicollinearity among the features, so before training the main models we fit a ridge regression (which handles collinearity well) and use its residuals to identify and remove outlier samples.
# RidgeCV searches its built-in alpha grid automatically
ridge = RidgeCV()
ridge.fit(X_train, y_train)
print('best_alpha = {0}'.format(ridge.alpha_))
y_pred = ridge.predict(X_train)
score(y_train, y_pred)
best_alpha = 10.0
MSE = 0.12396474687012962
R2 = 0.8695133658493562
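The multicollinearity claim can be quantified with variance inflation factors. A minimal sketch; it assumes statsmodels is available, which this notebook does not otherwise use:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF > 10 is a common rule of thumb for problematic collinearity
vifs = pd.Series(
    [variance_inflation_factor(X_train, i) for i in range(X_train.shape[1])],
    index=dtrain.columns[:-1])
print(vifs.sort_values(ascending=False).head())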
Identify the outlier samples
resid = y_train - y_pred
resid = pd.Series(resid, index=range(len(y_train)))
resid_z = (resid-resid.mean()) / resid.std()
outliers = resid_z[abs(resid_z)>3].index
print(f'{len(outliers)} Outliers:')
print(outliers.tolist())
plt.figure(figsize=(14,6),dpi=60)
plt.subplot(121)
plt.plot(y_train, y_pred, '.')
plt.plot(y_train[outliers], y_pred[outliers], 'ro')
plt.title(f'MSE = {mean_squared_error(y_train,y_pred)}')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('y_train')
plt.ylabel('y_pred')
plt.subplot(122)
sns.distplot(resid_z, bins = 50)
sns.distplot(resid_z.loc[outliers], bins = 50, color = 'r')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('z')
plt.tight_layout()
29 Outliers:
[89, 407, 652, 725, 768, 797, 829, 875, 907, 1123, 1190, 1195, 1236, 1295, 1324, 1366, 1544, 1641, 1662, 1706, 1718, 1833, 1983, 2013, 2022, 2025, 2084, 2124, 2279]
Drop the outlier samples
X_train = np.array(pd.DataFrame(X_train).drop(outliers,axis=0))
y_train = np.array(pd.Series(y_train).drop(outliers,axis=0))
Model 1: Lasso regression
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
print('best_alpha = {0}'.format(lasso.alpha_))
pred_lasso = lasso.predict(X_valid)
score(y_valid, pred_lasso)
best_alpha = 0.000814568505801466
MSE = 0.13352630896862544
R2 = 0.8715017160007512
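Since the L1 penalty produces sparse coefficients, it can be instructive to see which features the fitted model zeroed out. A quick inspection, not part of the original notebook:
# Map the Lasso coefficients back to feature names
coef = pd.Series(lasso.coef_, index=dtrain.columns[:-1])
print('Zeroed features:', coef[coef == 0].index.tolist())
print(coef[coef != 0].abs().sort_values(ascending=False).head())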
Model 2: SVR
Use sklearn's GridSearchCV to find the best SVR hyperparameters. First define a grid-search helper that scores by (negative) mean squared error and validates with repeated K-fold cross-validation:
def gsearch(model, param_grid, scoring='neg_mean_squared_error', splits=5, repeats=1, n_jobs=-1):
    # Repeated k-fold cross-validation (repeats x splits folds)
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats, random_state=0)
    model_gs = GridSearchCV(model, param_grid=param_grid, scoring=scoring,
                            cv=rkfold, verbose=1, n_jobs=n_jobs)
    model_gs.fit(X_train, y_train)
    print('Best params: {0}'.format(model_gs.best_params_))
    print('Minimum MSE: {0}'.format(abs(model_gs.best_score_)))
    return model_gs
Coarse grid search over the penalty C and RBF kernel coefficient gamma
svr = SVR()
cv_params = {'C': np.logspace(0, 3, 4), 'gamma': np.logspace(-4, -1, 4)}
svr = gsearch(svr, cv_params)
Best params: {'C': 100.0, 'gamma': 0.001}
Minimum MSE: 0.09654885340007904
Narrow the ranges and fine-tune
svr = SVR()
cv_params = {'C': [1,2,5,10,15,20,30,50,80,100,150,200], 'gamma': [0.0001,0.0005,0.0008,0.001,0.002,0.003,0.005]}
svr = gsearch(svr, cv_params)
Best params: {'C': 15, 'gamma': 0.005}
Minimum MSE: 0.09545483790250289
Predict on the validation set
pred_svr = svr.predict(X_valid)
score(y_valid, pred_svr)
MSE = 0.11131676709494921
R2 = 0.8928749423051452
Model 3: XGBoost
Initial parameters
params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
Tune the number of boosting rounds, n_estimators
cv_params = {'n_estimators': [100,200,300,400,500,600,700,800,900,1000,1100,1200]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'n_estimators': 300}
Minimum MSE: 0.09897718573824728
Update the parameters
params['n_estimators'] = 300
Tune max_depth and min_child_weight
cv_params = {'max_depth': [3,4,5,6,7,8,9],
'min_child_weight': [1,2,3,4,5,6,7]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'max_depth': 5, 'min_child_weight': 3}
Minimum MSE: 0.09652119085288388
Update the parameters
params['max_depth'] = 5
params['min_child_weight'] = 3
Tune the pruning parameter gamma
cv_params = {'gamma': [0,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'gamma': 0.01}
Minimum MSE: 0.09636636275146108
Update the parameters
params['gamma'] = 0.01
Tune row sampling (subsample) and column sampling (colsample_bytree)
cv_params = {'subsample': [0.6,0.7,0.8,0.9],
'colsample_bytree': [0.6,0.7,0.8,0.9]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'colsample_bytree': 0.6, 'subsample': 0.8}
Minimum MSE: 0.09607204955270585
Update the parameters
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
Tune the L1 (reg_alpha) and L2 (reg_lambda) regularization parameters
cv_params = {'reg_alpha': [0,0.02,0.05,0.1,1,2,3],
'reg_lambda': [0,0.02,0.05,0.1,1,2,3]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'reg_alpha': 0.05, 'reg_lambda': 2}
Minimum MSE: 0.09488340395713288
Update the parameters
params['reg_alpha'] = 0.05
params['reg_lambda'] = 2
Tune the learning_rate
cv_params = {'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1, 0.2]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'learning_rate': 0.1}
Minimum MSE: 0.09488340395713288
Update the parameters
params['learning_rate'] = 0.1
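At this point params holds the complete tuned configuration. Refitting a fresh XGBRegressor with it should match the last grid-search winner; a sanity-check sketch (the fitted xgb returned by gsearch is what the validation step below actually uses):
# Refit with the final tuned parameter set and re-check on the validation split
xgb_final = XGBRegressor(**params)
xgb_final.fit(X_train, y_train)
print('Validation MSE =', mean_squared_error(y_valid, xgb_final.predict(X_valid)))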
Evaluate the model error on the validation set
pred_xgb = xgb.predict(X_valid)
score(y_valid, pred_xgb)
MSE = 0.12072281382484629
R2 = 0.8838230867319299
Weighted model blending
def model_mix(pred_1, pred_2, pred_3):
    # Grid-search integer weights for the three models' validation predictions
    rows = []
    for a in range(5):
        for b in range(1,6):
            for c in range(5):
                y_pred = (a*pred_1 + b*pred_2 + c*pred_3) / (a+b+c)
                mse = mean_squared_error(y_valid, y_pred)
                rows.append({'Lasso': a, 'SVR': b, 'XGB': c, 'Combine': mse})
    # DataFrame.append was removed in pandas 2.0; build the frame in one go
    return pd.DataFrame(rows, columns=['Lasso','SVR','XGB','Combine'])
model_combine = model_mix(pred_lasso, pred_svr, pred_xgb)
model_combine.sort_values(by='Combine', inplace=True)
model_combine.head()
Lasso SVR XGB Combine
16 0 4 1 0.110520
11 0 3 1 0.110541
21 0 5 1 0.110555
47 1 5 2 0.110593
46 1 5 1 0.110598
X_test = np.array(dtest)
ans_lasso = lasso.predict(X_test)
ans_svr = svr.predict(X_test)
ans_xgb = xgb.predict(X_test)
ans_mix = (ans_lasso + 4 * ans_svr + ans_xgb) / 6  # weights 1:4:1, normalized by their sum
pd.Series(ans_mix).to_csv('E:\\蒸汽模型数据\\最終篇.txt', sep='\t', index=False)
print('Finished!')