Industrial Steam Prediction
1. Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, RepeatedKFold
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.svm import SVR
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")
2. Load the data
train_data_file = "E:\\蒸汽模型数据\\zhengqi_train.txt"
test_data_file = "E:\\蒸汽模型数据\\zhengqi_test.txt"
dtrain = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
dtest = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
3. Concatenate train and test
dfull = pd.concat([dtrain, dtest], ignore_index=True, sort=False)
4. Inspect the data
print('Training set shape: ', np.shape(dtrain))
print('Test set shape: ', np.shape(dtest))
# Check for missing values
print('Missing value counts:')
print(dfull.isnull().sum())
Training set shape:  (2888, 39)
Test set shape:  (1925, 38)
Missing value counts:
V0          0
V1          0
...
V37         0
target   1925
dtype: int64
Only target has missing values: the 1925 test rows carry no label by construction.
5. Visualize the data
plt.figure(figsize=(18,8),dpi=100)
dfull.boxplot(sym='r^', patch_artist=True, notch=True)
plt.title('DATA-FULL')
6. Summary statistics
print(dtrain.describe())
             V0           V1  ...          V37       target
count  2888.000000  2888.000000  ...  2888.000000  2888.000000
mean      0.123048     0.056068  ...    -0.130330     0.126353
std       0.928031     0.941515  ...     1.017196     0.983966
min      -4.335000    -5.122000  ...    -3.630000    -3.044000
25%      -0.297000    -0.226250  ...    -0.798250    -0.350250
50%       0.359000     0.272500  ...    -0.185500     0.313000
75%       0.726000     0.599000  ...     0.495250     0.793250
max       2.121000     1.918000  ...     3.000000     2.538000
[8 rows x 39 columns]
print(dtest.describe())
             V0           V1  ...          V36          V37
count  1925.000000  1925.000000  ...  1925.000000  1925.000000
mean     -0.184404    -0.083912  ...    -0.046270     0.195735
std       1.073333     1.076670  ...     1.040854     0.940599
min      -4.814000    -5.488000  ...    -2.608000    -3.346000
25%      -0.664000    -0.451000  ...    -0.593000    -0.432000
50%       0.065000     0.195000  ...     0.083000     0.152000
75%       0.549000     0.589000  ...     0.651000     0.797000
max       2.100000     2.120000  ...     2.861000     3.021000
[8 rows x 38 columns]
7. Examine feature correlations with a heatmap
def heatmap(df):
    plt.figure(figsize=(20,16), dpi=100)
    cols = df.columns.tolist()
    # Spearman rank correlation is robust to the non-normal features
    mcorr = df[cols].corr(method='spearman')
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # np.bool was removed in NumPy 1.24+; use the builtin bool
    mask = np.zeros_like(mcorr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
    g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
    plt.xticks(rotation=45)
    return mcorr
dtrain_mcorr = heatmap(dtrain)
8. Data preprocessing
# Drop the given columns from dfull
def drop_var(var_lst):
    dfull.drop(var_lst, axis=1, inplace=True)

# Re-split dfull into dtrain and dtest
def split_dfull():
    dtrain = dfull[dfull['target'].notnull()]
    dtest = dfull[dfull['target'].isnull()].copy()  # copy() avoids SettingWithCopyWarning
    dtest.drop('target', axis=1, inplace=True)
    return dtrain, dtest
Remove features whose distributions differ between the training and test sets
plt.figure(figsize=(20,50), dpi=100)
for i in range(38):
    plt.subplot(10,4,i+1)
    sns.distplot(dtrain.iloc[:,i], color='green')
    sns.distplot(dtest.iloc[:,i], color='red')
    plt.legend(['Train','Test'])
plt.tight_layout()
The distributions of V5, V9, V11, V17, V20, V21, V22, V27 and V28 differ sharply between the training and test sets. To reduce the effect of this distribution shift on the predictions, these features are dropped (a numerical cross-check is sketched after the drop).
drop_var(['V5','V9','V11','V17','V20','V21','V22','V27','V28'])
dtrain, dtest = split_dfull()
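The visual judgment above can be cross-checked with a two-sample Kolmogorov-Smirnov test on each remaining feature. A minimal sketch; the p < 0.05 cutoff is an assumption, not part of the original analysis:
from scipy.stats import ks_2samp  # two-sample KS test

# Flag remaining features whose train/test distributions still differ
for col in dtest.columns:
    stat, p = ks_2samp(dtrain[col], dtest[col])
    if p < 0.05:  # assumed significance cutoff
        print(f'{col}: KS statistic = {stat:.3f}, p = {p:.2e}')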
Check each feature's correlation with the target, and whether it is approximately normally distributed
plt.figure(figsize=(20,60), dpi=80)
i = 0
for col in dtrain.columns:
    # Left panel: feature vs. target scatter with a fitted regression line
    i += 1
    plt.subplot(15,4,i)
    sns.regplot(x=col, y='target', data=dtrain,
                scatter_kws={'marker': '.', 's': 5, 'alpha': .6},
                line_kws={'color': 'k'})
    coef = np.corrcoef(dtrain[col], dtrain['target'])[0][1]
    plt.title(f'coef = {coef:.3f}')
    plt.xlabel(col)
    plt.ylabel('target')

    # Right panel: feature distribution with a fitted normal curve
    i += 1
    plt.subplot(15,4,i)
    sns.distplot(dtrain[col], fit=stats.norm)
    plt.title(f'skew = {stats.skew(dtrain[col]):.3f}')
    plt.xlabel(col)
plt.tight_layout()
- V14, V25, V26, V32, V33, V34 and V35 correlate only weakly with the target, so they are dropped (a numerical check follows the drop below).
- V0, V1, V6, V7, V8, V12, V16 and V31 show clear left skew; an exponential transform (raising a base b to the power of the data, searching for the b that minimizes the skew) can reduce it.
drop_var(['V14','V25','V26','V32','V33','V34','V35'])
dtrain, dtest = split_dfull()
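For reference, the weak correlations can be listed numerically instead of read off the plots. A small sketch, using Spearman to match the step-7 heatmap (run here, it shows the weakest of the remaining features):
# Absolute Spearman correlation of each remaining feature with the target
corr_with_target = dtrain.corr(method='spearman')['target'].abs()
print(corr_with_target.drop('target').sort_values().head(10))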
# Features with clearly left-skewed distributions
piantai = ['V0','V1','V6','V7','V8','V12','V16','V31']
# Find the base of the exponential transform that minimizes the absolute skew
def find_min_skew(data):
    subs = list(np.arange(1.01, 2, 0.01))  # candidate bases
    skews = []
    for x in subs:
        skew = abs(stats.skew(np.power(x, data)))  # skew of x**data
        skews.append(skew)
    min_skew = min(skews)
    i = skews.index(min_skew)
    return subs[i], min_skew
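A quick illustrative check of what the search returns for one of the skewed features (V0 is in the list above; min_skew is the absolute skew after transforming):
base, min_skew = find_min_skew(dfull['V0'])
print('V0 skew: {:.3f} -> {:.3f} with base {:.2f}'.format(
    stats.skew(dfull['V0']), min_skew, base))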
Apply the exponential transform to the skewed features in the training and test sets together
for col in piantai:
    sub = find_min_skew(dfull[col])[0]
    dfull[col] = np.power(sub, dfull[col])
dtrain, dtest = split_dfull()
Standardize the remaining features using z-score normalization
dfull.iloc[:,:-1] = dfull.iloc[:,:-1].apply(lambda x: (x-x.mean())/x.std())
dtrain, dtest = split_dfull()
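Equivalently, sklearn's StandardScaler could replace the apply() above; a sketch, noting that StandardScaler divides by the population standard deviation (ddof=0) while pandas' .std() uses ddof=1, so the two differ negligibly at this sample size:
from sklearn.preprocessing import StandardScaler

# Alternative to the apply() line: scale all feature columns in place
dfull.iloc[:, :-1] = StandardScaler().fit_transform(dfull.iloc[:, :-1])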
Preprocessing complete
Re-check the feature distributions
plt.figure(figsize=(20,20), dpi=80)
for i in range(22):
    plt.subplot(6,4,i+1)
    sns.distplot(dtrain.iloc[:,i], color='green')
    sns.distplot(dtest.iloc[:,i], color='red')
    plt.legend(['Train','Test'])
plt.tight_layout()
9. Model training
Split the training data into training and validation sets
X = np.array(dtrain.iloc[:,:-1])
y = np.array(dtrain['target'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)
Create a model-scoring function
def score(y, y_pred):
    # Mean squared error
    print('MSE = {0}'.format(mean_squared_error(y, y_pred)))
    # Coefficient of determination R^2
    print('R2 = {0}'.format(r2_score(y, y_pred)))

    # Residuals, used to flag outliers (|z| > 3)
    y = pd.Series(y)
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)

    # Plot 1: true vs. predicted values
    plt.figure(figsize=(18,5), dpi=80)
    plt.subplot(131)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))

    # Plot 2: residual scatter
    plt.subplot(132)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('resid')
    plt.ylim([-3,3])
    plt.title('std_resid = {:.3f}'.format(std_resid))

    # Plot 3: histogram of residual z-scores
    plt.subplot(133)
    sns.distplot(z, bins=50)
    plt.xlabel('z')
    plt.title('{:.0f} samples with |z| > 3'.format(n_outliers))
    plt.tight_layout()
The heatmap in step 7 shows substantial multicollinearity among the features, so before training the main models we fit a ridge regression (which handles collinearity well) and use its residuals to identify and remove outlier samples.
# RidgeCV searches its built-in alpha grid automatically
ridge = RidgeCV()
ridge.fit(X_train, y_train)
print('best_alpha = {0}'.format(ridge.alpha_))
y_pred = ridge.predict(X_train)
score(y_train, y_pred)
best_alpha = 10.0
MSE = 0.12396474687012962
R2 = 0.8695133658493562
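The multicollinearity claim can be quantified with variance inflation factors. A minimal sketch; it assumes statsmodels is available, which this notebook does not otherwise use:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF > 10 is a common rule of thumb for problematic collinearity
vifs = pd.Series(
    [variance_inflation_factor(X_train, i) for i in range(X_train.shape[1])],
    index=dtrain.columns[:-1])
print(vifs.sort_values(ascending=False).head())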
Identify the outlier samples
resid = y_train - y_pred
resid = pd.Series(resid, index=range(len(y_train)))
resid_z = (resid-resid.mean()) / resid.std()
outliers = resid_z[abs(resid_z)>3].index
print(f'{len(outliers)} Outliers:')
print(outliers.tolist())
plt.figure(figsize=(14,6),dpi=60)
plt.subplot(121)
plt.plot(y_train, y_pred, '.')
plt.plot(y_train[outliers], y_pred[outliers], 'ro')
plt.title(f'MSE = {mean_squared_error(y_train,y_pred)}')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('y_train')
plt.ylabel('y_pred')
plt.subplot(122)
sns.distplot(resid_z, bins = 50)
sns.distplot(resid_z.loc[outliers], bins = 50, color = 'r')
plt.legend(['Accepted', 'Outliers'])
plt.xlabel('z')
plt.tight_layout()
29 Outliers:
[89, 407, 652, 725, 768, 797, 829, 875, 907, 1123, 1190, 1195, 1236, 1295, 1324, 1366, 1544, 1641, 1662, 1706, 1718, 1833, 1983, 2013, 2022, 2025, 2084, 2124, 2279]
Drop the outlier samples
X_train = np.array(pd.DataFrame(X_train).drop(outliers,axis=0))
y_train = np.array(pd.Series(y_train).drop(outliers,axis=0))
Model 1: Lasso regression
lasso = LassoCV(cv=5)
lasso.fit(X_train, y_train)
print('best_alpha = {0}'.format(lasso.alpha_))
pred_lasso = lasso.predict(X_valid)
score(y_valid, pred_lasso)
best_alpha = 0.000814568505801466
MSE = 0.13352630896862544
R2 = 0.8715017160007512
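Since the L1 penalty produces sparse coefficients, it can be instructive to see which features the fitted model zeroed out. A quick inspection, not part of the original notebook:
# Map the Lasso coefficients back to feature names
coef = pd.Series(lasso.coef_, index=dtrain.columns[:-1])
print('Zeroed features:', coef[coef == 0].index.tolist())
print(coef[coef != 0].abs().sort_values(ascending=False).head())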
Model 2: SVR
Use sklearn's GridSearchCV to find the best SVR hyperparameters. First define a grid-search helper that scores by (negative) mean squared error and validates with repeated K-fold cross-validation:
def gsearch(model, param_grid, scoring='neg_mean_squared_error', splits=5, repeats=1, n_jobs=-1):
    # Repeated k-fold cross-validation (repeats x splits folds)
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats, random_state=0)
    model_gs = GridSearchCV(model, param_grid=param_grid, scoring=scoring,
                            cv=rkfold, verbose=1, n_jobs=n_jobs)
    model_gs.fit(X_train, y_train)
    print('Best params: {0}'.format(model_gs.best_params_))
    print('Minimum MSE: {0}'.format(abs(model_gs.best_score_)))
    return model_gs
Coarse grid search over the penalty C and RBF kernel coefficient gamma
svr = SVR()
cv_params = {'C': np.logspace(0, 3, 4), 'gamma': np.logspace(-4, -1, 4)}
svr = gsearch(svr, cv_params)
Best params: {'C': 100.0, 'gamma': 0.001}
Minimum MSE: 0.09654885340007904
Narrow the ranges and fine-tune
svr = SVR()
cv_params = {'C': [1,2,5,10,15,20,30,50,80,100,150,200], 'gamma': [0.0001,0.0005,0.0008,0.001,0.002,0.003,0.005]}
svr = gsearch(svr, cv_params)
Best params: {'C': 15, 'gamma': 0.005}
Minimum MSE: 0.09545483790250289
Predict on the validation set
pred_svr = svr.predict(X_valid)
score(y_valid, pred_svr)
MSE = 0.11131676709494921
R2 = 0.8928749423051452
Model 3: XGBoost
Initial parameters
params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5, 'min_child_weight': 1, 'seed': 0,
'subsample': 0.8, 'colsample_bytree': 0.8, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
Tune the number of boosting rounds, n_estimators
cv_params = {'n_estimators': [100,200,300,400,500,600,700,800,900,1000,1100,1200]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'n_estimators': 300}
Minimum MSE: 0.09897718573824728
Update the parameters
params['n_estimators'] = 300
Tune max_depth and min_child_weight
cv_params = {'max_depth': [3,4,5,6,7,8,9],
'min_child_weight': [1,2,3,4,5,6,7]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'max_depth': 5, 'min_child_weight': 3}
Minimum MSE: 0.09652119085288388
Update the parameters
params['max_depth'] = 5
params['min_child_weight'] = 3
Tune the pruning parameter gamma
cv_params = {'gamma': [0,0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'gamma': 0.01}
Minimum MSE: 0.09636636275146108
Update the parameters
params['gamma'] = 0.01
Tune row sampling (subsample) and column sampling (colsample_bytree)
cv_params = {'subsample': [0.6,0.7,0.8,0.9],
'colsample_bytree': [0.6,0.7,0.8,0.9]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'colsample_bytree': 0.6, 'subsample': 0.8}
Minimum MSE: 0.09607204955270585
Update the parameters
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
Tune the L1 (reg_alpha) and L2 (reg_lambda) regularization parameters
cv_params = {'reg_alpha': [0,0.02,0.05,0.1,1,2,3],
'reg_lambda': [0,0.02,0.05,0.1,1,2,3]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'reg_alpha': 0.05, 'reg_lambda': 2}
Minimum MSE: 0.09488340395713288
Update the parameters
params['reg_alpha'] = 0.05
params['reg_lambda'] = 2
Tune the learning_rate
cv_params = {'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1, 0.2]}
xgb = XGBRegressor(**params)
xgb = gsearch(xgb, cv_params)
Best params: {'learning_rate': 0.1}
Minimum MSE: 0.09488340395713288
Update the parameters
params['learning_rate'] = 0.1
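At this point params holds the complete tuned configuration. Refitting a fresh XGBRegressor with it should match the last grid-search winner; a sanity-check sketch (the fitted xgb returned by gsearch is what the validation step below actually uses):
# Refit with the final tuned parameter set and re-check on the validation split
xgb_final = XGBRegressor(**params)
xgb_final.fit(X_train, y_train)
print('Validation MSE =', mean_squared_error(y_valid, xgb_final.predict(X_valid)))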
Evaluate the model error on the validation set
pred_xgb = xgb.predict(X_valid)
score(y_valid, pred_xgb)
MSE = 0.12072281382484629
R2 = 0.8838230867319299
Weighted model blending
def model_mix(pred_1, pred_2, pred_3):
    # Grid-search integer weights for the three models' validation predictions
    rows = []
    for a in range(5):
        for b in range(1,6):
            for c in range(5):
                y_pred = (a*pred_1 + b*pred_2 + c*pred_3) / (a+b+c)
                mse = mean_squared_error(y_valid, y_pred)
                rows.append({'Lasso': a, 'SVR': b, 'XGB': c, 'Combine': mse})
    # DataFrame.append was removed in pandas 2.0; build the frame in one go
    return pd.DataFrame(rows, columns=['Lasso','SVR','XGB','Combine'])
model_combine = model_mix(pred_lasso, pred_svr, pred_xgb)
model_combine.sort_values(by='Combine', inplace=True)
model_combine.head()
Lasso SVR XGB Combine
16 0 4 1 0.110520
11 0 3 1 0.110541
21 0 5 1 0.110555
47 1 5 2 0.110593
46 1 5 1 0.110598
X_test = np.array(dtest)
ans_lasso = lasso.predict(X_test)
ans_svr = svr.predict(X_test)
ans_xgb = xgb.predict(X_test)
ans_mix = (ans_lasso + 4 * ans_svr + ans_xgb) / 6  # weights 1:4:1, normalized by their sum
pd.Series(ans_mix).to_csv('E:\\蒸汽模型数据\\最終篇.txt', sep='\t', index=False)
print('Finished!')