
Kaggle House Price Prediction Case Study

Data preprocessing (copied from a public kernel)
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from scipy.stats import norm, skew

train = pd.read_csv(r'C:\Users\hp\Desktop\train.csv')
test = pd.read_csv(r'C:\Users\hp\Desktop\test.csv')

# Save the Id columns for the submission file, then drop them from the features
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
train["SalePrice"] = np.log1p(train["SalePrice"])

ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))

all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})

all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)    
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)    
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
# Utilities is "AllPub" for nearly every record, so it carries no signal
all_data = all_data.drop(['Utilities'], axis=1)
# Per the data description, NA in Functional means "Typical"
all_data["Functional"] = all_data["Functional"].fillna("Typ")
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
#MSSubClass=The building class
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
all_data['OverallCond'] = all_data['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))
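# New feature: total living area (basement + 1st floor + 2nd floor)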
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})

# Index with the 'Skew' column, not the whole DataFrame: a DataFrame-valued
# mask keeps every row (non-matches become NaN), which would inflate the count
skewness = skewness[abs(skewness['Skew']) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
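# boxcox1p(x, lam) computes ((1 + x)**lam - 1) / lam; as lam approaches 0 it
# reduces to log1p(x), so lam = 0.15 is a mild variance-stabilizing transform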
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)
    
all_data = pd.get_dummies(all_data)    
train = all_data[:ntrain]
test = all_data[ntrain:]  
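A quick sanity check (a sketch, not in the original) confirms the split lines up with the saved target vector:

print(train.shape, test.shape)            # identical column counts after get_dummies
assert train.shape[0] == y_train.shape[0]
assert test.shape[0] == ntest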
Base models
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              VotingRegressor, StackingRegressor)
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# KRR
KRR = KernelRidge(alpha=0.6, kernel='polynomial')
param_grid = dict(degree = [1,1.5,2,2.5,3],coef0 = [2,2.5,3,3.5,4,4.5])
KRR_best = GridSearchCV(KRR, param_grid,  cv=5)
KRR_best.fit(train, y_train)
print("Best parameters:", KRR_best.best_params_)
print("Best CV score (R^2):", KRR_best.best_score_)
print("Best estimator:", KRR_best.best_estimator_)

# GBoost
GBoost = GradientBoostingRegressor(max_depth=4, max_features='sqrt',
                            min_samples_leaf=15,min_samples_split=10,
                            loss='huber', random_state =5)
param_grid = dict(learning_rate = [0.001,0.01,0.05],
                  n_estimators = [3000,4000,5000])
GBoost_best = GridSearchCV(GBoost, param_grid,  cv=5)
GBoost_best.fit(train, y_train)
print("Best parameters:", GBoost_best.best_params_)
print("Best CV score (R^2):", GBoost_best.best_score_)
print("Best estimator:", GBoost_best.best_estimator_)
GBoost = GradientBoostingRegressor(learning_rate =0.01,n_estimators=4000,
                    max_features='sqrt',loss='huber', random_state =5)
# Second round of tuning: tree-structure parameters
param_grid = dict(max_depth=[3,4,5],min_samples_leaf = [10,15,20],
                 min_samples_split = [5,10,15])
GBoost_best = GridSearchCV(GBoost, param_grid,  cv=5)
GBoost_best.fit(train, y_train)
print("Best parameters:", GBoost_best.best_params_)
print("Best CV score (R^2):", GBoost_best.best_score_)
print("Best estimator:", GBoost_best.best_estimator_)

# XGBoost
XGB = xgb.XGBRegressor(colsample_bytree=0.4603,gamma=0.0468,max_depth=3, 
                       min_child_weight=1.7817,reg_alpha=0.4640, 
                       reg_lambda=0.8571,subsample=0.5213,
                       random_state =7, nthread = -1)

param_grid = dict(learning_rate = [0.001,0.05,0.1],
                  n_estimators = [2200,3000,3500])

XGB_best = GridSearchCV(XGB, param_grid,  cv=5)
XGB_best.fit(train, y_train)
print("Best parameters:", XGB_best.best_params_)
print("Best CV score (R^2):", XGB_best.best_score_)
print("Best estimator:", XGB_best.best_estimator_)

# LightGBM
LGB = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                    max_bin = 55, bagging_fraction = 0.8,
                    bagging_freq = 5, feature_fraction = 0.2319,
                    feature_fraction_seed=9, bagging_seed=9,
                    min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

param_grid = dict(learning_rate = [0.001,0.05,0.1],
                  n_estimators = [600,720,840])

LGB_best = GridSearchCV(LGB, param_grid,  cv=5)
LGB_best.fit(train, y_train)
print("Best parameters:", LGB_best.best_params_)
print("Best CV score (R^2):", LGB_best.best_score_)
print("Best estimator:", LGB_best.best_estimator_)

Model scores
n_folds = 5
def rmsle_cv(model):
    # Pass the KFold object itself as cv; the original called get_n_splits(),
    # which returns just the integer 5 and silently drops the shuffling
    kf = KFold(n_folds, shuffle=True, random_state=42)
    # RMSE on the log-transformed target is exactly the competition's RMSLE
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse

# KRR: re-instantiated with the parameters found by the grid searches above
KRR_best = KernelRidge(alpha=0.6, coef0=4, degree=2, kernel='polynomial')
score = rmsle_cv(KRR_best)
print("\nKRR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# GBoost
GBoost_best = GradientBoostingRegressor(alpha=0.9,learning_rate=0.01, loss='huber', 
              max_depth=3,max_features='sqrt', min_samples_leaf=10, min_samples_split=5,
              n_estimators=4000,random_state=5)
score = rmsle_cv(GBoost_best)
print("\nGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# XGBoost
XGB_best = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, 
       max_depth=3, min_child_weight=1.7817,n_estimators=3500,nthread=-1,
       random_state=7, reg_alpha=0.464,reg_lambda=0.8571,subsample=0.5213)
score = rmsle_cv(XGB_best)
print("\nXGB score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

# LightGBM
LGB_best = lgb.LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
           feature_fraction=0.2319,feature_fraction_seed=9, learning_rate=0.05, 
           max_bin=55,min_data_in_leaf=6,min_sum_hessian_in_leaf=11,n_estimators=720, 
           num_leaves=5,objective='regression')
score = rmsle_cv(LGB_best)
print("\nLGB score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Ensemble models

Stacking models
estimators = [('krr', KernelRidge(alpha=0.6, coef0=4, degree=2, kernel='polynomial')),
              ('lgb', lgb.LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
           feature_fraction=0.2319,feature_fraction_seed=9, learning_rate=0.05, 
           max_bin=55,min_data_in_leaf=6,min_sum_hessian_in_leaf=11,n_estimators=720, 
           num_leaves=5,objective='regression')),
             ('xgb',xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, 
       max_depth=3, min_child_weight=1.7817,n_estimators=3500,nthread=-1,
       random_state=7, reg_alpha=0.464,reg_lambda=0.8571,subsample=0.5213))]
sta = StackingRegressor(estimators=estimators,
        final_estimator=GradientBoostingRegressor(alpha=0.9,learning_rate=0.01, loss='huber', 
              max_depth=3,max_features='sqrt', min_samples_leaf=10, min_samples_split=5,
              n_estimators=4000,random_state=5))
sta.fit(train,y_train)
score = rmsle_cv(sta)
print("\nsta score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Voting models
vot = VotingRegressor(estimators=[('krr', KernelRidge(alpha=0.6, coef0=4, degree=2, kernel='polynomial')),
              ('lgb', lgb.LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
           feature_fraction=0.2319,feature_fraction_seed=9, learning_rate=0.05, 
           max_bin=55,min_data_in_leaf=6,min_sum_hessian_in_leaf=11,n_estimators=720, 
           num_leaves=5,objective='regression')),
             ('GBoost',GradientBoostingRegressor(alpha=0.9,learning_rate=0.01, loss='huber', 
              max_depth=3,max_features='sqrt', min_samples_leaf=10, min_samples_split=5,
              n_estimators=4000,random_state=5)),
            ('xgb',xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, 
       max_depth=3, min_child_weight=1.7817,n_estimators=3500,nthread=-1,
       random_state=7, reg_alpha=0.464,reg_lambda=0.8571,subsample=0.5213))])
vot.fit(train,y_train)
score = rmsle_cv(vot)
print("\nvot score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Weighted combination
# RMSE on the log-scale target, i.e. the competition's RMSLE
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))
# GBoost
GBoost_best.fit(train, y_train)
GBoost_train_pred = GBoost_best.predict(train)
GBoost_pred = np.expm1(GBoost_best.predict(test.values))
print(rmsle(y_train, GBoost_train_pred))
# KRR
KRR_best.fit(train, y_train)
KRR_train_pred = KRR_best.predict(train)
KRR_pred = np.expm1(KRR_best.predict(test.values))
print(rmsle(y_train, KRR_train_pred))
# LightGBM
LGB_best.fit(train, y_train)
LGB_train_pred = LGB_best.predict(train)
LGB_pred = np.expm1(LGB_best.predict(test.values))
print(rmsle(y_train, LGB_train_pred))
# voting
vot.fit(train.values, y_train)
vot_train_pred = vot.predict(train.values)
vot_pred = np.expm1(vot.predict(test.values))
print(rmsle(y_train, vot_train_pred))
# Training-set score of the weighted combination
print(rmsle(y_train, vot_train_pred*0.6 + GBoost_train_pred*0.2 + LGB_train_pred*0.2))
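The Id columns saved at the start can now be used to assemble a Kaggle submission. A minimal sketch (not in the original), assuming the same 0.6/0.2/0.2 weighting for the test-set predictions and the competition's Id/SalePrice submission format:

ensemble = vot_pred*0.6 + GBoost_pred*0.2 + LGB_pred*0.2
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': ensemble})
sub.to_csv('submission.csv', index=False)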