Kaggle: Costa Rican Household Poverty Level Prediction (2): Baseline
Continuing from the previous post: after the simple EDA there, we move on to training a model, using LightGBM.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
train = pd.read_csv('../fix_train.csv')
test = pd.read_csv('../fix_test.csv')
feature_desp = pd.read_csv('../feature_description.csv', error_bad_lines=False,index_col='F_name')
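A portability note: error_bad_lines was deprecated in pandas 1.3 and later removed. On newer pandas, the equivalent call (a sketch, same file and index column assumed) is:
# pandas >= 1.3: on_bad_lines='skip' replaces error_bad_lines=False
feature_desp = pd.read_csv('../feature_description.csv', on_bad_lines='skip',
                           index_col='F_name')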
Feature groups
- hh : household-level features
- ind : individual-level features
- ids : Id, idhogar, Target
ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3',
'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7',
'parentesco1', 'parentesco2', 'parentesco3', 'parentesco4', 'parentesco5',
'parentesco6', 'parentesco7', 'parentesco8', 'parentesco9', 'parentesco10',
'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3',
'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8',
'instlevel9', 'mobilephone']
ind_non_bool = ['rez_esc', 'escolari', 'age','SQBescolari','SQBage','agesq']
hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo',
'paredpreb','pisocemento', 'pareddes', 'paredmad',
'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother',
'pisonatur', 'pisonotiene', 'pisomadera',
'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo',
'abastaguadentro', 'abastaguafuera', 'abastaguano',
'public', 'planpri', 'noelec', 'coopele', 'sanitario1',
'sanitario2', 'sanitario3', 'sanitario5', 'sanitario6',
'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4',
'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4',
'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3',
'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5',
'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
'lugar4', 'lugar5', 'lugar6', 'area1', 'area2']
hh_non_bool = ['v2a1', 'v18q1', 'meaneduc', 'SQBovercrowding', 'SQBdependency',
'SQBmeaned', 'overcrowding', 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1',
'r4m2', 'r4m3', 'r4t1', 'r4t2', 'r4t3', 'tamhog', 'tamviv', 'hhsize',
'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total', 'bedrooms',
'qmobilephone', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin']
hh_cont = [ 'dependency', 'edjefe', 'edjefa']
ids = ['Id', 'idhogar', 'Target']
Merge train and test data
test['Target'] = np.nan
data = train.append(test)
data.info()
train.idhogar.nunique(), test.idhogar.nunique(), data.idhogar.nunique()
(2988, 7352, 10340)
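A similar portability note: DataFrame.append was removed in pandas 2.0. The equivalent concatenation (a sketch mirroring append's default index handling) is:
# pandas >= 2.0: pd.concat replaces the removed DataFrame.append
data = pd.concat([train, test])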
Missing value counts
# columns with at least one missing value
miss_count = data.isnull().sum() > 0
# missing value count per column
misvalue_counts = data.isnull().sum()[miss_count]
# missing value percentage
misvalue_percent = misvalue_counts/data.shape[0]*100
misvalue_percent
v2a1 72.615449
v18q1 76.221830
rez_esc 82.545716
meaneduc 0.107742
SQBmeaned 0.107742
Target 71.397360
dtype: float64
Fill missing values
Note that Target is only "missing" on the appended test rows (71.4% is exactly the test share of the combined data); v2a1, v18q1 and rez_esc are genuinely sparse.
from sklearn.preprocessing import MinMaxScaler,Imputer
imputer = Imputer(missing_values=np.nan, strategy='mean', axis = 0)
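Imputer itself was removed from scikit-learn in 0.22. On newer versions, the equivalent column-mean imputer (a sketch) lives in sklearn.impute:
# scikit-learn >= 0.22: SimpleImputer replaces the removed Imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')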
Train data
- hh_ : household-level data
- ind_ : individual-level data
- hh_train : with missing values
- ind_train : with missing values
- hh_train_df : after imputation
- ind_train_df : after imputation
hh_train = train.loc[train.parentesco1 == 1, ids+hh_bool+hh_non_bool+hh_cont].reset_index()
target = hh_train[['Target']]
hh_train_ids = hh_train[['idhogar']]
# before filling missing values
hh_train = hh_train.drop(['Id','idhogar','Target','index'], axis=1)
# after filling missing values
hh_train_df = pd.DataFrame(imputer.fit_transform(hh_train),columns=list(hh_train.columns))
# add idhogar and Target columns
hh_train['idhogar'] = hh_train_ids
hh_train_df['idhogar'] = hh_train_ids
hh_train['Target'] = target
hh_train_df['Target'] = target
# individual-level data on the train set
ind_train = train.loc[ :, ids+ind_bool+ind_non_bool].reset_index()
ind_train_ids = ind_train[['idhogar']]
ind_target = ind_train[['Target']]
# before filling missing values; drop the old index
ind_train = ind_train.drop(['Id','idhogar','Target','index'], axis=1)
# after filling missing values
ind_train_df=pd.DataFrame(imputer.fit_transform(ind_train),columns=list(ind_train.columns))
# add idhogar, Target
ind_train['idhogar'] = ind_train_ids
ind_train['Target'] = ind_target
ind_train_df['idhogar'] = ind_train_ids
ind_train_df['Target'] = ind_target
KDE plots before and after filling missing values
- hh_train : before imputation
- hh_train_df : after imputation
- v2a1
- v18q1
- meaneduc
- SQBmeaned
from collections import OrderedDict
mis_cols = ['v2a1','v2a1','v18q1','v18q1','meaneduc','meaneduc','SQBmeaned','SQBmeaned']
# Color mapping
colors = OrderedDict({1: 'red', 2: 'orange', 3: 'blue', 4: 'green'})
label_mapping = OrderedDict({1: 'extreme', 2: 'moderate', 3: 'vulnerable',
4: 'non vulnerable'})
#----------------------------------------------------------------------------
plt.figure(figsize = (12, 7))
for i, col in enumerate(mis_cols):
    ax = plt.subplot(4, 2, i + 1)
    # iterate through the poverty levels
    for poverty_level, color in colors.items():
        # kernel density estimate; even subplots show the raw data, odd the imputed data
        if (i % 2 == 0):
            sns.kdeplot(hh_train.loc[hh_train.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s before filling KDE' % (col.capitalize()))
        else:
            sns.kdeplot(hh_train_df.loc[hh_train_df.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s after filling KDE' % (col.capitalize()))
        plt.xlabel('%s' % col)
        plt.ylabel('Density')
plt.subplots_adjust(top = 2.5)
KDE plots before and after imputation on the individual-level data
- ind_train : before imputation
- ind_train_df : after imputation
- rez_esc
cols = ['rez_esc', 'rez_esc']
plt.figure(figsize=(14, 2.5))
for i, col in enumerate(cols):
    ax = plt.subplot(1, 2, i + 1)
    for poverty_level, color in colors.items():
        if (i % 2 == 0):
            sns.kdeplot(ind_train.loc[ind_train.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s before filling KDE' % (col.capitalize()))
        else:
            sns.kdeplot(ind_train_df.loc[ind_train_df.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s after filling KDE' % (col.capitalize()))
        plt.xlabel('%s' % col)
        plt.ylabel('Density')
plt.subplots_adjust(top = 2)
Test data
18 households in the test data have no designated head of household. To align hh_test with ind_test, we need to designate one.
- hh_test : before imputation
- hh_test_df : after imputation
- ind_test : before imputation
- ind_test_df : after imputation
# some households in the test data have no designated head of household, so we assign one
mis_hh = test.groupby(by='idhogar').parentesco1.agg('sum')==0
# idhogar of the households missing a head
mis_idhogar = test.groupby(by='idhogar').parentesco1.agg('sum')[mis_hh].index
The 26 rows below are the members of the test-set households that are missing a head of household:
pd.options.display.max_columns = 10
test.loc[test.idhogar.isin(mis_idhogar),:][['Id','idhogar','parentesco1']].sort_values(by='idhogar')
| | Id | idhogar | parentesco1 |
|---|---|---|---|
| 22791 | ID_99d27ab2f | 0e2a3453d | 0 |
| 22790 | ID_f09603838 | 0e2a3453d | 0 |
| 15544 | ID_49d05f9e6 | 198fc274a | 0 |
| 18735 | ID_b0874f522 | 2dc45d484 | 0 |
| 18643 | ID_ceeb5dfe2 | 5a667591a | 0 |
| 23547 | ID_aa8f26c06 | 676750a21 | 0 |
| 23253 | ID_e42c1dde2 | 91aff0a8e | 0 |
| 23252 | ID_bbc0959ef | 91aff0a8e | 0 |
| 15090 | ID_9c12f6ebc | 9d874b0d6 | 0 |
| 22833 | ID_26d95edff | b115b4536 | 0 |
| 12753 | ID_93fa2f7cc | b59f5b526 | 0 |
| 17053 | ID_bca8a1dde | ce6154327 | 0 |
| 23711 | ID_4036d87e3 | d14b3e03a | 0 |
| 22006 | ID_9f025fde6 | d678c45ad | 0 |
| 17163 | ID_6094ce990 | df06e01c6 | 0 |
| 17162 | ID_d6cbeec15 | df06e01c6 | 0 |
| 17132 | ID_00e8a868f | e3f69768c | 0 |
| 19318 | ID_d0beee31f | e4df1caaf | 0 |
| 19317 | ID_3805bdb08 | e4df1caaf | 0 |
| 21654 | ID_894de66bc | f2fd28dbb | 0 |
| 21655 | ID_56a407d03 | f2fd28dbb | 0 |
| 21656 | ID_960e558e0 | f2fd28dbb | 0 |
| 21657 | ID_cc28b0331 | f2fd28dbb | 0 |
| 18549 | ID_aa650fb4a | f6d6fad32 | 0 |
| 19299 | ID_139a474f3 | fc6c8d241 | 0 |
| 19300 | ID_f447c7c54 | fc6c8d241 | 0 |
Below, one member of each of these 18 households is designated as head of household:
new_heads = ['ID_99d27ab2f', 'ID_49d05f9e6', 'ID_b0874f522', 'ID_ceeb5dfe2',
             'ID_aa8f26c06', 'ID_e42c1dde2', 'ID_9c12f6ebc', 'ID_26d95edff',
             'ID_93fa2f7cc', 'ID_bca8a1dde', 'ID_4036d87e3', 'ID_9f025fde6',
             'ID_6094ce990', 'ID_00e8a868f', 'ID_d0beee31f', 'ID_894de66bc',
             'ID_aa650fb4a', 'ID_139a474f3']
test.loc[test.Id.isin(new_heads), 'parentesco1'] = 1
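Hard-coding the 18 Ids works but is brittle if the data changes. A more general sketch, assuming we simply promote the first listed member of each head-less household (any member suffices for aligning hh_test with ind_test):
# sketch: promote the first listed member of every household without a head
first_members = test.loc[test.idhogar.isin(mis_idhogar)].groupby('idhogar').head(1).Id
test.loc[test.Id.isin(first_members), 'parentesco1'] = 1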
Filling missing values on the test set
# household level test data
hh_test = test.loc[test.parentesco1 == 1, ids+hh_bool+hh_non_bool+hh_cont].reset_index()
hh_test_ids = hh_test[['idhogar']]
hh_test = hh_test.drop(['Id','idhogar','Target','index'], axis = 1)
# fill missing values
hh_test_df = pd.DataFrame(imputer.fit_transform(hh_test),columns=list(hh_test.columns))
# add idhogar columns
hh_test_df['idhogar'] = hh_test_ids
hh_test['idhogar'] = hh_test_ids
# individual-level test data
ind_test = test.loc[:, ids+ind_bool+ind_non_bool].reset_index()
ind_test_ids = ind_test[['idhogar']]
ind_test = ind_test.drop(['Id','idhogar','Target','index'], axis = 1)
ind_test_df = pd.DataFrame(imputer.fit_transform(ind_test),columns=list(ind_test.columns))
# add idhogar columns
ind_test['idhogar'] = ind_test_ids
ind_test_df['idhogar'] = ind_test_ids
Create new individual features
Individual train data
ind_train_groupobj = ind_train_df.groupby(by='idhogar')
ind_train_data = pd.DataFrame({'idhogar':ind_train_df.idhogar.unique()})
def AddFeatures(feature_df, cols, funcs, groupobj):
    """For each column in cols and each aggregation in funcs, aggregate by
    household and merge the result onto feature_df as <col>_<func>."""
    for func in funcs:
        for col in cols:
            group_object = groupobj[col].agg(func).reset_index()
            group_object.rename(index=str, columns={col: col + '_' + func}, inplace=True)
            feature_df = feature_df.merge(group_object, on='idhogar', how='left')
    return feature_df
# individual bool features
ind_train_data = AddFeatures(ind_train_data, ind_bool, ['mean','sum'], ind_train_groupobj)
# individual non-bool features
funcs = ['mean','min','max','median','sum','nunique']
ind_train_data = AddFeatures(ind_train_data, ind_non_bool, funcs, ind_train_groupobj)
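As a quick check on AddFeatures, the aggregated columns follow the <col>_<func> naming scheme; for example, the age-derived aggregates can be listed with (a sketch):
# sketch: aggregated columns are named <col>_<func>, e.g. age_mean, age_max
sorted(c for c in ind_train_data.columns if c.startswith('age'))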
Individual test data
ind_test_groupobj = ind_test_df.groupby(by='idhogar')
ind_test_data = pd.DataFrame({'idhogar':ind_test_df.idhogar.unique()})
ind_test_data = AddFeatures(ind_test_data, ind_bool, ['mean','sum'], ind_test_groupobj)
ind_test_data = AddFeatures(ind_test_data, ind_non_bool, funcs, ind_test_groupobj)
Merge household and individual data
train_data = hh_train_df.merge(ind_train_data, on = 'idhogar', how='left')
test_data = hh_test_df.merge(ind_test_data, on = 'idhogar', how='left')
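Both sides of each merge are keyed one row per household, so a cheap sanity check (a sketch) that the joins did not duplicate rows:
# sketch: the merged frames should still have one row per household
assert train_data.idhogar.is_unique and test_data.idhogar.is_unique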
Now train the model with LightGBM
import gc
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
def model(train_data, test_data, n_folds = 10):
    # household ids
    train_ids = train_data[['idhogar']]
    test_ids = test_data[['idhogar']]
    # Target/label
    labels = train_data[['Target']].astype(int)
    # drop idhogar, Target
    train_data = train_data.drop(['idhogar','Target'], axis = 1)
    test_data = test_data.drop(['idhogar'], axis = 1)
    # feature column names
    feature_names = list(train_data.columns)
    # 10-fold cross validation
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 2018)
    # test predictions
    test_predictions = list()
    # out-of-fold validation predictions
    out_of_fold = np.zeros(train_data.shape[0])
    # empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    # record macro F1 scores
    Valid_F1 = []
    Train_F1 = []
    # LightGBM does not support f1_macro as an eval metric, so track multi_error too
    Valid_Score = []
    Train_Score = []
    # iterate through each fold
    for train_indices, valid_indices in k_fold.split(train_data):
        # training data for the fold
        train_features = train_data.loc[train_indices, :]
        train_labels = labels.loc[train_indices, :]
        # validation data for the fold
        valid_features = train_data.loc[valid_indices, :]
        valid_labels = labels.loc[valid_indices, :]
        # create the model
        model = lgb.LGBMClassifier(boosting_type='gbdt', n_estimators=1000,
                                   objective = 'multiclass', class_weight = 'balanced',
                                   learning_rate = 0.03, num_leaves = 31,
                                   reg_alpha = 0.1, reg_lambda = 0.3, num_class = 4,
                                   subsample = 0.8, n_jobs = -1, random_state = 2018)
        # train the model
        model.fit(train_features, train_labels, eval_metric = 'multi_error',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = 'auto',
                  early_stopping_rounds = 100, verbose = 200)
        # record the best iteration
        best_iteration = model.best_iteration_
        # predict on the test set
        test_predictions.append(model.predict(test_data, num_iteration = best_iteration))
        # accumulate feature importances across folds
        feature_importance_values += model.feature_importances_ / n_folds
        # record the best multi_error
        valid_score = model.best_score_['valid']['multi_error']
        train_score = model.best_score_['train']['multi_error']
        Valid_Score.append(valid_score)
        Train_Score.append(train_score)
        # record the macro F1 score
        pred_valid = model.predict(valid_features, num_iteration = best_iteration)
        pred_train = model.predict(train_features, num_iteration = best_iteration)
        valid_f1 = f1_score(valid_labels, pred_valid, average='macro')
        train_f1 = f1_score(train_labels, pred_train, average='macro')
        Valid_F1.append(valid_f1)
        Train_F1.append(train_f1)
        # keep the validation-set predictions for the overall score
        out_of_fold[valid_indices] = pred_valid
        # clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        print('................................................')
    # feature importances
    feature_importances = pd.DataFrame({'feature': feature_names,
                                        'importance': feature_importance_values})
    # overall validation scores
    Valid_F1.append(f1_score(labels, out_of_fold, average='macro'))
    Train_F1.append(np.mean(Train_F1))
    Valid_Score.append(np.mean(Valid_Score))
    Train_Score.append(np.mean(Train_Score))
    # dataframe of per-fold scores
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    metrics = pd.DataFrame({'fold': fold_names,
                            'train error': Train_Score,
                            'valid error': Valid_Score,
                            'train f1': Train_F1,
                            'valid f1': Valid_F1})
    # make submission.csv by majority vote over the fold predictions
    predict_df = pd.DataFrame(np.array(test_predictions).T)
    voting_result = [predict_df.iloc[x, :].value_counts().argmax() for x in range(predict_df.shape[0])]
    submission = test_ids.copy()
    submission['Target'] = voting_result
    # metrics, feature importances, household-level predictions
    return metrics, feature_importances, submission
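One pandas-version caveat in the voting step above: on the pandas of the time, value_counts().argmax() returned the index label, i.e. the majority class; on modern pandas, Series.argmax returns an integer position instead. idxmax is the unambiguous spelling, a sketch:
# sketch: majority vote with idxmax, which returns the label rather than a position
voting_result = [predict_df.iloc[x, :].value_counts().idxmax()
                 for x in range(predict_df.shape[0])]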
metric, feature_importance, submission = model(train_data, test_data, 10)
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0517394 valid's multi_error: 0.365772
Early stopping, best iteration is:
[245] train's multi_error: 0.0368913 valid's multi_error: 0.352349
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0458956 valid's multi_error: 0.389262
[400] train's multi_error: 0.012987 valid's multi_error: 0.38255
Early stopping, best iteration is:
[433] train's multi_error: 0.0108696 valid's multi_error: 0.369128
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0474782 valid's multi_error: 0.436242
[400] train's multi_error: 0.012372 valid's multi_error: 0.395973
Early stopping, best iteration is:
[399] train's multi_error: 0.012372 valid's multi_error: 0.395973
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0463667 valid's multi_error: 0.43771
[400] train's multi_error: 0.013058 valid's multi_error: 0.420875
Early stopping, best iteration is:
[393] train's multi_error: 0.0134813 valid's multi_error: 0.417508
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0446141 valid's multi_error: 0.363636
Early stopping, best iteration is:
[286] train's multi_error: 0.0266365 valid's multi_error: 0.329966
................................................
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[35] train's multi_error: 0.216065 valid's multi_error: 0.37037
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0465465 valid's multi_error: 0.407407
[400] train's multi_error: 0.0121707 valid's multi_error: 0.360269
Early stopping, best iteration is:
[478] train's multi_error: 0.00687285 valid's multi_error: 0.346801
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0458219 valid's multi_error: 0.393939
[400] train's multi_error: 0.0107405 valid's multi_error: 0.350168
Early stopping, best iteration is:
[417] train's multi_error: 0.00989259 valid's multi_error: 0.340067
................................................
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[26] train's multi_error: 0.230932 valid's multi_error: 0.410774
................................................
Training until validation scores don't improve for 100 rounds.
[200] train's multi_error: 0.0458289 valid's multi_error: 0.380471
Early stopping, best iteration is:
[226] train's multi_error: 0.0344418 valid's multi_error: 0.360269
................................................
metric
| | fold | train error | train f1 | valid error | valid f1 |
|---|---|---|---|---|---|
| 0 | 0 | 0.036891 | 0.895244 | 0.352349 | 0.470469 |
| 1 | 1 | 0.010870 | 0.965888 | 0.369128 | 0.459939 |
| 2 | 2 | 0.012372 | 0.959736 | 0.395973 | 0.377560 |
| 3 | 3 | 0.013481 | 0.957862 | 0.417508 | 0.401015 |
| 4 | 4 | 0.026636 | 0.922596 | 0.329966 | 0.408451 |
| 5 | 5 | 0.216065 | 0.683224 | 0.370370 | 0.487074 |
| 6 | 6 | 0.006873 | 0.979038 | 0.346801 | 0.392146 |
| 7 | 7 | 0.009893 | 0.968767 | 0.340067 | 0.467333 |
| 8 | 8 | 0.230932 | 0.672731 | 0.410774 | 0.451792 |
| 9 | 9 | 0.034442 | 0.899205 | 0.360269 | 0.447325 |
| 10 | overall | 0.059845 | 0.890429 | 0.369321 | 0.440105 |
Make submission.csv
submit = test[['Id','idhogar']]
submit = submit.merge(submission, on = 'idhogar')
submit = submit.drop(['idhogar'],axis = 1)
submit.to_csv('../submit_0.csv',index = False)
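A final sanity check (a sketch) before uploading: every test individual should receive exactly one prediction:
# sketch: one prediction per test row, no duplicate Ids
assert submit.shape[0] == test.shape[0] and submit.Id.is_unique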
Public score
Feature Importances
feature_importance = feature_importance.sort_values(by = 'importance')
feature_importance.set_index('feature').plot(kind='barh', figsize=(10, 40))
plt.title('Feature Importances')
Text(0.5,1,'Feature Importances')