
kaggle: Costa Rican Household Poverty Level Prediction (2): Baseline


Continuing from the previous post: after the simple EDA, we start training the model, using LightGBM.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
train = pd.read_csv('../fix_train.csv')
test = pd.read_csv('../fix_test.csv')
feature_desp = pd.read_csv('../feature_description.csv', error_bad_lines=False,index_col='F_name')
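Note: error_bad_lines was deprecated in pandas 1.3 and removed in 2.0. Assuming the intent is still to drop malformed lines, a newer-pandas equivalent would be ('warn' additionally reports each skipped line):

# pandas >= 1.3 replacement for error_bad_lines=False
feature_desp = pd.read_csv('../feature_description.csv', on_bad_lines='skip', index_col='F_name')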

Feature categories

hh : household features

ind : individual features

ids : Id, idhogar, Target

ind_bool = ['v18q', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 
            'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 
            'parentesco1', 'parentesco2',  'parentesco3', 'parentesco4', 'parentesco5', 
            'parentesco6', 'parentesco7', 'parentesco8',  'parentesco9', 'parentesco10', 
            'parentesco11', 'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 
            'instlevel4', 'instlevel5', 'instlevel6', 'instlevel7', 'instlevel8', 
            'instlevel9', 'mobilephone']

ind_non_bool = ['rez_esc', 'escolari', 'age','SQBescolari','SQBage','agesq']

hh_bool = ['hacdor', 'hacapo', 'v14a', 'refrig', 'paredblolad', 'paredzocalo', 
           'paredpreb','pisocemento', 'pareddes', 'paredmad',
           'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisoother', 
           'pisonatur', 'pisonotiene', 'pisomadera',
           'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 
           'abastaguadentro', 'abastaguafuera', 'abastaguano',
            'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 
           'sanitario2', 'sanitario3', 'sanitario5',   'sanitario6',
           'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 
           'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 
           'elimbasu5', 'elimbasu6', 'epared1', 'epared2', 'epared3',
           'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 
           'tipovivi1', 'tipovivi2', 'tipovivi3', 'tipovivi4', 'tipovivi5', 
           'computer', 'television', 'lugar1', 'lugar2', 'lugar3',
           'lugar4', 'lugar5', 'lugar6', 'area1', 'area2']

hh_non_bool = ['v2a1', 'v18q1', 'meaneduc', 'SQBovercrowding', 'SQBdependency',
               'SQBmeaned', 'overcrowding', 'rooms', 'r4h1', 'r4h2', 'r4h3', 'r4m1',
               'r4m2', 'r4m3', 'r4t1', 'r4t2', 'r4t3', 'tamhog', 'tamviv', 'hhsize',
               'hogar_nin', 'hogar_adul', 'hogar_mayor', 'hogar_total',  'bedrooms',
               'qmobilephone', 'SQBhogar_total', 'SQBedjefe', 'SQBhogar_nin']

hh_cont = [ 'dependency', 'edjefe', 'edjefa']


ids = ['Id', 'idhogar', 'Target']

Merge train and test data

test['Target'] = np.nan
data = train.append(test)
data.info()
train.idhogar.nunique(), test.idhogar.nunique(), data.idhogar.nunique()
(2988, 7352, 10340)
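Note: DataFrame.append was removed in pandas 2.0; on newer versions the equivalent concatenation is:

# pandas >= 2.0 replacement for train.append(test)
data = pd.concat([train, test])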

Missing value counts

# columns with at least one missing value
miss_count = data.isnull().sum() > 0
# missing-value counts for those columns
misvalue_counts = data.isnull().sum()[miss_count]
# percentage of missing values
misvalue_percent = misvalue_counts/data.shape[0]*100

misvalue_percent
v2a1         72.615449
v18q1        76.221830
rez_esc      82.545716
meaneduc      0.107742
SQBmeaned     0.107742
Target       71.397360
dtype: float64

Fill missing values

Target is missing for exactly the appended test rows (71.40%), so only v2a1, v18q1, rez_esc, meaneduc and SQBmeaned actually need imputation.

from sklearn.preprocessing import MinMaxScaler,Imputer

imputer = Imputer(missing_values=np.nan, strategy='mean', axis = 0)
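Note: Imputer was deprecated in scikit-learn 0.20 and removed in 0.22. On newer versions, an equivalent sketch uses SimpleImputer, which always imputes column-wise (so there is no axis argument):

# scikit-learn >= 0.22 replacement for preprocessing.Imputer
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')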

Train data

  • hh_ : household level data
  • ind_ : individual level data
  • hh_train : with missing values
  • ind_train : with missing values
  • hh_train_df : after imputation
  • ind_train_df : after imputation
hh_train = train.loc[train.parentesco1 == 1, ids+hh_bool+hh_non_bool+hh_cont].reset_index()

target = hh_train[['Target']]
hh_train_ids = hh_train[['idhogar']]
# before filling missing values
hh_train = hh_train.drop(['Id','idhogar','Target','index'], axis=1)
# after filling missing values
hh_train_df = pd.DataFrame(imputer.fit_transform(hh_train),columns=list(hh_train.columns))

# add idhogar and Target columns
hh_train['idhogar'] = hh_train_ids
hh_train_df['idhogar'] = hh_train_ids
hh_train['Target'] = target
hh_train_df['Target'] = target
# individual level data on the train set
ind_train = train.loc[ :, ids+ind_bool+ind_non_bool].reset_index()

ind_train_ids = ind_train[['idhogar']]
ind_target = ind_train[['Target']]

# before filling missing values; drop the old index
ind_train = ind_train.drop(['Id','idhogar','Target','index'], axis=1)

# after filling missing values
ind_train_df=pd.DataFrame(imputer.fit_transform(ind_train),columns=list(ind_train.columns))

# add idhogar, Target
ind_train['idhogar'] = ind_train_ids
ind_train['Target'] = ind_target
ind_train_df['idhogar'] = ind_train_ids
ind_train_df['Target'] = ind_target

KDE before and after filling missing values

  • hh_train : before imputation
  • hh_train_df : after imputation
    • v2a1
    • v18q1
    • meaneduc
    • SQBmeaned
from collections import OrderedDict

mis_cols = ['v2a1','v2a1','v18q1','v18q1','meaneduc','meaneduc','SQBmeaned','SQBmeaned']


# Color mapping
colors = OrderedDict({1: 'red', 2: 'orange', 3: 'blue', 4: 'green'})
label_mapping = OrderedDict({1: 'extreme', 2: 'moderate', 3: 'vulnerable', 
                               4: 'non vulnerable'})
#----------------------------------------------------------------------------

plt.figure(figsize = (12, 7))
for i, col in enumerate(mis_cols):
    ax = plt.subplot(4, 2, i + 1)
    # Iterate through the poverty levels
    for poverty_level, color in colors.items():
        # kernel density estimate for each poverty level
        if (i%2 == 0):
            sns.kdeplot(hh_train.loc[hh_train.Target == poverty_level, col].dropna(), 
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s before filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
        else :
            sns.kdeplot(hh_train_df.loc[hh_train_df.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s after filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
plt.subplots_adjust(top = 2.5)

[Figure: KDE of v2a1, v18q1, meaneduc and SQBmeaned by poverty level, before vs. after imputation]

Individual-level data: KDE before and after imputation

  • ind_train : before imputation
  • ind_train_df : after imputation
    • rez_esc
cols = ['rez_esc','rez_esc']
plt.figure(figsize=(14, 2.5))
for i, col in enumerate(cols):
    ax = plt.subplot(1, 2, i + 1)
    for poverty_level, color in colors.items():
        if (i%2 == 0):
            sns.kdeplot(ind_train.loc[ind_train.Target == poverty_level, col].dropna(), 
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s before filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
        else :
            sns.kdeplot(ind_train_df.loc[ind_train_df.Target == poverty_level, col].dropna(),
                        ax = ax, color = color, label = label_mapping[poverty_level])
            plt.title('%s after filling KDE'%(col.capitalize()))
            plt.xlabel('%s'%col)
            plt.ylabel('Density')
plt.subplots_adjust(top = 2)

[Figure: KDE of rez_esc by poverty level, before vs. after imputation]

Test data
18 households in the test data have no designated head of household. To align hh_test with ind_test, we need to designate one for each.

  • hh_test : 填充前
  • hh_test_df : 填充后
  • ind_test :
  • ind_test_df :
# households in the test data whose head-of-household flag never appears
mis_hh = test.groupby(by='idhogar').parentesco1.agg('sum')==0

# idhogar of the households missing a head
mis_idhogar = test.groupby(by='idhogar').parentesco1.agg('sum')[mis_hh].index

The following 26 rows come from the test-set households that lack a head of household:

pd.options.display.max_columns = 10
test.loc[test.idhogar.isin(mis_idhogar),:][['Id','idhogar','parentesco1']].sort_values(by='idhogar')
Id idhogar parentesco1
22791 ID_99d27ab2f 0e2a3453d 0
22790 ID_f09603838 0e2a3453d 0
15544 ID_49d05f9e6 198fc274a 0
18735 ID_b0874f522 2dc45d484 0
18643 ID_ceeb5dfe2 5a667591a 0
23547 ID_aa8f26c06 676750a21 0
23253 ID_e42c1dde2 91aff0a8e 0
23252 ID_bbc0959ef 91aff0a8e 0
15090 ID_9c12f6ebc 9d874b0d6 0
22833 ID_26d95edff b115b4536 0
12753 ID_93fa2f7cc b59f5b526 0
17053 ID_bca8a1dde ce6154327 0
23711 ID_4036d87e3 d14b3e03a 0
22006 ID_9f025fde6 d678c45ad 0
17163 ID_6094ce990 df06e01c6 0
17162 ID_d6cbeec15 df06e01c6 0
17132 ID_00e8a868f e3f69768c 0
19318 ID_d0beee31f e4df1caaf 0
19317 ID_3805bdb08 e4df1caaf 0
21654 ID_894de66bc f2fd28dbb 0
21655 ID_56a407d03 f2fd28dbb 0
21656 ID_960e558e0 f2fd28dbb 0
21657 ID_cc28b0331 f2fd28dbb 0
18549 ID_aa650fb4a f6d6fad32 0
19299 ID_139a474f3 fc6c8d241 0
19300 ID_f447c7c54 fc6c8d241 0

Below, one member of each of these 18 households is designated as head of household.

head_ids = ['ID_99d27ab2f', 'ID_49d05f9e6', 'ID_b0874f522', 'ID_ceeb5dfe2',
            'ID_aa8f26c06', 'ID_e42c1dde2', 'ID_9c12f6ebc', 'ID_26d95edff',
            'ID_93fa2f7cc', 'ID_bca8a1dde', 'ID_4036d87e3', 'ID_9f025fde6',
            'ID_6094ce990', 'ID_00e8a868f', 'ID_d0beee31f', 'ID_894de66bc',
            'ID_aa650fb4a', 'ID_139a474f3']
test.loc[test.Id.isin(head_ids), 'parentesco1'] = 1
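An alternative to hard-coding the Ids (a sketch, assuming we simply promote the oldest member of each head-less household):

# Sketch: designate the oldest member of each head-less household as its head
for hh in mis_idhogar:
    oldest = test.loc[test.idhogar == hh, 'age'].idxmax()   # row label of the oldest member
    test.loc[oldest, 'parentesco1'] = 1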

Filling missing values on the test set

# household level test data
hh_test = test.loc[test.parentesco1 == 1, ids+hh_bool+hh_non_bool+hh_cont].reset_index()

hh_test_ids = hh_test[['idhogar']]

hh_test = hh_test.drop(['Id','idhogar','Target','index'], axis = 1)

# filling missing values
hh_test_df = pd.DataFrame(imputer.fit_transform(hh_test),columns=list(hh_test.columns))

# add idhogar columns
hh_test_df['idhogar'] = hh_test_ids
hh_test['idhogar'] = hh_test_ids
# individual level test data
ind_test = test.loc[:, ids+ind_bool+ind_non_bool].reset_index()

ind_test_ids = ind_test[['idhogar']]
ind_test = ind_test.drop(['Id','idhogar','Target','index'], axis = 1)
ind_test_df = pd.DataFrame(imputer.fit_transform(ind_test),columns=list(ind_test.columns))

# add idhogar columns
ind_test['idhogar'] = ind_test_ids
ind_test_df['idhogar'] = ind_test_ids

Create new individual features

Individual train data

ind_train_groupobj = ind_train_df.groupby(by='idhogar')

ind_train_data = pd.DataFrame({'idhogar':ind_train_df.idhogar.unique()})
def AddFeatures(feature_df, cols, funcs, groupobj):
    # For every (function, column) pair, aggregate the column per household
    # and merge the result onto feature_df, named like 'escolari_mean'.
    for func in funcs:
        for col in cols:
            group_object = groupobj[col].agg(func).reset_index()
            group_object.rename(index=str, columns={col:col+'_'+func}, inplace=True)
            feature_df = feature_df.merge(group_object, on='idhogar', how='left')
    return feature_df
# individual bool features: mean and sum per household
ind_train_data = AddFeatures(ind_train_data, ind_bool, ['mean','sum'], ind_train_groupobj)

# individual non-bool features: a wider set of aggregations
funcs = ['mean','min','max','median','sum','nunique']
ind_train_data = AddFeatures(ind_train_data, ind_non_bool, funcs, ind_train_groupobj)
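For intuition, a toy run of AddFeatures on hypothetical data:

# Toy demo of AddFeatures (hypothetical data)
toy = pd.DataFrame({'idhogar': ['a', 'a', 'b'], 'escolari': [6, 10, 8]})
demo = AddFeatures(pd.DataFrame({'idhogar': ['a', 'b']}),
                   ['escolari'], ['mean', 'max'], toy.groupby('idhogar'))
# demo columns: idhogar, escolari_mean, escolari_max
# household 'a' -> 8.0, 10 ; household 'b' -> 8.0, 8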
Individual test data
ind_test_groupobj = ind_test_df.groupby(by='idhogar')
ind_test_data = pd.DataFrame({'idhogar':ind_test_df.idhogar.unique()})

ind_test_data = AddFeatures(ind_test_data, ind_bool, ['mean','sum'], ind_test_groupobj)

ind_test_data = AddFeatures(ind_test_data, ind_non_bool, funcs, ind_test_groupobj)

Merge household and individual data

train_data = hh_train_df.merge(ind_train_data, on = 'idhogar', how='left')
test_data = hh_test_df.merge(ind_test_data, on = 'idhogar', how='left')

Now train the model with LightGBM

import gc
import lightgbm as lgb
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

def model(train_data, test_data, n_folds = 10):
    # household id
    train_ids = train_data[['idhogar']]
    test_ids = test_data[['idhogar']]
    # Target/label
    labels = train_data[['Target']].astype(int)
    # drop idhogar, Target
    train_data = train_data.drop(['idhogar','Target'],axis = 1)
    test_data = test_data.drop(['idhogar'], axis = 1)
    # feature columns name
    feature_names = list(train_data.columns)
    # 10 folds cross validation
    k_fold = KFold(n_splits = n_folds, shuffle = True, random_state = 2018)
    # test predictions
    test_predictions = list()
    # validation predictions
    out_of_fold = np.zeros(train_data.shape[0])
    # Empty array for feature importances
    feature_importance_values = np.zeros(len(feature_names))
    # record macro-F1 scores
    Valid_F1 = []
    Train_F1 = []
    # lightgbm does not support f1_macro directly, so also track multi_error
    Valid_Score = []
    Train_Score = []
    # Iterate through each fold
    for train_indices, valid_indices in k_fold.split(train_data):
        # Training data for the fold
        train_features = train_data.loc[train_indices, :]
        train_labels = labels.loc[train_indices, :]
        # Validation data for the fold
        valid_features = train_data.loc[valid_indices, :]
        valid_labels = labels.loc[valid_indices, :]
        # Create the model
        model = lgb.LGBMClassifier(boosting_type='gbdt',n_estimators=1000, 
                                   objective = 'multiclass', class_weight = 'balanced',
                                   learning_rate = 0.03,  num_leaves = 31,
                                   reg_alpha = 0.1, reg_lambda = 0.3, num_class = 4,
                                   subsample = 0.8, n_jobs = -1, random_state = 2018)

        # Train the model
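        # Note (version assumption): lightgbm >= 4.0 removed the
        # early_stopping_rounds/verbose arguments from fit(); there, use
        # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] instead.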
        model.fit(train_features, train_labels, eval_metric = 'multi_error',
                  eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names = ['valid', 'train'], categorical_feature = 'auto',
                  early_stopping_rounds = 100, verbose = 200)
        # Record the best iteration
        best_iteration = model.best_iteration_
        # predict on the test set using the best iteration
        test_predictions.append(model.predict(test_data, num_iteration = best_iteration))
        # feature importance
        feature_importance_values += model.feature_importances_ /n_folds
        # Record the best multi error
        valid_score = model.best_score_['valid']['multi_error']
        train_score = model.best_score_['train']['multi_error']
        Valid_Score.append(valid_score)
        Train_Score.append(train_score)
        # Record F1_macro score
        pred_valid = model.predict(valid_features, num_iteration = best_iteration)
        pred_train = model.predict(train_features, num_iteration = best_iteration)
        valid_f1 = f1_score(valid_labels, pred_valid, average='macro')
        train_f1 = f1_score(train_labels, pred_train, average='macro')
        Valid_F1.append(valid_f1)
        Train_F1.append(train_f1)

        # validation set result
        out_of_fold[valid_indices] = pred_valid

        # Clean up memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
        print('................................................')

    # feature importance
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    # overall scores: out-of-fold F1 and mean fold errors
    Valid_F1.append(f1_score(labels, out_of_fold, average='macro'))
    Train_F1.append(np.mean(Train_F1))
    Valid_Score.append(np.mean(Valid_Score))
    Train_Score.append(np.mean(Train_Score))
    # fold labels, plus an 'overall' row
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train error': Train_Score,
                            'valid error': Valid_Score,
                            'train f1' : Train_F1,
                            'valid f1' : Valid_F1}) 

    # make submission.csv: majority vote across the folds' test predictions
    predict_df = pd.DataFrame(np.array(test_predictions).T)
    voting_result = [predict_df.iloc[x,:].value_counts().idxmax() for x in range(predict_df.shape[0])]
    submission = test_ids.copy()
    submission['Target'] = voting_result
    # metrics, feature importances, household-level predictions
    return metrics, feature_importances, submission
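For intuition, the per-row majority vote at the end of model() works like this (toy sketch with hypothetical fold predictions):

# Toy sketch of the majority vote: rows = test households, columns = folds
demo_preds = pd.DataFrame([[4, 4, 2],
                           [2, 1, 2]])
demo_vote = [demo_preds.iloc[x,:].value_counts().idxmax()
             for x in range(demo_preds.shape[0])]
# demo_vote == [4, 2]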
metric, feature_importance, submission = model(train_data, test_data, 10)
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0517394  valid's multi_error: 0.365772
Early stopping, best iteration is:
[245]   train's multi_error: 0.0368913  valid's multi_error: 0.352349
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0458956  valid's multi_error: 0.389262
[400]   train's multi_error: 0.012987   valid's multi_error: 0.38255
Early stopping, best iteration is:
[433]   train's multi_error: 0.0108696  valid's multi_error: 0.369128
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0474782  valid's multi_error: 0.436242
[400]   train's multi_error: 0.012372   valid's multi_error: 0.395973
Early stopping, best iteration is:
[399]   train's multi_error: 0.012372   valid's multi_error: 0.395973
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0463667  valid's multi_error: 0.43771
[400]   train's multi_error: 0.013058   valid's multi_error: 0.420875
Early stopping, best iteration is:
[393]   train's multi_error: 0.0134813  valid's multi_error: 0.417508
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0446141  valid's multi_error: 0.363636
Early stopping, best iteration is:
[286]   train's multi_error: 0.0266365  valid's multi_error: 0.329966
................................................
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[35]    train's multi_error: 0.216065   valid's multi_error: 0.37037
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0465465  valid's multi_error: 0.407407
[400]   train's multi_error: 0.0121707  valid's multi_error: 0.360269
Early stopping, best iteration is:
[478]   train's multi_error: 0.00687285 valid's multi_error: 0.346801
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0458219  valid's multi_error: 0.393939
[400]   train's multi_error: 0.0107405  valid's multi_error: 0.350168
Early stopping, best iteration is:
[417]   train's multi_error: 0.00989259 valid's multi_error: 0.340067
................................................
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[26]    train's multi_error: 0.230932   valid's multi_error: 0.410774
................................................
Training until validation scores don't improve for 100 rounds.
[200]   train's multi_error: 0.0458289  valid's multi_error: 0.380471
Early stopping, best iteration is:
[226]   train's multi_error: 0.0344418  valid's multi_error: 0.360269
................................................
metric
    fold     train error  train f1  valid error  valid f1
0   0        0.036891     0.895244  0.352349     0.470469
1   1        0.010870     0.965888  0.369128     0.459939
2   2        0.012372     0.959736  0.395973     0.377560
3   3        0.013481     0.957862  0.417508     0.401015
4   4        0.026636     0.922596  0.329966     0.408451
5   5        0.216065     0.683224  0.370370     0.487074
6   6        0.006873     0.979038  0.346801     0.392146
7   7        0.009893     0.968767  0.340067     0.467333
8   8        0.230932     0.672731  0.410774     0.451792
9   9        0.034442     0.899205  0.360269     0.447325
10  overall  0.059845     0.890429  0.369321     0.440105

Make submission.csv

The predictions are at the household level, so merge them back onto every individual via idhogar:

submit = test[['Id','idhogar']]

submit = submit.merge(submission, on = 'idhogar')

submit = submit.drop(['idhogar'],axis = 1)

submit.to_csv('../submit_0.csv',index = False)

Public score

[Figure: public leaderboard score screenshot]
Feature Importances

feature_importance = feature_importance.sort_values(by = 'importance')

feature_importance.set_index('feature').plot(kind='barh', figsize=(10, 40))
plt.title('Feature Importances')
Text(0.5,1,'Feature Importances')

[Figure: feature importance bar chart]