Kaggle | IEEE Fraud Detection(Data Processing and Simplest Model)
IEEE Fraud Detection - Data Processing and Simplest Model
In this section we will clean the data, which is a necessary step before we build our models.
We examined the distributions of the features in the previous post, so now we can process the data using what we learned from those charts.
1 Prepare the Data
1.1 Import and Merge the Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import warnings
import time
# Suppress every warning for cleaner output; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')
# Directory holding the competition CSV files (trailing slash required,
# because file names are appended by plain string concatenation).
PATH = 'E:/kaggle/ieee-fraud-detection/'

# Read the four raw tables in the order: train transaction, train identity,
# test transaction, test identity.
tran_tr, iden_tr, tran_ts, iden_ts = (
    pd.read_csv(PATH + file_name)
    for file_name in (
        'train_transaction.csv',
        'train_identity.csv',
        'test_transaction.csv',
        'test_identity.csv',
    )
)

# Left-join identity information onto every transaction row.
train = tran_tr.merge(iden_tr, on='TransactionID', how='left')
test = tran_ts.merge(iden_ts, on='TransactionID', how='left')

print(f'Train: {train.shape[0]} rows {train.shape[1]} columns.')
print(f'Test: {test.shape[0]} rows {test.shape[1]} columns.')
Train: 590540 rows 434 columns.
Test: 506691 rows 433 columns.
We can see that the dataset is relatively large, so while processing the data we should take the efficiency into account.
1.2 Reduce the Memory
First we delete the datasets that we won’t use in the subsequent steps.
# The merged frames are all we need from here on; drop the raw tables.
del tran_tr, iden_tr
del tran_ts, iden_ts
This function is used to reduce the memory usage of the dataset, which can accelerate the process.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
def reduce_mem_usage(df, use_float16=False):
    """Downcast the numeric columns of ``df`` in place to save memory.

    Signed-integer columns are converted to the narrowest of int8/int16/int32
    that can hold the column's minimum and maximum (bounds are inclusive).
    Floating-point columns are converted to float32 (or float16 when
    ``use_float16`` is true) if their finite range allows it.  Every other
    column -- object, bool, unsigned, datetime, timedelta, categorical and
    other extension dtypes -- is left untouched, and a column is never
    converted to a wider dtype than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    use_float16 : bool, default False
        Allow float16 as a target for floating-point columns.

    Returns
    -------
    None
        Memory usage before/after and the percentage saved are printed.
    """
    int_targets = (np.int8, np.int16, np.int32)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        # Only plain NumPy signed-int ('i') and float ('f') dtypes are
        # downcast; this also skips datetime and categorical columns.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            candidates = [(target, np.iinfo(target)) for target in int_targets]
        else:
            candidates = [(target, np.finfo(target)) for target in float_targets]

        for target, limits in candidates:
            # Comparisons with NaN (all-missing or empty column) are False,
            # so such columns fall through and keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                if np.dtype(target).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
# Shrink both merged frames in place (train first, then test).
for _df in (train, test):
    reduce_mem_usage(_df)
Memory usage of dataframe is 1959.88 MB
Memory usage after optimization is: 1049.21 MB
Decreased by 46.5%
Memory usage of dataframe is 1677.73 MB
Memory usage after optimization is: 899.75 MB
Decreased by 46.4%
We can see that the memory usage has been reduced substantially, which will speed up processing when we build the models.
2 Data Preprocessing
2.1 Label Encoding
In this part we need to convert the categorical features to integers. Using the LabelEncoder from sklearn makes this easy.
# Label encoding: convert the categorical (text) features to integer codes.
# LabelEncoder is imported here because this step runs before the
# model-building imports further down the file.
from sklearn.preprocessing import LabelEncoder

for col in test.columns:
    is_text = any(
        pd.api.types.is_object_dtype(frame[col])
        or pd.api.types.is_string_dtype(frame[col])
        for frame in (train, test)
    )
    if not is_text:
        continue
    train_values = list(train[col].values)
    test_values = list(test[col].values)
    # Fit on the union of both sets so train and test share one code table.
    lb = LabelEncoder()
    lb.fit(train_values + test_values)
    train[col] = lb.transform(train_values)
    test[col] = lb.transform(test_values)

print(f'Train: {train.shape[0]} rows {train.shape[1]} columns.')
print(f'Test: {test.shape[0]} rows {test.shape[1]} columns.')
Train: 590540 rows 434 columns.
Test: 506691 rows 433 columns.
2.2 Drop Useless Columns and Fill the NaN
We can see that there are too many features in this dataset. Luckily, we visualized the distributions of those features earlier and analyzed what they may represent, so in this part we can choose the columns that are useful for our model.
# Columns kept for modelling: the target, transaction/card/address fields,
# e-mail domains, C1-C14, V95-V137, V279-V320 and the two device columns.
useful_cols = (
    ['isFraud', 'TransactionAmt', 'ProductCD']
    + [f'card{i}' for i in range(1, 7)]
    + ['addr1', 'addr2', 'dist1', 'P_emaildomain', 'R_emaildomain']
    + [f'C{i}' for i in range(1, 15)]
    + [f'V{i}' for i in range(95, 138)]
    + [f'V{i}' for i in range(279, 321)]
    + ['DeviceType', 'DeviceInfo']
)
For columns with missing values, fill them with average or mode (or the most frequent category).
def drop_columns(data, keep=None):
    """Remove, in place, every column of ``data`` that is not wanted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to prune; modified in place.
    keep : iterable of str, optional
        Column names to retain.  Defaults to the module-level ``useful_cols``.
        Names in ``keep`` that are absent from ``data`` are ignored.

    Returns
    -------
    None
    """
    wanted = set(useful_cols if keep is None else keep)
    unwanted = [col for col in data.columns if col not in wanted]
    # One drop call instead of one per column: each in-place drop rebuilds the
    # frame, which is very slow when hundreds of columns are removed.
    data.drop(columns=unwanted, inplace=True)
# Prune both frames down to the selected columns (train first, then test).
for _df in (train, test):
    drop_columns(_df)
def fill_na_mean(data):
    """Fill missing values with the column mean, in place.

    Applies to ``addr1``, ``card2``, ``card3``, ``card5``, ``V95``-``V137``
    and ``V279``-``V320``.  The mean is computed from ``data`` itself, so each
    frame passed in is filled with its own statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all of the columns listed above; modified in place.

    Returns
    -------
    None
    """
    col = ['addr1', 'card2', 'card3', 'card5']
    col += [f'V{i}' for i in range(95, 138)]
    col += [f'V{i}' for i in range(279, 321)]
    for c in col:
        # Assign the result back: ``data[c].fillna(..., inplace=True)`` acts on
        # a temporary Series and leaves ``data`` unchanged under pandas
        # Copy-on-Write.
        data[c] = data[c].fillna(data[c].mean())
def fill_na_mean_test(data):
    """Fill missing values in columns ``C1``-``C14`` with the column mean.

    Kept separate from ``fill_na_mean`` because the C columns have missing
    values in the test set only.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``C1`` .. ``C14``; modified in place.

    Returns
    -------
    None
    """
    col = [f'C{i}' for i in range(1, 15)]
    for c in col:
        # Assign back rather than using a chained in-place fillna, which does
        # not modify ``data`` under pandas Copy-on-Write.
        data[c] = data[c].fillna(data[c].mean())
def fill_na_mode(data):
    """Fill missing values in ``addr2`` and ``dist1`` with the column mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` is used.  A column with no non-missing values has no
    mode and is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``addr2`` and ``dist1``; modified in place.

    Returns
    -------
    None
    """
    col = ['addr2', 'dist1']
    for c in col:
        modes = data[c].mode()
        if modes.empty:
            # All values missing: nothing sensible to fill with.
            continue
        # Assign back rather than using a chained in-place fillna, which does
        # not modify ``data`` under pandas Copy-on-Write.
        data[c] = data[c].fillna(modes.iloc[0])
# Mean-fill both frames, mean-fill the C columns of the test frame only,
# then mode-fill both frames.
for _df in (train, test):
    fill_na_mean(_df)
fill_na_mean_test(test)
for _df in (train, test):
    fill_na_mode(_df)
After processing the data, we need to check whether any missing values remain.
# check missing values
def check(data):
    """Print the percentage of missing values per column.

    Columns whose missing ratio is exactly zero are omitted; the remaining
    ones are listed in descending order under the header ``MissingRatio``.
    """
    na_pct = data.isnull().sum() / len(data) * 100
    complete_cols = na_pct.index[na_pct == 0]
    na_pct = na_pct.drop(complete_cols).sort_values(ascending=False)
    print(pd.DataFrame({'MissingRatio': na_pct}))
# Report remaining missing values for train, then test.
for _df in (train, test):
    check(_df)
# there are no missing value now
Empty DataFrame
Columns: [MissingRatio]
Index: []
Empty DataFrame
Columns: [MissingRatio]
Index: []
We can see that missing values do not exist in our dataset now.
2.3 Log Transform the Data
For the column TransactionAmt, it is fine to do the log transform to make the distribution of the data closer to a normal distribution.
def data_transform(data):
    """Replace ``TransactionAmt`` with ``log(1 + TransactionAmt)``, in place.

    ``np.log1p`` is used instead of ``np.log(x + 1)`` because it stays
    accurate when ``x`` is very small.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a numeric ``TransactionAmt`` column; modified in
        place.

    Returns
    -------
    None
    """
    log_trans_col = ['TransactionAmt']
    for c in log_trans_col:
        data[c] = np.log1p(data[c])
# Log-transform the amount column of both frames, then report their shapes.
for _df in (train, test):
    data_transform(_df)

print(f'Train: {train.shape[0]} rows {train.shape[1]} columns.')
print(f'Test: {test.shape[0]} rows {test.shape[1]} columns.')
Train: 590540 rows 115 columns.
Test: 506691 rows 114 columns.
# you can store the file for safety in this part
# Atrain=train
# Atest=test
# Atrain.to_csv(PATH+'Atrain.csv',index=0)
# Atest.to_csv(PATH+'Atest.csv',index=0)
# train=pd.read_csv(PATH+'Atrain.csv')
# test=pd.read_csv(PATH+'Atest.csv')
3 Build the Models
In this post I only try a few simple models from sklearn and observe the output. Further improvements to the models will be discussed in later articles of this series.
import sklearn.metrics as metric
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import lightgbm as lgb
3.1 One-hot Encoding
For those categorical features, it is suitable to do one-hot encoding, which transforms each feature into a vector of indicator columns. (You can also do this step right after importing the data.)
# one-hot
def _one_hot_column(df, column, names):
    """Return a new frame with ``column`` replaced by indicator columns.

    The column is cast to ``str``; its distinct values, taken in sorted
    order, map one-to-one onto ``names``.  Raises ``ValueError`` when the
    number of distinct values differs from ``len(names)``.
    """
    labels = np.asarray(df[column].astype(str), dtype=str)
    categories, codes = np.unique(labels, return_inverse=True)
    if len(categories) != len(names):
        raise ValueError(
            f'{column!r} has {len(categories)} distinct values, '
            f'expected {len(names)}'
        )
    indicators = pd.DataFrame(
        np.eye(len(names))[codes.ravel()],
        columns=names,
        # Reuse the frame's own index so concat pairs rows correctly even
        # when the index is not 0..n-1.
        index=df.index,
    )
    return pd.concat([df.drop(columns=[column]), indicators], axis=1)


def One_hot(df):
    """One-hot encode ``ProductCD``, ``card4`` and ``DeviceType``.

    Each source column is removed and its indicator columns (float 0.0/1.0)
    are appended at the end of the frame, in the order ``Pro_1``..``Pro_5``,
    ``c4_1``..``c4_5``, ``de_1``..``de_3``.  The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three columns, with exactly 5, 5 and 3 distinct
        values respectively.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.

    Raises
    ------
    ValueError
        If a column's number of distinct values does not match the number of
        indicator columns reserved for it.
    """
    encodings = (
        ('ProductCD', ['Pro_1', 'Pro_2', 'Pro_3', 'Pro_4', 'Pro_5']),
        ('card4', ['c4_1', 'c4_2', 'c4_3', 'c4_4', 'c4_5']),
        ('DeviceType', ['de_1', 'de_2', 'de_3']),
    )
    for column, names in encodings:
        df = _one_hot_column(df, column, names)
    print(type(df))
    return df
# Encode each frame and rebind its name straight away, so the previous
# version of a frame can be released before the next one is built.
train=One_hot(train)
test=One_hot(test)
# Report the resulting shapes.
print(f'Train: {train.shape[0]} rows {train.shape[1]} columns.')
print(f'Test: {test.shape[0]} rows {test.shape[1]} columns.')
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Train: 590540 rows 125 columns.
Test: 506691 rows 124 columns.
3.2 Build the Models
In this step, split the dataset into training set and testing set as we usually do in building supervised learning models.
# build the model
# Target vector, and every column from 'TransactionAmt' onwards as features.
y = train['isFraud']
X = train.loc[:, 'TransactionAmt':]
# Hold out 25% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
After we finish the job, we can try several models:
3.2.1 Random Forest
# Random forest.  Timed with time.perf_counter(): time.clock() was removed in
# Python 3.8.
start = time.perf_counter()
rft = RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=100, verbose=0)
rft.fit(X_train, y_train)
predVal = rft.predict(X_test)
actVal = y_test.values
# Print the confusion matrix next to accuracy; accuracy alone is misleading on
# an imbalanced target.
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142452 132]
[ 4023 1028]]
0.9718562671453246
Running time: 86.99202809999952 Seconds
We can see that the accuracy is relatively high. However, don’t forget that we are dealing with a highly imbalanced dataset: if we simply predicted every transaction as Not Fraud, we would also get a high accuracy score. So it is necessary to check the confusion matrix. It shows that the model misses most of the fraudulent transactions (4023 of 5051), so this algorithm doesn’t perform well in this case.
3.2.2 K Nearest Neighbor
# K nearest neighbours (k=3).  Timed with time.perf_counter(): time.clock()
# was removed in Python 3.8.
start = time.perf_counter()
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
predVal = clf.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[141658 926]
[ 3511 1540]]
0.9699461509804586
Running time: 1646.5342081 Seconds
3.2.3 Logistic Regression
# Logistic regression with default settings.  Timed with time.perf_counter():
# time.clock() was removed in Python 3.8.
start = time.perf_counter()
log = LogisticRegression()
log.fit(X_train, y_train)
predVal = log.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142527 57]
[ 4990 61]]
0.9658143394181596
Running time: 101.20429169999989 Seconds
3.2.4 SVM Classifier
# Linear SVM with default settings.  Timed with time.perf_counter():
# time.clock() was removed in Python 3.8.
start = time.perf_counter()
svc = LinearSVC()
svc.fit(X_train, y_train)
predVal = svc.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142418 166]
[ 4792 259]]
0.9664171774985606
Running time: 226.08227179999994 Seconds
3.2.5 Decision Tree
# Single decision tree, depth-limited to 8.  Timed with time.perf_counter():
# time.clock() was removed in Python 3.8.
start = time.perf_counter()
tree = DecisionTreeClassifier(max_depth=8, random_state=0)
tree.fit(X_train, y_train)
predVal = tree.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142229 355]
[ 3522 1529]]
0.9737392894638806
Running time: 6.9810839999991 Seconds
3.2.6 Gradient Boosting Tree
# Gradient boosted trees (sklearn defaults).  Timed with time.perf_counter():
# time.clock() was removed in Python 3.8.
start = time.perf_counter()
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
predVal = gbrt.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142342 242]
[ 3650 1401]]
0.9736376875402174
Running time: 290.94794999999976 Seconds
3.2.7 MLP
# Multi-layer perceptron trained with the L-BFGS solver.  Timed with
# time.perf_counter(): time.clock() was removed in Python 3.8.
start = time.perf_counter()
mlp = MLPClassifier(solver='lbfgs', random_state=0)
mlp.fit(X_train, y_train)
predVal = mlp.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[140823 1761]
[ 4667 384]]
0.956460188979578
Running time: 373.1303016999991 Seconds
3.2.8 XGBoost
# XGBoost classifier.  Timed with time.perf_counter(): time.clock() was
# removed in Python 3.8.
start = time.perf_counter()
XGB = xgb.XGBClassifier(
    n_estimators=500,
    n_jobs=4,
    max_depth=9,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
)
XGB.fit(X_train, y_train)
predVal = XGB.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142354 230]
[ 2339 2712]]
0.9825989772073018
Running time: 1444.0721217999999 Seconds
3.2.9 LightGBM
# LightGBM classifier.  Timed with time.perf_counter(): time.clock() was
# removed in Python 3.8.
start = time.perf_counter()
GBM = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    verbose=0,
    learning_rate=0.05,
    max_depth=8,
    n_estimators=1500,
    num_leaves=256,
    max_bin=255,
    lambda_l1=0.6,
    lambda_l2=0,
)
GBM.fit(X_train, y_train)
predVal = GBM.predict(X_test)
actVal = y_test.values
print(metric.confusion_matrix(actVal, predVal))
print(metric.accuracy_score(actVal, predVal))
end = time.perf_counter()
print('Running time: %s Seconds'%(end-start))
[[142354 230]
[ 2038 3013]]
0.9846377891421411
Running time: 280.8494519000001 Seconds
3.3 Conclusion of the Algorithm
We can see that among these algorithms, XGBoost and LightGBM perform best; we have discussed these two algorithms in other posts. However, to improve the performance further, besides tuning the parameters, we need to do substantial feature engineering, since this problem comes from a real commercial environment.
4 Make Submissions
# make submissions
# y_preds=rft.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_rf.csv')
# y_preds=clf.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_KNeighbor.csv')
# y_preds=log.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_log.csv')
# y_preds=svc.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
#submission.to_csv(PATH+'submission11_svc.csv')
# y_preds=tree.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_tree.csv')
# y_preds=gbrt.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_gbrt.csv')
# y_preds=mlp.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_mlp.csv')
# y_preds=XGB.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_xgb.csv')
# y_preds=GBM.predict(Atest)
# submission=pd.read_csv(PATH+'sample_submission.csv',index_col='TransactionID')
# submission['isFraud'] = y_preds
# submission.to_csv(PATH+'submission11_gbm.csv')
上一篇: Java多线程的内存模型