Building XGBoost and LightGBM models (on a financial dataset)
Import the packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
Load the data
data=pd.read_csv('./data.csv',index_col=0,encoding='gbk')
Data understanding
#Extract the label column y on its own, and keep the remaining 88 columns as X
y=data['status']
X=data.drop('status',axis=1)
#Shape of X and the class distribution of y
print('X.shape:',X.shape)
print('Distribution of y:',y.value_counts())
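The class counts printed above matter for everything that follows: if status is heavily imbalanced, plain accuracy is misleading, which is why F1 and ROC AUC are used for evaluation later. Two optional extra checks (a minimal sketch, assuming status is coded 0/1) could look like this:
#Optional sketch: positive rate and the columns with the most missing values
print('positive rate:',y.mean())
print(X.isnull().sum().sort_values(ascending=False).head(10))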
Data preparation
#First drop some obviously useless features: id_name, custid, trade_no, bank_card_no
X.drop(['id_name','custid','trade_no','bank_card_no'],axis=1,inplace=True)
print(X.shape)
#Select the numeric features
X_num=X.select_dtypes('number').copy()
print(X_num.shape)
#X_num.mean() is a Series of per-column means
print(type(X_num.mean()))
#Fill the missing values with the column means
X_num.fillna(X_num.mean(),inplace=True)
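Mean imputation with fillna works here because every column of X_num is numeric. An equivalent, pipeline-friendly alternative is scikit-learn's SimpleImputer; the lines below are an optional sketch, not part of the original workflow:
from sklearn.impute import SimpleImputer
#Optional alternative: mean imputation via scikit-learn, kept in a separate variable
imputer = SimpleImputer(strategy='mean')
X_num_alt = pd.DataFrame(imputer.fit_transform(X_num),columns=X_num.columns,index=X_num.index)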
#Inspect the non-numeric variables
X_str=X.select_dtypes(exclude='number').copy()
X_str.describe()
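describe() on the non-numeric frame shows how many distinct values each string column takes, which is what justifies one-hot encoding only reg_preference_for_trad and discarding the rest. An optional quick check of the same thing:
#Optional: distinct-value counts of the non-numeric columns
print(X_str.nunique().sort_values())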
#Encode reg_preference_for_trad as dummy variables; the other three string columns are dropped (they are simply not carried into the final feature matrix)
X_str['reg_preference_for_trad'] = X_str['reg_preference_for_trad'].fillna(X_str['reg_preference_for_trad'].mode()[0])
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])
X_str_dummy.head()
#Concatenate the numeric variables with the dummy-encoded nominal (string) variable
X_cl = pd.concat([X_num,X_str_dummy],axis=1,sort=False)
#X_cl.shape
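Before modelling it is worth confirming that the merged matrix has no remaining missing values and what dtypes it contains; a small optional check:
#Optional sanity check on the final feature matrix
print('remaining NaNs:',X_cl.isnull().sum().sum())
print(X_cl.dtypes.value_counts())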
Modelling
#Split into training and test sets at a 70/30 ratio
random_state = 1118
X_train,X_test,y_train,y_test = train_test_split(X_cl,y,test_size=0.3,random_state=random_state)
print(X_train.shape)
print(X_test.shape)
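Because status is imbalanced, a stratified split keeps the positive rate identical in the training and test sets. The sketch below is an optional variant of the split above, kept in separate variables because it is not what produced the numbers reported later:
#Optional: stratified 70/30 split preserving the class ratio
X_train_s,X_test_s,y_train_s,y_test_s = train_test_split(X_cl,y,test_size=0.3,random_state=random_state,stratify=y)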
"""
#Build the XGBoost model
xgboost_model=XGBClassifier()
xgboost_model.fit(X_train,y_train)
#Apply the fitted XGBoost model to the training and test sets to get predictions
y_train_pred = xgboost_model.predict(X_train)
y_test_pred = xgboost_model.predict(X_test)
"""
#Build the LightGBM model
lgbm_model=LGBMClassifier()
lgbm_model.fit(X_train,y_train)
#Apply the fitted LightGBM model to the training and test sets to get predictions
y_train_pred = lgbm_model.predict(X_train)
y_test_pred = lgbm_model.predict(X_test)
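Note that predict returns hard 0/1 labels, so the ROC AUC values reported below are computed from thresholded predictions. A common refinement (not used for the reported numbers) is to score with predicted probabilities from predict_proba, which usually gives a more informative AUC:
#Optional: probability-based AUC; column 1 is the predicted probability of status == 1
y_train_proba = lgbm_model.predict_proba(X_train)[:, 1]
y_test_proba = lgbm_model.predict_proba(X_test)[:, 1]
print('Training set AUC (proba): {:.4f}'.format(roc_auc_score(y_train, y_train_proba)))
print('Test set AUC (proba): {:.4f}'.format(roc_auc_score(y_test, y_test_proba)))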
Model evaluation
"""
## XGBoost model evaluation
print('F1:')
print('Training set: {:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(f1_score(y_test, y_test_pred)))
print('ROC AUC:')
print('Training set: {:.4f}'.format(roc_auc_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_pred)))
"""
F1:
Training set: 0.6258
Test set: 0.4472
ROC AUC:
Training set: 0.7335
Test set: 0.6386
## LightGBM model evaluation
print('F1:')
print('Training set: {:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(f1_score(y_test, y_test_pred)))
print('ROC AUC:')
print('Training set: {:.4f}'.format(roc_auc_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_pred)))
F1:
Training set: 0.9945
Test set: 0.4544
ROC AUC:
Training set: 0.9949
Test set: 0.6396
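The gap between the LightGBM training scores (F1 0.9945, AUC 0.9949) and test scores (F1 0.4544, AUC 0.6396) suggests the default model overfits this dataset. One common response is to constrain model capacity; the hyperparameter values below are illustrative assumptions, not tuned results:
#Illustrative sketch only: a more constrained LightGBM to reduce overfitting (values would normally be tuned, e.g. by cross-validation)
lgbm_small = LGBMClassifier(n_estimators=200,num_leaves=15,max_depth=5,learning_rate=0.05,colsample_bytree=0.8,random_state=random_state)
lgbm_small.fit(X_train,y_train)
print('Test set AUC: {:.4f}'.format(roc_auc_score(y_test, lgbm_small.predict_proba(X_test)[:, 1])))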
For installing xgboost and lightgbm, refer to the three links below (if installation fails, it may be because the downloaded package version does not match your Anaconda/Python version):
https://blog.csdn.net/zz860890410/article/details/78682041
https://blog.csdn.net/famirtse/article/details/80379545
https://www.lfd.uci.edu/~gohlke/pythonlibs/#xgboost