未命名
程序员文章站
2022-07-14 11:02:41
...
1. 赛前准备知识
读取数据
import pandas as pd
import matplotlib.pyplot as plt
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
print('Train data shape:',train.shape)
print('testA data shape:',testA.shape)
Train data shape: (800000, 47)
testA data shape: (200000, 48)
train.head()
id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 |
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN |
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 |
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 |
5 rows × 47 columns
分类指标计算
# confusion_matrix
# pre
# P N
# acu P 1(TP) 1(FN)
# N 1(FP) 1(TN)
import numpy as np
from sklearn.metrics import confusion_matrix # 混淆矩阵
y_pred = [0,1,0,1]
y_true = [0,1,1,0]
print('混淆矩阵:\n',confusion_matrix(y_true,y_pred))
混淆矩阵:
[[1 1]
[1 1]]
#accuracy
# 预测正确的样本数/总样本数
from sklearn.metrics import accuracy_score
y_pred = [0,1,0,1]
y_true = [0,1,1,0]
print('ACC:',accuracy_score(y_true,y_pred))
ACC: 0.5
#precision,recall,F1-score 针对正样本出发
#precision = TP/(TP+FP) (实际正)预测为正的/预测正的总样本数
# recall = TP/(TP+FN) (实际正)预测为正的/实际为正的总样本数
# F1-score = 2 * precision * recall / (precision + recall),即 1/F1 = 0.5 * (1/precision + 1/recall)
from sklearn import metrics
print('Presion',metrics.precision_score(y_true,y_pred))
print('Recall',metrics.recall_score(y_true,y_pred))
print('F1-score',metrics.f1_score(y_true,y_pred))
Presion 0.5
Recall 0.5
F1-score 0.5
#P-R曲线 描述precision和recall变化的曲线
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Toy example: hard 0/1 predictions and the matching ground-truth labels.
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,0,1]
# The result is unpacked in the order (precision, recall, thresholds).
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
# P-R curve convention: recall on the x-axis, precision on the y-axis.
# Labels are set first so the plot call stays the last expression of the cell.
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision)
[<matplotlib.lines.Line2D at 0x11d850b00>]
#ROC曲线
#纵轴 真阳率 (recall) TPR = TP/(TP+FN) (实际正)预测正/实际为正的所有样本和
#横轴 假阳率 FPR = FP/(TN+FP)
from sklearn.metrics import roc_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,0,1]
FPR,TPR,thresholds = roc_curve(y_true,y_pred)
plt.title('ROC')
plt.plot(FPR,TPR,'b')
plt.plot([0,1],[0,1],'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')
Text(0.5, 0, 'FPR')
## AUC
#什么是AUC值? ROC曲线下的面积 通常>0.5
import numpy as np
from sklearn.metrics import roc_auc_score
y_true = np.array([0,0,1,1])
y_scores = np.array([0.1,0.4,0.35,0.8]) #为啥这里是y_scores
print('AUC score:',roc_auc_score(y_true,y_scores))
AUC score: 0.75
# KS值 在实际操作时往往使用ROC曲线求KS值
#什么是KS值?
#<0.2 差
#[0.2,0.3] 勉强
#[0.3,0.5] 好
#>0.75 异常
from sklearn.metrics import roc_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,1,1]
FPR,TPR,thresholds = roc_curve(y_true,y_pred)
KS = abs(FPR-TPR).max()
print('KS值:',KS)
KS值: 0.5238095238095237
# 评分卡 不是标准评分卡
# 刻画用户的信用评分
# 评分卡是金融风控中常用的一种对于用户信用进行刻画的手段
def Score(prob, P0=600, PDO=20, badrate=None, goodrate=None):
    """Map a predicted probability to a scorecard-style credit score.

    Computes ``A - B * ln(prob / (1 - prob))`` where
    ``B = PDO / ln(2)`` and ``A = P0 + B * ln(2 * badrate / goodrate)``.

    Args:
        prob: Probability (scalar or NumPy array) strictly between 0 and 1;
            presumably the predicted default probability -- confirm with
            the caller.
        P0: Base score used to build the offset ``A``.
        PDO: Amount the score drops each time the odds
            ``prob / (1 - prob)`` double.
        badrate: Bad-sample rate; required even though it defaults to None.
        goodrate: Good-sample rate; required and must be non-zero.

    Returns:
        The score for ``prob`` (element-wise when ``prob`` is an array).

    Raises:
        TypeError: If ``badrate`` or ``goodrate`` is not supplied.
    """
    import numpy as np

    # Fail with a clear message instead of an opaque ``None / None`` error.
    if badrate is None or goodrate is None:
        raise TypeError("Score() requires both 'badrate' and 'goodrate'")

    theta0 = badrate / goodrate
    B = PDO / np.log(2)
    # NOTE(review): the offset uses ln(2 * theta0); the surrounding notes say
    # this is not the standard scorecard form, so it is kept unchanged.
    A = P0 + B * np.log(2 * theta0)
    score = A - B * np.log(prob / (1 - prob))
    return score
2. 数据分析(EDA)
- 了解数据基本情况(缺失值、异常值)
- 了解变量之间的关系、变量和label之间的关系
- 为特征工程作准备
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')
2.1 读取文件
几个比较重要的参数:
- delimiter 读取文件分隔符
- nrows 读取文件前几行
- chunksize 控制每次迭代数据的大小
注意:检查路径可用os.getcwd()函数
data_train = pd.read_csv('train.csv')
data_test_a = pd.read_csv('testA.csv')
data_test_a.shape
(200000, 48)
data_train.shape
(800000, 47)
data_train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')
data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id 800000 non-null int64
loanAmnt 800000 non-null float64
term 800000 non-null int64
interestRate 800000 non-null float64
installment 800000 non-null float64
grade 800000 non-null object
subGrade 800000 non-null object
employmentTitle 799999 non-null float64
employmentLength 753201 non-null object
homeOwnership 800000 non-null int64
annualIncome 800000 non-null float64
verificationStatus 800000 non-null int64
issueDate 800000 non-null object
isDefault 800000 non-null int64
purpose 800000 non-null int64
postCode 799999 non-null float64
regionCode 800000 non-null int64
dti 799761 non-null float64
delinquency_2years 800000 non-null float64
ficoRangeLow 800000 non-null float64
ficoRangeHigh 800000 non-null float64
openAcc 800000 non-null float64
pubRec 800000 non-null float64
pubRecBankruptcies 799595 non-null float64
revolBal 800000 non-null float64
revolUtil 799469 non-null float64
totalAcc 800000 non-null float64
initialListStatus 800000 non-null int64
applicationType 800000 non-null int64
earliesCreditLine 800000 non-null object
title 799999 non-null float64
policyCode 800000 non-null float64
n0 759730 non-null float64
n1 759730 non-null float64
n2 759730 non-null float64
n2.1 759730 non-null float64
n4 766761 non-null float64
n5 759730 non-null float64
n6 759730 non-null float64
n7 759730 non-null float64
n8 759729 non-null float64
n9 759730 non-null float64
n10 766761 non-null float64
n11 730248 non-null float64
n12 759730 non-null float64
n13 759730 non-null float64
n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
2.2 查看缺失值,唯一值等
print(f'There are {data_train.isnull().any().sum()} columns in train dataset with missing vlues.')
#isnull()返回data_train的每个字段是否为空 any()列上只要有一个为空 就为True ,再整个求和
There are 22 columns in train dataset with missing vlues.
得到含有缺失值共有22列,进一步查看缺失值大于50%的列
# Fraction of missing values per column, keyed by column name.
have_null_fea_dict = (data_train.isnull().sum() / len(data_train)).to_dict()
# Keep only the columns whose missing fraction exceeds one half.
fea_null_moreThan_half = {
    column: ratio
    for column, ratio in have_null_fea_dict.items()
    if ratio > 0.5
}
fea_null_moreThan_half
{}
没有缺失值超过50%的列
查看具体的缺失率
# nan 可视化
missing = data_train.isnull().sum()/len(data_train)
missing = missing[missing > 0]
missing.sort_values(inplace = True)
missing.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x12a557ac8>
下一篇: Docker之容器命令