欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

未命名

程序员文章站 2022-07-14 11:02:41
...

1. 赛前准备知识

读取数据

import pandas as pd
import matplotlib.pyplot as plt

train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
print('Train data shape:',train.shape)
print('testA data shape:',testA.shape)
Train data shape: (800000, 47)
testA data shape: (200000, 48)
train.head()
id loanAmnt term interestRate installment grade subGrade employmentTitle employmentLength homeOwnership ... n5 n6 n7 n8 n9 n10 n11 n12 n13 n14
0 0 35000.0 5 19.52 917.97 E E2 320.0 2 years 2 ... 9.0 8.0 4.0 12.0 2.0 7.0 0.0 0.0 0.0 2.0
1 1 18000.0 5 18.49 461.90 D D2 219843.0 5 years 0 ... NaN NaN NaN NaN NaN 13.0 NaN NaN NaN NaN
2 2 12000.0 5 16.99 298.17 D D3 31698.0 8 years 0 ... 0.0 21.0 4.0 5.0 3.0 11.0 0.0 0.0 0.0 4.0
3 3 11000.0 3 7.26 340.96 A A4 46854.0 10+ years 1 ... 16.0 4.0 7.0 21.0 6.0 9.0 0.0 0.0 0.0 1.0
4 4 3000.0 3 12.99 101.07 C C2 54.0 NaN 1 ... 4.0 9.0 10.0 15.0 7.0 12.0 0.0 0.0 0.0 4.0

5 rows × 47 columns

分类指标计算

# confusion_matrix
#       pre
#       P       N
# acu P 1(TP)   1(FN)
#     N 1(FP)   1(TN)
import numpy as np
from sklearn.metrics import confusion_matrix # 混淆矩阵
y_pred = [0,1,0,1]
y_true = [0,1,1,0]
print('混淆矩阵:\n',confusion_matrix(y_true,y_pred))
混淆矩阵:
 [[1 1]
 [1 1]]
# accuracy
# 预测正确的样本数/总样本数
from sklearn.metrics import accuracy_score
y_pred = [0,1,0,1]
y_true = [0,1,1,0]
print('ACC:',accuracy_score(y_true,y_pred))
ACC: 0.5
#precision,recall,F1-score  针对正样本出发
#precision = TP/(TP+FP)  (实际正)预测为正的/预测正的总样本数
# recall = TP/(TP+FN)   (实际正)预测为正的/实际为正的总样本数
# F1-score = 2 * precision * recall / (precision + recall),即 1/F1 = 0.5 * (1/precision + 1/recall)
from sklearn import metrics
print('Presion',metrics.precision_score(y_true,y_pred))
print('Recall',metrics.recall_score(y_true,y_pred))
print('F1-score',metrics.f1_score(y_true,y_pred))
Presion 0.5
Recall 0.5
F1-score 0.5
#P-R曲线 描述precision和recall变化的曲线
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
# Toy example: ground-truth labels and hard 0/1 predictions (used here in
# place of continuous scores).
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,0,1]
precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
# A P-R curve conventionally has recall on the x-axis and precision on the
# y-axis; label the axes first so the plot call stays the last expression.
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision)
[<matplotlib.lines.Line2D at 0x11d850b00>]

未命名

#ROC曲线
#纵轴 真阳率 (recall) TPR  = TP/(TP+FN)  (实际正)预测正/实际为正的所有样本和
#横轴 假阳率  FPR = FP/(TN+FP)  (实际负)预测正/实际为负的所有样本和
from sklearn.metrics import roc_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,0,1]
FPR,TPR,thresholds = roc_curve(y_true,y_pred)
plt.title('ROC')
plt.plot(FPR,TPR,'b')
plt.plot([0,1],[0,1],'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')
Text(0.5, 0, 'FPR')

未命名

## AUC
#什么是AUC值? ROC曲线下的面积 通常>0.5
import numpy as np
from sklearn.metrics import roc_auc_score
y_true = np.array([0,0,1,1])
y_scores = np.array([0.1,0.4,0.35,0.8])    #为啥这里是y_scores
print('AUC score:',roc_auc_score(y_true,y_scores))
AUC score: 0.75
# KS值 在实际操作时往往使用ROC曲线求KS值
#什么是KS值? KS = max|TPR - FPR|,即ROC曲线上真阳率与假阳率之差的最大值,用来衡量模型对正负样本的区分能力
#<0.2  差
#[0.2,0.3] 勉强
#[0.3,0.5] 好
#[0.5,0.75] 很好
#>0.75 异常
from sklearn.metrics import roc_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,1,1]
FPR,TPR,thresholds = roc_curve(y_true,y_pred)
KS = abs(FPR-TPR).max()
print('KS值:',KS)
KS值: 0.5238095238095237
# 评分卡 不是标准评分卡
# 刻画用户的信用评分
# 评分卡是金融风控中常用的一种对于用户信用进行刻画的手段
def Score(prob, P0=600, PDO=20, badrate=None, goodrate=None):
    """Convert a predicted probability into a (non-standard) scorecard score.

    The score is ``A - B * ln(prob / (1 - prob))`` where
    ``B = PDO / ln(2)`` and ``A = P0 + B * ln(2 * badrate / goodrate)``.
    Because of the choice of ``B``, doubling the odds ``prob / (1 - prob)``
    lowers the score by exactly ``PDO`` points.

    Args:
        prob: Predicted probability, strictly between 0 and 1. Presumably a
            scalar or a NumPy array (the arithmetic is element-wise) --
            confirm against the caller.
        P0: Base score used in the offset ``A``.
        PDO: Number of points by which the score drops when the odds double.
        badrate: Numerator of the reference odds ``badrate / goodrate``.
        goodrate: Denominator of the reference odds; must be non-zero.

    Returns:
        The score(s) corresponding to ``prob``.

    Raises:
        TypeError: If ``badrate`` or ``goodrate`` is not supplied.
    """
    import numpy as np

    # Both rates default to None only as placeholders; fail with a clear
    # message instead of the opaque "NoneType / NoneType" TypeError.
    if badrate is None or goodrate is None:
        raise TypeError("Score() requires both 'badrate' and 'goodrate'")

    theta0 = badrate / goodrate
    B = PDO / np.log(2)
    A = P0 + B * np.log(2 * theta0)
    return A - B * np.log(prob / (1 - prob))

2. 数据分析(EDA)

  1. 了解数据基本情况(缺失值、异常值)
  2. 了解变量之间的关系、变量和label之间的关系
  3. 为特征工程作准备
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')

2.1 读取文件

几个比较重要的参数:

  1. delimiter 读取文件分隔符
  2. nrows 读取文件前几行
  3. chunksize 控制每次迭代数据的大小
    注意:检查路径可用os.getcwd()函数
data_train = pd.read_csv('train.csv')
data_test_a = pd.read_csv('testA.csv')
data_test_a.shape
(200000, 48)
data_train.shape
(800000, 47)
data_train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')
data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id                    800000 non-null int64
loanAmnt              800000 non-null float64
term                  800000 non-null int64
interestRate          800000 non-null float64
installment           800000 non-null float64
grade                 800000 non-null object
subGrade              800000 non-null object
employmentTitle       799999 non-null float64
employmentLength      753201 non-null object
homeOwnership         800000 non-null int64
annualIncome          800000 non-null float64
verificationStatus    800000 non-null int64
issueDate             800000 non-null object
isDefault             800000 non-null int64
purpose               800000 non-null int64
postCode              799999 non-null float64
regionCode            800000 non-null int64
dti                   799761 non-null float64
delinquency_2years    800000 non-null float64
ficoRangeLow          800000 non-null float64
ficoRangeHigh         800000 non-null float64
openAcc               800000 non-null float64
pubRec                800000 non-null float64
pubRecBankruptcies    799595 non-null float64
revolBal              800000 non-null float64
revolUtil             799469 non-null float64
totalAcc              800000 non-null float64
initialListStatus     800000 non-null int64
applicationType       800000 non-null int64
earliesCreditLine     800000 non-null object
title                 799999 non-null float64
policyCode            800000 non-null float64
n0                    759730 non-null float64
n1                    759730 non-null float64
n2                    759730 non-null float64
n2.1                  759730 non-null float64
n4                    766761 non-null float64
n5                    759730 non-null float64
n6                    759730 non-null float64
n7                    759730 non-null float64
n8                    759729 non-null float64
n9                    759730 non-null float64
n10                   766761 non-null float64
n11                   730248 non-null float64
n12                   759730 non-null float64
n13                   759730 non-null float64
n14                   759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB

2.2 查看缺失值,唯一值等

print(f'There are {data_train.isnull().any().sum()} columns in train dataset with missing vlues.')

#isnull()返回data_train的每个元素是否为空;any()按列判断,只要该列有一个为空就为True;最后sum()统计含有缺失值的列数

There are 22 columns in train dataset with missing vlues.

得到含有缺失值共有22列,进一步查看缺失值大于50%的列

# Fraction of missing values per column, keyed by column name.
null_counts = data_train.isnull().sum()
have_null_fea_dict = (null_counts / len(data_train)).to_dict()
# Keep only the columns where more than half of the rows are missing.
fea_null_moreThan_half = {
    column: ratio
    for column, ratio in have_null_fea_dict.items()
    if ratio > 0.5
}
fea_null_moreThan_half
{}

没有缺失值超过50%的列
查看具体的缺失率

# Visualise missing data: per-column share of NaN values, restricted to the
# columns that have any, sorted ascending and drawn as a bar chart.
null_ratio = data_train.isnull().sum() / len(data_train)
missing = null_ratio[null_ratio > 0].sort_values()
missing.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x12a557ac8>

未命名