1. Pre-competition Basics
Reading the data
import pandas as pd
import matplotlib.pyplot as plt
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
print('Train data shape:',train.shape)
print('testA data shape:',testA.shape)
Train data shape: (800000, 47)
testA data shape: (200000, 48)
train.head()
 | id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n5 | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 9.0 | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 0.0 | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 16.0 | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 4.0 | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0
5 rows × 47 columns
Computing classification metrics
# Confusion matrix layout (sklearn convention, labels sorted as [0, 1]):
#                 predicted N(0)   predicted P(1)
# actual N(0)          TN               FP
# actual P(1)          FN               TP
import numpy as np
from sklearn.metrics import confusion_matrix  # confusion matrix
y_pred = [0,1,0,1]
y_true = [0,1,1,0]
print('Confusion matrix:\n', confusion_matrix(y_true, y_pred))
Confusion matrix:
[[1 1]
[1 1]]
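For reference, sklearn orders the cells as [[TN, FP], [FN, TP]]; ravel() unpacks the four counts:
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)  # 1 1 1 1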
# accuracy = (number of correctly predicted samples) / (total number of samples)
from sklearn.metrics import accuracy_score
y_pred = [0,1,0,1]
y_true = [0,1,1,0]
print('ACC:',accuracy_score(y_true,y_pred))
ACC: 0.5
# precision, recall, F1-score: all defined from the viewpoint of the positive class
# precision = TP / (TP + FP)   fraction of predicted positives that are actually positive
# recall    = TP / (TP + FN)   fraction of actual positives that are predicted positive
# F1-score: harmonic mean of the two, 1/F1 = 0.5 * (1/precision + 1/recall),
#           i.e. F1 = 2 * precision * recall / (precision + recall)
from sklearn import metrics
print('Precision', metrics.precision_score(y_true, y_pred))
print('Recall', metrics.recall_score(y_true, y_pred))
print('F1-score', metrics.f1_score(y_true, y_pred))
Precision 0.5
Recall 0.5
F1-score 0.5
# P-R curve: traces how precision and recall change as the decision threshold varies
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,0,1]
precision,recall,thresholds = precision_recall_curve(y_true,y_pred)
plt.plot(recall, precision)  # convention: recall on the x-axis, precision on the y-axis
[<matplotlib.lines.Line2D at 0x11d850b00>]
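Note that precision_recall_curve expects continuous scores; with hard 0/1 predictions the curve collapses to just a few points. A minimal sketch with illustrative scores:
y_scores = [0.1, 0.8, 0.7, 0.2, 0.9, 0.6, 0.3, 0.85, 0.4, 0.95]
precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
plt.plot(recall, precision)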
# ROC curve
# y-axis: true positive rate (same as recall)  TPR = TP / (TP + FN)  predicted positive among all actual positives
# x-axis: false positive rate                  FPR = FP / (FP + TN)  predicted positive among all actual negatives
from sklearn.metrics import roc_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,0,1]
FPR,TPR,thresholds = roc_curve(y_true,y_pred)
plt.title('ROC')
plt.plot(FPR,TPR,'b')
plt.plot([0,1],[0,1],'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')
Text(0.5, 0, 'FPR')
AUC
# What is AUC? The area under the ROC curve; a model better than random scores > 0.5
import numpy as np
from sklearn.metrics import roc_auc_score
y_true = np.array([0,0,1,1])
y_scores = np.array([0.1,0.4,0.35,0.8])  # roc_auc_score expects continuous scores/probabilities, not hard labels
print('AUC score:',roc_auc_score(y_true,y_scores))
AUC score: 0.75
# KS statistic: in practice usually computed from the ROC curve, KS = max|TPR - FPR|
# Rules of thumb for KS:
# < 0.2        poor
# [0.2, 0.3]   marginal
# [0.3, 0.5]   good
# > 0.75       abnormally high (worth double-checking)
from sklearn.metrics import roc_curve
y_pred = [0,1,1,0,1,1,0,1,1,1]
y_true = [0,1,1,0,1,0,1,1,1,1]
FPR,TPR,thresholds = roc_curve(y_true,y_pred)
KS = abs(FPR-TPR).max()
print('KS:', KS)
KS: 0.5238095238095237
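As with the P-R curve, KS is normally computed from model scores rather than hard labels; a minimal sketch with illustrative scores:
y_scores = [0.1, 0.8, 0.7, 0.2, 0.9, 0.6, 0.3, 0.85, 0.75, 0.95]
FPR, TPR, thresholds = roc_curve(y_true, y_scores)
print('KS:', abs(FPR - TPR).max())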
# Scorecard (not a standard scorecard)
# A scorecard maps a predicted default probability to a credit score;
# it is a common tool in financial risk control for characterizing user credit.
def Score(prob, P0=600, PDO=20, badrate=None, goodrate=None):
    # P0: base score; PDO: points to double the odds
    theta0 = badrate / goodrate                # base bad-to-good odds
    B = PDO / np.log(2)
    A = P0 + B * np.log(2 * theta0)
    score = A - B * np.log(prob / (1 - prob))  # higher default probability -> lower score
    return score
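A minimal usage sketch; the badrate/goodrate values are illustrative, not estimated from the data:
probs = np.array([0.02, 0.10, 0.40])
print(Score(probs, P0=600, PDO=20, badrate=0.05, goodrate=0.95))  # scores decrease as default probability rises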
2. Exploratory Data Analysis (EDA)
- Understand the basics of the data (missing values, outliers)
- Understand the relationships among variables and between each variable and the label
- Prepare for feature engineering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
warnings.filterwarnings('ignore')
2.1 Reading the files
A few useful pd.read_csv parameters:
- delimiter: the field separator
- nrows: read only the first n rows
- chunksize: iterate over the file in chunks of this many rows
Note: use os.getcwd() to check the working directory.
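A quick sketch of nrows and chunksize (both standard pd.read_csv options; the chunk size here is arbitrary):
sample = pd.read_csv('train.csv', nrows=5)           # read only the first 5 rows
for chunk in pd.read_csv('train.csv', chunksize=100000):
    print(chunk.shape)                               # (100000, 47) for each full chunk
    break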
data_train = pd.read_csv('train.csv')
data_test_a = pd.read_csv('testA.csv')
data_test_a.shape
(200000, 48)
data_train.shape
(800000, 47)
data_train.columns
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
'policyCode', 'n0', 'n1', 'n2', 'n2.1', 'n4', 'n5', 'n6', 'n7', 'n8',
'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
dtype='object')
data_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
id 800000 non-null int64
loanAmnt 800000 non-null float64
term 800000 non-null int64
interestRate 800000 non-null float64
installment 800000 non-null float64
grade 800000 non-null object
subGrade 800000 non-null object
employmentTitle 799999 non-null float64
employmentLength 753201 non-null object
homeOwnership 800000 non-null int64
annualIncome 800000 non-null float64
verificationStatus 800000 non-null int64
issueDate 800000 non-null object
isDefault 800000 non-null int64
purpose 800000 non-null int64
postCode 799999 non-null float64
regionCode 800000 non-null int64
dti 799761 non-null float64
delinquency_2years 800000 non-null float64
ficoRangeLow 800000 non-null float64
ficoRangeHigh 800000 non-null float64
openAcc 800000 non-null float64
pubRec 800000 non-null float64
pubRecBankruptcies 799595 non-null float64
revolBal 800000 non-null float64
revolUtil 799469 non-null float64
totalAcc 800000 non-null float64
initialListStatus 800000 non-null int64
applicationType 800000 non-null int64
earliesCreditLine 800000 non-null object
title 799999 non-null float64
policyCode 800000 non-null float64
n0 759730 non-null float64
n1 759730 non-null float64
n2 759730 non-null float64
n2.1 759730 non-null float64
n4 766761 non-null float64
n5 759730 non-null float64
n6 759730 non-null float64
n7 759730 non-null float64
n8 759729 non-null float64
n9 759730 non-null float64
n10 766761 non-null float64
n11 730248 non-null float64
n12 759730 non-null float64
n13 759730 non-null float64
n14 759730 non-null float64
dtypes: float64(33), int64(9), object(5)
memory usage: 286.9+ MB
2.2 Missing values and unique values
print(f'There are {data_train.isnull().any().sum()} columns in train dataset with missing values.')
# isnull() marks each cell as missing or not; any() is True for a column with at least one missing cell; sum() counts those columns
There are 22 columns in train dataset with missing values.
22 columns contain missing values; next, check whether any column is more than 50% missing.
have_null_fea_dict = (data_train.isnull().sum() / len(data_train)).to_dict()
fea_null_moreThan_half = {}
for key, value in have_null_fea_dict.items():
    # print(key, value)
    if value > 0.5:
        fea_null_moreThan_half[key] = value
fea_null_moreThan_half
{}
No column has more than 50% missing values.
Inspect the exact missing rates:
# visualize the NaN rate of each column
missing = data_train.isnull().sum() / len(data_train)  # returns a Series of per-column missing rates
missing = missing[missing > 0]
missing.sort_values(inplace = True)
missing.plot.bar()
<matplotlib.axes._subplots.AxesSubplot at 0x12a557ac8>
The missing rates above are fairly low, so here a tree model (LightGBM) is used and allowed to handle the missing values itself; columns with heavy missingness could be dropped instead, as sketched below.
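A minimal sketch of the drop alternative, reusing the 50% threshold computed above (the resulting set is empty here, so nothing is actually dropped):
cols_to_drop = list(fea_null_moreThan_half.keys())  # columns more than 50% missing
data_train = data_train.drop(columns=cols_to_drop)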
Check for columns with only a single unique value
one_value_fea = [col for col in data_train.columns if data_train[col].nunique() <= 1]
one_value_fea
['policyCode']
one_value_test = [col for col in data_test_a.columns if data_test_a[col].nunique() <=1]
one_value_test
['policyCode']
print(f'There are {len(one_value_fea)} columns in train dataset with one unique value')
print(f'there are {len(one_value_test)} columns in testa dataset with one unique value')
There are 1 columns in train dataset with one unique value
there are 1 columns in testa dataset with one unique value
Of the 47 columns, 22 contain missing values, and 'policyCode' takes a single value (or is entirely null).
2.3 Feature types: numerical vs. object
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(data_train.columns)))
numerical_fea
['id',
'loanAmnt',
'term',
'interestRate',
'installment',
'employmentTitle',
'homeOwnership',
'annualIncome',
'verificationStatus',
'isDefault',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'initialListStatus',
'applicationType',
'title',
'policyCode',
'n0',
'n1',
'n2',
'n2.1',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n11',
'n12',
'n13',
'n14']
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
data_train.grade
0 E
1 D
2 D
3 A
4 C
..
799995 C
799996 A
799997 C
799998 A
799999 B
Name: grade, Length: 800000, dtype: object
Numerical variables necessarily include both continuous and discrete variables; identify each kind.
- Split the numerical variables into continuous and discrete (categorical) ones
# Split numerical features into continuous vs. discrete
def get_numerical_serial_fea(data, feas):
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        if temp <= 10:  # 10 or fewer distinct values: treat as discrete
            numerical_noserial_fea.append(fea)
            continue
        numerical_serial_fea.append(fea)  # otherwise treat as continuous
    return numerical_serial_fea, numerical_noserial_fea
numerical_serial_fea,numerical_noserial_fea = get_numerical_serial_fea(data_train,numerical_fea)
numerical_serial_fea
['id',
'loanAmnt',
'interestRate',
'installment',
'employmentTitle',
'annualIncome',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'title',
'n0',
'n1',
'n2',
'n2.1',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n13',
'n14']
numerical_noserial_fea
['term',
'homeOwnership',
'verificationStatus',
'isDefault',
'initialListStatus',
'applicationType',
'policyCode',
'n11',
'n12']
- Discrete numerical variables
data_train['term'].value_counts()  # discrete variable
3 606902
5 193098
Name: term, dtype: int64
data_train['homeOwnership'].value_counts()  # discrete variable
0 395732
1 317660
2 86309
3 185
5 81
4 33
Name: homeOwnership, dtype: int64
data_train['verificationStatus'].value_counts()  # discrete variable
1 309810
2 248968
0 241222
Name: verificationStatus, dtype: int64
data_train['initialListStatus'].value_counts()  # discrete variable
0 466438
1 333562
Name: initialListStatus, dtype: int64
data_train['applicationType'].value_counts()
0 784586
1 15414
Name: applicationType, dtype: int64
data_train['policyCode'].value_counts()  # useless: the same value for every row
1.0 800000
Name: policyCode, dtype: int64
data_train['n11'].value_counts()  # discrete; extremely imbalanced, whether to use it needs further analysis
0.0 729682
1.0 540
2.0 24
4.0 1
3.0 1
Name: n11, dtype: int64
data_train['n12'].value_counts()  # extremely imbalanced
0.0 757315
1.0 2281
2.0 115
3.0 16
4.0 3
Name: n12, dtype: int64
- Continuous numerical variables
# visualize the distribution of every continuous numerical feature
f = pd.melt(data_train, value_vars = numerical_serial_fea )
g = sns.FacetGrid(f, col = 'variable',col_wrap = 2,sharex = False,sharey = False)
g = g.map(sns.distplot,'value')
- Inspect the distribution of each numerical variable and check whether it is roughly normal; if not, apply a log transform and check again.
- If a batch of features is to be standardized in one pass, the ones that are already normal must be set aside first.
# Plotting transaction amount value distributions
plt.figure(figsize=(16,12))
plt.suptitle('Transaction Values Distribution', fontsize = 22)
plt.subplot(221)
sub_plot_1 = sns.distplot(data_train['loanAmnt'])
sub_plot_1.set_title("loanAmnt Distribution", fontsize = 18)
sub_plot_1.set_xlabel("")
sub_plot_1.set_ylabel('Probability',fontsize = 15)
plt.subplot(222)
sub_plot_2 = sns.distplot(np.log(data_train['loanAmnt']))
sub_plot_2.set_title('loanAmnt (log) Distribution', fontsize = 18)
sub_plot_2.set_xlabel("")
sub_plot_2.set_ylabel('Probability',fontsize=15)
Text(0, 0.5, 'Probability')
- Non-numerical (object) categorical variables
category_fea
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
data_train['grade'].value_counts()
B 233690
C 227118
A 139661
D 119453
E 55661
F 19053
G 5364
Name: grade, dtype: int64
data_train['subGrade'].value_counts()
C1 50763
B4 49516
B5 48965
B3 48600
C2 47068
C3 44751
C4 44272
B2 44227
B1 42382
C5 40264
A5 38045
A4 30928
D1 30538
D2 26528
A1 25909
D3 23410
A3 22655
A2 22124
D4 21139
D5 17838
E1 14064
E2 12746
E3 10925
E4 9273
E5 8653
F1 5925
F2 4340
F3 3577
F4 2859
F5 2352
G1 1759
G2 1231
G3 978
G4 751
G5 645
Name: subGrade, dtype: int64
data_train['employmentLength'].value_counts()
10+ years 262753
2 years 72358
< 1 year 64237
3 years 64152
1 year 52489
5 years 50102
4 years 47985
6 years 37254
8 years 36192
7 years 35407
9 years 30272
Name: employmentLength, dtype: int64
data_train['issueDate'].value_counts()
2016-03-01 29066
2015-10-01 25525
2015-07-01 24496
2015-12-01 23245
2014-10-01 21461
...
2007-08-01 23
2007-07-01 21
2008-09-01 19
2007-09-01 7
2007-06-01 1
Name: issueDate, Length: 139, dtype: int64
data_train['earliesCreditLine'].value_counts()
Aug-2001 5567
Sep-2003 5403
Aug-2002 5403
Oct-2001 5258
Aug-2000 5246
...
Dec-1960 1
Mar-1962 1
Jan-1944 1
Aug-1958 1
May-1957 1
Name: earliesCreditLine, Length: 720, dtype: int64
data_train['isDefault'].value_counts()
0 640390
1 159610
Name: isDefault, dtype: int64
2.4 Visualizing variable distributions
2.4.1 Single-variable distributions
plt.figure(figsize=(8,8))
sns.barplot(data_train['employmentLength'].value_counts(dropna = False)[:20],
data_train['employmentLength'].value_counts(dropna = False).keys()[:20])
plt.show()
2.4.2 Distribution of a feature conditioned on the label y
- Categorical variables (grade, employmentLength) by label
train_loan_fr = data_train.loc[data_train['isDefault'] == 1]
train_loan_nofr = data_train.loc[data_train['isDefault'] == 0]
fig,((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2,figsize=(15,8))
train_loan_fr.groupby('grade')['grade'].count().plot(kind = 'barh',ax = ax1, title = 'Count of grade fraud')
train_loan_nofr.groupby('grade')['grade'].count().plot(kind = 'barh',ax = ax2,
title='Count of grade no-fraud')
train_loan_fr.groupby('employmentLength')['employmentLength'].count().plot(kind = 'barh',
ax = ax3,
title = 'Count of employmentLength fraud')
train_loan_nofr.groupby('employmentLength')['employmentLength'].count().plot(kind = 'barh',
ax = ax4,
title = 'Count of employmentLength non-fraud')
plt.show()
The grade distributions of the two classes look broadly similar; only at grade = A is the separation between fraud and non-fraud clearly visible.
train_loan_fr.groupby('grade')['grade'].count()
grade
A 8432
B 31079
C 51106
D 36296
E 21390
F 8641
G 2666
Name: grade, dtype: int64
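To make this comparison quantitative, the per-grade default rate can be computed directly:
data_train.groupby('grade')['isDefault'].mean()  # default rate per grade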
- Distribution of a continuous variable (loanAmnt) by label
numerical_serial_fea
#numerical_fea  # the continuous numerical variables
['id',
'loanAmnt',
'interestRate',
'installment',
'employmentTitle',
'annualIncome',
'purpose',
'postCode',
'regionCode',
'dti',
'delinquency_2years',
'ficoRangeLow',
'ficoRangeHigh',
'openAcc',
'pubRec',
'pubRecBankruptcies',
'revolBal',
'revolUtil',
'totalAcc',
'title',
'n0',
'n1',
'n2',
'n2.1',
'n4',
'n5',
'n6',
'n7',
'n8',
'n9',
'n10',
'n13',
'n14']
fig,((ax1,ax2)) = plt.subplots(1,2,figsize = (15,6))
data_train.loc[data_train['isDefault']==1]['loanAmnt'].apply(np.log).plot(kind = 'hist',
bins = 100,
title = 'Log Loan Amt - Fraud',
color = 'r',
xlim = (-3,10),
ax = ax1)
# a Series can be plotted directly
data_train.loc[data_train['isDefault']==0]['loanAmnt'].apply(np.log).plot(kind = 'hist',
bins = 100,
title = 'Log Loan Amt - no Fraud',
color = 'b',
xlim = (-3,10),
ax = ax2)
<matplotlib.axes._subplots.AxesSubplot at 0x12a1a0898>
total = len(data_train)
total_amt = data_train.groupby('isDefault')['loanAmnt'].sum().sum()
plt.figure(figsize=(12,5))
plt.subplot(121)
plot_tr = sns.countplot(x='isDefault',data = data_train)
plot_tr.set_title('Fraud Loan Distribution \n 0: good user | 1:bad user',fontsize =14)
plot_tr.set_xlabel('Is fraud by count',fontsize= 16)
for p in plot_tr.patches:
    height = p.get_height()
    plot_tr.text(p.get_x() + p.get_width()/2.,
                 height + 3,
                 '{:1.2f}%'.format(height/total*100),
                 ha = 'center',
                 fontsize = 15)
percent_amt = (data_train.groupby(['isDefault'])['loanAmnt'].sum())
percent_amt = percent_amt.reset_index()
plt.subplot(122)
plot_tr_2 = sns.barplot(x = 'isDefault',y='loanAmnt',data=percent_amt)
plot_tr_2.set_title('Total Amount in LoanAmnt \n 0: good user | 1:bad user',fontsize = 14)
plot_tr_2.set_xlabel('Is fraud by percent',fontsize = 16)
plot_tr_2.set_ylabel('total Loan Amount Scalar',fontsize = 16)
for p in plot_tr_2.patches:  # iterate over the bars to annotate each one
    height = p.get_height()
    plot_tr_2.text(p.get_x() + p.get_width()/2.,  # x position of the label
                   height + 3,                    # y position of the label
                   '{:1.2f}%'.format(height/total_amt*100),  # label text
                   ha = 'center',                 # horizontally centered
                   fontsize = 15)
Handling and inspecting date features
# convert to datetime (apply would also work)
data_train['issueDate'] = pd.to_datetime(data_train['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data_train['issueDateDT'] = data_train['issueDate'].apply(lambda x: x - startdate).dt.days
# .dt.days extracts the day count (an integer) from each resulting Timedelta
# convert the test set to datetime the same way
data_test_a['issueDate'] = pd.to_datetime(data_test_a['issueDate'], format = '%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
data_test_a['issueDateDT'] = data_test_a['issueDate'].apply(lambda x: x - startdate).dt.days
# this expresses each issue date as the number of days since the start date
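The apply + lambda above can also be written in an equivalent, faster vectorized form:
data_train['issueDateDT'] = (data_train['issueDate'] - startdate).dt.days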
data_train.head()
 | id | loanAmnt | term | interestRate | installment | grade | subGrade | employmentTitle | employmentLength | homeOwnership | ... | n6 | n7 | n8 | n9 | n10 | n11 | n12 | n13 | n14 | issueDateDT
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 35000.0 | 5 | 19.52 | 917.97 | E | E2 | 320.0 | 2 years | 2 | ... | 8.0 | 4.0 | 12.0 | 2.0 | 7.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2587
1 | 1 | 18000.0 | 5 | 18.49 | 461.90 | D | D2 | 219843.0 | 5 years | 0 | ... | NaN | NaN | NaN | NaN | 13.0 | NaN | NaN | NaN | NaN | 1888
2 | 2 | 12000.0 | 5 | 16.99 | 298.17 | D | D3 | 31698.0 | 8 years | 0 | ... | 21.0 | 4.0 | 5.0 | 3.0 | 11.0 | 0.0 | 0.0 | 0.0 | 4.0 | 3044
3 | 3 | 11000.0 | 3 | 7.26 | 340.96 | A | A4 | 46854.0 | 10+ years | 1 | ... | 4.0 | 7.0 | 21.0 | 6.0 | 9.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2983
4 | 4 | 3000.0 | 3 | 12.99 | 101.07 | C | C2 | 54.0 | NaN | 1 | ... | 9.0 | 10.0 | 15.0 | 7.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 | 3196
5 rows × 48 columns
plt.hist(data_train['issueDateDT'],label='train')
plt.hist(data_test_a['issueDateDT'],label = 'test')
plt.legend();
plt.title('Distribution of issueDateDT dates');
# the train and test issueDateDT ranges overlap, so a time-based train/validation split would not be sensible
Pivot table
# pivot table: total loanAmnt per grade over issueDateDT
pivot = pd.pivot_table(data_train,index = ['grade'],columns=['issueDateDT'],
values=['loanAmnt'],aggfunc=np.sum)
pivot
(values: loanAmnt, aggfunc: sum)
grade \ issueDateDT | 0 | 30 | 61 | 92 | 122 | 153 | 183 | 214 | 245 | 274 | ... | 3926 | 3957 | 3987 | 4018 | 4048 | 4079 | 4110 | 4140 | 4171 | 4201
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
A | NaN | 53650.0 | 42000.0 | 19500.0 | 34425.0 | 63950.0 | 43500.0 | 168825.0 | 85600.0 | 101825.0 | ... | 13093850.0 | 11757325.0 | 11945975.0 | 9144000.0 | 7977650.0 | 6888900.0 | 5109800.0 | 3919275.0 | 2694025.0 | 2245625.0 |
B | NaN | 13000.0 | 24000.0 | 32125.0 | 7025.0 | 95750.0 | 164300.0 | 303175.0 | 434425.0 | 538450.0 | ... | 16863100.0 | 17275175.0 | 16217500.0 | 11431350.0 | 8967750.0 | 7572725.0 | 4884600.0 | 4329400.0 | 3922575.0 | 3257100.0 |
C | NaN | 68750.0 | 8175.0 | 10000.0 | 61800.0 | 52550.0 | 175375.0 | 151100.0 | 243725.0 | 393150.0 | ... | 17502375.0 | 17471500.0 | 16111225.0 | 11973675.0 | 10184450.0 | 7765000.0 | 5354450.0 | 4552600.0 | 2870050.0 | 2246250.0 |
D | NaN | NaN | 5500.0 | 2850.0 | 28625.0 | NaN | 167975.0 | 171325.0 | 192900.0 | 269325.0 | ... | 11403075.0 | 10964150.0 | 10747675.0 | 7082050.0 | 7189625.0 | 5195700.0 | 3455175.0 | 3038500.0 | 2452375.0 | 1771750.0 |
E | 7500.0 | NaN | 10000.0 | NaN | 17975.0 | 1500.0 | 94375.0 | 116450.0 | 42000.0 | 139775.0 | ... | 3983050.0 | 3410125.0 | 3107150.0 | 2341825.0 | 2225675.0 | 1643675.0 | 1091025.0 | 1131625.0 | 883950.0 | 802425.0 |
F | NaN | NaN | 31250.0 | 2125.0 | NaN | NaN | NaN | 49000.0 | 27000.0 | 43000.0 | ... | 1074175.0 | 868925.0 | 761675.0 | 685325.0 | 665750.0 | 685200.0 | 316700.0 | 315075.0 | 72300.0 | NaN |
G | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 24625.0 | NaN | NaN | ... | 56100.0 | 243275.0 | 224825.0 | 64050.0 | 198575.0 | 245825.0 | 53125.0 | 23750.0 | 25100.0 | 1000.0 |
7 rows × 139 columns
What can we conclude from this? The pivot shows how the total issued loan amount of each grade evolves over time; for instance, grades B and C account for the largest issuance volumes in the later periods.
Generating a data report
import pandas_profiling
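A minimal sketch of generating the report (ProfileReport and to_file are the pandas_profiling API; the output path is illustrative):
pfr = pandas_profiling.ProfileReport(data_train)
pfr.to_file('./example.html')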