欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

存款流失预警模型--数据探索

程序员文章站 2022-05-26 19:02:15
...

存款流失预警模型–数据探索

crm 字段箱线图 输入数据集以及需要绘制的列
异常值比例
相关系数
缺失值比例
基本统计


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# 进入指定目录
os.chdir('/home/sophonAdmin/POC/poc_data/')
header = ['cust_isn'
,'bel_org'
,'deposit_cur_balabce'
,'deposit_year_total__balabce'
,'financial_cur_balabce'
,'financial_year_total__balabce'
,'loan_cur_balabce'
,'loan_year_total__balabce'
,'own_products_sum'
,'inbank_total_debt'
,'debt_year_dailyaverage'
,'asset_month_dailyaverage'
,'asset_qtr_dailyaverage'
,'asset_year_dailyaverage'
,'asset_this_diff_begin'
,'asset_this_diff_lastyear'
,'debt_month_dailyaverage'
,'debt_qtr_dailyaverage'
,'debt_this_diff_begin'
,'debt_this_diff_lastyear'
,'inbank_total_debt_increase_rate_year'
,'inbank_total_debt_increase_rate_month'
,'personal_consume_loan'
,'consume_loan_year_dailyaverage'
,'personal_consume_loan_increase_rate_year'
,'personal_consume_loan_increase_rate_month'
,'personal_manage_loan'
,'manage_loan_year_dailyaverage'
,'personal_manage_loan_increase_rate_year'
,'personal_manage_loan_increase_rate_month'
,'credit_card_over_balance'
,'credit_card_over_balance_increase_rate_year'
,'credit_card_over_balance_increase_rate_month'
,'fund_cur_balance'
,'noblemetal_cur_balance'
,'total_assets_increase_rate_month'
,'total_assets_increase_rate_year'
,'deposit_increase_rate_month'
,'deposit_increase_rate_year'
,'financial_increase_rate_month'
,'financial_increase_rate_year'
,'fund_increase_rate_month'
,'fund_increase_rate_year'
,'noblemetal_increase_rate_month'
,'noblemetal_increase_rate_year'
,'last_etl_acg_dt']
​
data = pd.read_csv('crm.csv',header=None)
/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
# 添加列名
data.columns = header
data_sample = data.sample(n=100000)
len(data_sample.bel_org.unique())
230
crm 字段箱线图 输入数据集以及需要绘制的列
# python_循环为每个特征做箱型图
data_temp = data_sample
def box(dataset,columns):
    fig_num = len(columns)
    sub_figure = [fig_num//5+1,5]
#     figsize : 2-tuple of floats, default: rcParams["figure.figsize"] = [6.4, 4.8]
# Figure dimension (width, height) in inches.
    plt.figure(figsize=[20,4*sub_figure[0]])
    for i in range(fig_num):
        plt.subplot(sub_figure[0],sub_figure[1],i+1)
        dataset[columns[i]].plot.box()
        plt.grid()        
columns = data_temp.columns.drop(['last_etl_acg_dt','cust_isn','bel_org'])
box(data_temp,columns)

data_sample.describe()

异常值比例
data_temp = data_sample
data_describe = data_temp.describe()
outlier_rate = []
num = len(data_temp)
plt.figure(figsize=[20,6])
for i in data_describe.columns:
    mid = data_describe[i]['50%']
    q_up = data_describe[i]['75%']
    q_down = data_describe[i]['25%']
    limit_up = mid+(q_up-q_down)*3
    limit_down = mid-(q_up-q_down)*3
#     求异常值所占比例
    outlier_rate.append(sum((data_temp[i]>limit_up) | (data_temp[i]<limit_down))/num)#     python range() 函数可创建一个整数列表
#  zip 将对象中对应的元素打包成一个个元组
for x,y in zip(list(range(len(data_describe.columns))),outlier_rate):
#     设置柱形图标题
    plt.text(x,y+0.005,y,ha = 'center',va = 'bottom',rotation=90)
# 画柱状图   
plt.bar(data_describe.columns,outlier_rate)
plt.xticks(rotation=90)
plt.grid()
plt.show()

相关系数
data_temp = data_sample
#相关系数矩阵绘制
def cor_matrix(cor):
    ax = plt.figure(figsize=(10,10)).add_subplot(111)
    col_num = len(cor)
    columns = cor.columns
    step = 0.0625*10/col_num
    start_pos_1_x = 0.15-(0.0625-step)/2
    start_pos_1_y = 0.15
    start_pos_2_x = 0.10-(0.0625-step)/2
    start_pos_2_y = 0.215-(0.0625-step)/2
    for i in range(col_num):
        plt.figtext(x=start_pos_1_x+i*step,y=start_pos_1_y,s=columns[i],rotation='vertical')
        plt.figtext(x=start_pos_2_x,y=start_pos_2_y+i*step,s=columns[-i-1],horizontalalignment='right')
    #for i in range(col_num):
    #    for j in range(col_num):
    #        plt.figtext(x=start_pos_1_x+i*step,y=start_pos_2_y+j*step,s = '%0.2f'%cor.iloc[i,col_num-j-1],horizontalalignment='center')
    im = ax.imshow(cor,)
    plt.colorbar(im)
​
columns = data_temp.columns.drop(['last_etl_acg_dt','cust_isn','bel_org'])
​
temp = data_temp[columns]
cor = temp.corr()
cor_matrix(cor)

缺失值比例
columns = data.columns.drop(['last_etl_acg_dt','cust_isn','bel_org'])
def nan_calculate(data,columns):
    num = len(data)
    nan_rate = []
    plt.figure(figsize=[15,8])
    for i in columns:    
        nan_rate.append(sum(data[i].isnull())/num)
    plt.bar(columns,nan_rate)for x,y in zip(list(range(len(columns))),nan_rate):
        plt.text(x,y+0.005,"%.2e"%y,ha = 'center',va = 'bottom',rotation=45)
​
    plt.xticks(rotation=90)
    plt.grid()
    plt.show()
nan_calculate(data,columns)

columns = data.columns.drop(['last_etl_acg_dt','cust_isn','bel_org'])[0:-12]
num = len(data)
nan_rate = []
plt.figure(figsize=[15,8])
for i in columns:    
    nan_rate.append(sum(data[i].isnull())/num)
plt.bar(columns,nan_rate)for x,y in zip(list(range(len(columns))),nan_rate):
    plt.text(x,y,"%.2e"%y,ha = 'center',va = 'bottom',rotation=45)
    
plt.xticks(rotation=90)
plt.grid()
plt.show()

基本统计
print('crm 数据表shape:',data.shape)
print('crm 去重客户内码数量:',len(data.cust_isn.unique()))
print('crm 去重客户内码+机构数量:',len(data[['cust_isn','bel_org']].drop_duplicates()))
print('crm last_etl 最大值:',max(data.last_etl_acg_dt))
print('crm last_etl 最小值:',min(data.last_etl_acg_dt))
crm 数据表shape: (1292488, 46)
crm 去重客户内码数量: 961629
crm 去重客户内码+机构数量: 1209175
crm last_etl 最大值: 2017-12-31
crm last_etl 最小值: 2016-11-30
每月crm记录数
date_count = data[['cust_isn','last_etl_acg_dt']].groupby('last_etl_acg_dt',as_index=False).count()
plt.figure(figsize=[12,6])
plt.plot(date_count.iloc[:,0],date_count.iloc[:,1],'.-')
plt.xticks(rotation=45)
plt.grid()
plt.ylim([0,200000])
plt.show()
date_count = data[['cust_isn','last_etl_acg_dt']].groupby('last_etl_acg_dt',as_index=False).count()
plt.figure(figsize=[12,6])
plt.plot(date_count.iloc[:,0],date_count.iloc[:,1],'.-')
plt.xticks(rotation=45)
plt.grid()
plt.ylim([0,200000])
plt.show()

相关标签: 机器学习案例