# pandas数据分析实战1
# 程序员文章站
# 2022-06-05 20:14:10
# ...
import time
import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# ---------------------------------------------------------------------------
# Load the raw tables.  Every *_train.txt file read with header=None has no
# header row, so its column names are supplied explicitly via `names=`.
# ---------------------------------------------------------------------------

# Divisor applied to every raw time column.  86400 is the number of seconds
# in a day, so this presumably converts second-based timestamps to days
# (TODO confirm the unit of the raw data).
_TIME_DIVISOR = 86400

# Join key shared by all per-user tables.
_USER_KEY = '用户标识'

# Loan time per user (true division: fractional values are kept).
_loan_time_columns = [_USER_KEY, '放款时间']
time_tabel = pd.read_csv("data/loan_time_train.txt", header=None,
                         names=_loan_time_columns)
time_tabel['放款时间'] = time_tabel['放款时间'].div(_TIME_DIVISOR)

# Static user attributes.
_user_columns = [
    _USER_KEY, '用户性别', '用户职业', '用户教育程度',
    '用户婚姻状态', '用户户口类型',
]
user_tabel = pd.read_csv("data/user_info_train.txt", header=None,
                         names=_user_columns)

# Credit-card bill records (true division on the time column as well).
_card_columns = [
    _USER_KEY, '时间', '银行标识', '上期账单金额', '上期还款金额', '信用卡额度',
    '本期账单余额', '本期账单最低还款额', '消费笔数', '本期账单金额', '调整金额',
    '循环利息', '可用余额', '预借现金额度', '还款状态',
]
card_tabel = pd.read_csv("data/bill_detail_train.txt", header=None,
                         names=_card_columns)
card_tabel['时间'] = card_tabel['时间'].div(_TIME_DIVISOR)

# Bill records joined with each user's loan time (inner join on the user key).
account = card_tabel.merge(time_tabel, how='inner', on=_USER_KEY)

# Label table, enriched with user attributes and loan time via inner joins.
traintabel = (
    pd.read_csv("data/overdue_train.txt", header=None,
                names=[_USER_KEY, '标签'])
    .merge(user_tabel, how='inner', on=_USER_KEY)
    .merge(time_tabel, how='inner', on=_USER_KEY)
)

# Plot configuration: select the SimHei font family in matplotlib first, then
# apply the seaborn style (which carries its own font.sans-serif override).
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'font.serif': ['SimHei'],
})
p = sns.color_palette()
sns.set_style("darkgrid", {"font.sans-serif": ['simhei', 'Arial']})

# Browse history (floor division: the time column becomes whole numbers).
_browse_columns = [_USER_KEY, '浏览时间', '浏览行为数据', '浏览子行为编号']
browse = pd.read_csv("data/browse_history_train.txt", header=None,
                     names=_browse_columns)
browse['浏览时间'] = browse['浏览时间'].floordiv(_TIME_DIVISOR)

# Bank transaction details.  This file is read with its own header row; the
# columns are renamed to match the other tables and the index labels are
# converted to strings (index=str), as in the original script.
_bank_column_map = {
    "uid": "用户标识",
    "timespan": "流水时间",
    "type": "交易类型",
    "amount": "交易金额",
    "markup": "工资收入标记",
}
bank = pd.read_csv("data/bank_detail_train.txt")
bank = bank.rename(index=str, columns=_bank_column_map)
bank['流水时间'] = bank['流水时间'].floordiv(_TIME_DIVISOR)
def _overdue_stats(frame, group_col, ratio_col):
    """Summarise the ``标签`` column of *frame* per value of *group_col*.

    Args:
        frame: DataFrame containing *group_col* and a ``标签`` column.
        group_col: Name of the column to group by.
        ratio_col: Name given to the derived ratio column.

    Returns:
        A DataFrame with one row per group and the columns *group_col*,
        ``逾期`` (sum of ``标签``), ``总数`` (count of non-null ``标签``) and
        *ratio_col* (``逾期 / 总数``).
    """
    # The original used the dict-renaming form `.agg({'new_name': 'func'})`
    # on a single selected column.  That form was deprecated in pandas 0.20
    # and raises SpecificationError ("nested renamer is not supported") from
    # pandas 1.0 on.  Named aggregation (pandas >= 0.25) is the supported
    # replacement; the keys are passed via ** so the output column names
    # stay ordinary strings.
    stats = frame.groupby(group_col, as_index=False).agg(
        **{'逾期': ('标签', 'sum'), '总数': ('标签', 'count')}
    )
    stats[ratio_col] = stats['逾期'] / stats['总数']
    return stats


# One summary table per user attribute; the variable and column names are
# unchanged so the plotting code that follows keeps working.
sex = _overdue_stats(traintabel, '用户性别', '性别逾期比')
career = _overdue_stats(traintabel, '用户职业', '职业逾期比')
marray = _overdue_stats(traintabel, '用户婚姻状态', '婚姻逾期比')
education = _overdue_stats(traintabel, '用户教育程度', '教育程度逾期比')
household = _overdue_stats(traintabel, '用户户口类型', '户口类型逾期比')
# Draw one bar chart per user attribute on a 3x2 grid, showing the ratio
# 逾期 / 总数 for each value of that attribute.
fig = plt.figure(figsize=(20, 20))

# (summary table, category column, x-axis label) for each panel, listed in
# subplot order.  Palette colour p[i] is used for the i-th panel.
_panels = [
    (career, '用户职业', u'用户职业'),
    (sex, '用户性别', u'用户性别'),
    (education, '用户教育程度', u'教育程度'),
    (marray, '用户婚姻状态', u'用户婚姻状态'),
    (household, '用户户口类型', u'用户户口类型'),
]

_axes = []
for _pos, (_table, _category, _xlabel) in enumerate(_panels, start=1):
    _ax = fig.add_subplot(3, 2, _pos)
    # Fixes relative to the original call:
    #  * x and y are passed by keyword; seaborn 0.12+ no longer accepts them
    #    positionally.
    #  * the target axes is passed explicitly instead of relying on the
    #    implicit "current axes".
    #  * x is the category column itself rather than the table's row index,
    #    so tick labels show the category codes, not row positions.
    sns.barplot(
        x=_table[_category],
        y=_table['逾期'] / _table['总数'],
        alpha=0.8,
        color=p[_pos - 1],
        label='train',
        ax=_ax,
    )
    _ax.legend()
    _ax.set_xlabel(_xlabel)
    _ax.set_ylabel(u'逾期用户比例')
    _axes.append(_ax)

# Keep the original module-level axes names available.
ax1, ax2, ax3, ax4, ax5 = _axes

plt.show()
# NOTE(review): kept from the original.  When show() blocks until the window
# is closed this call looks redundant -- confirm before removing.
plt.waitforbuttonpress(0)
# 上一篇: 九九乘法表之单双循环