Python dataframe列拆分多行与统计
程序员文章站
2024-01-05 23:13:29
...
7.2.4 列拆分多行与统计
需求:对原因字段里按照分隔符拆分并汇总统计分析
解决方法:通过Python的DataFrame、Mysql结合row_number进行统计完成该需求。
# coding=utf8
import pandas as pd
#返回<br/>的个数
def fuc_brCnt(col):
return col['REJECT'].count('<br/>',0,600)
def tidy_split(df, column, sep='|', keep=False):
indexes = list()
new_values = list()
df = df.dropna(subset=[column])
for i, presplit in enumerate(df[column].astype(str)):
values = presplit.split(sep)
if keep and len(values) > 1:
indexes.append(i)
new_values.append(presplit)
for value in values:
indexes.append(i)
new_values.append(value)
new_df = df.iloc[indexes, :].copy()
new_df[column] = new_values
return new_df
#返回冒号的个数
def fuc_ColonCnt(col):
return col['REJECT'].count(':',0,600)
def couple_single(df,df_isApply,type):
newtab1_isApply = df[df_isApply]
newtab1_isApply.to_csv(r'D:\newtab1_is'+type+'.csv')
newtab1_isApply_1 = tidy_split(newtab1_isApply.iloc[:, :], 'REJECT', sep='`')
newtab1_isApply_1.to_csv(r'D:\newtab1_is'+type+'_1.csv')
newtab1_isApply_2 = tidy_split(newtab1_isApply_1.iloc[:, :], 'REJECT', sep='-')
newtab1_isApply_2.reset_index(level=0, inplace=True)
newtab1_isApply_2.rename(columns={'index': 'IDX_OLD'}, inplace=True)
newtab1_isApply_2.to_csv(r'D:\newtab1_is'+type+'_2.csv')
newtab1_isApply_2_1 = newtab1_isApply_2.iloc[::2, :].reset_index(drop=True) ##模2为0的行
newtab1_isApply_2_2 = newtab1_isApply_2.iloc[1::2, :].reset_index(drop=True) ##模2为1的行
newtab1_isApply_2_1.rename(columns={'REJECT': 'SECOND'}, inplace=True)
newtab1_isApply_2_1.to_csv(r'D:\newtab1_is'+type+'_2_1.csv')
newtab1_isApply_3 = tidy_split(newtab1_isApply_2_2.iloc[:, :], 'REJECT', sep='|')
df_concat_apply = pd.concat([newtab1_isApply_2_1, newtab1_isApply_3['REJECT']], axis=1, sort=True)
if type=='Apply':
df_concat_apply['TYPE']=0
else:
df_concat_apply['TYPE'] =1
df_concat_apply.rename(columns={'REJECT': 'THIRD', 'TYPE': 'FIRST'}, inplace=True)
df_concat_apply['SECOND'] = df_concat_apply["SECOND"].map(lambda s: s.replace("申请人:", '').replace("配偶:", '')if ":" in s else s)
df_concat_apply.to_csv(r'D:\df_concat_'+type+'.csv')
def rejetct_split():
filename = r'./input/REJACT_DETAIL_reBuild.csv'
df = pd.read_csv(filename, header=0, sep="\t", names=["ID_CARD", "APPLY_ID", "CUSTOMER_GROUP", "REJECT"])
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 99)
pd.set_option('display.width', 1000)
df_ori = df.copy()
### Step 1 数据清洗
df_ori['REJECT']=df_ori["REJECT"].map(lambda s: s.strip("<br/>").replace(": ",':').replace("][","|").replace("]","").replace("[","").replace("<br/> ","`"))
df_ori.to_csv(r'd:\df_ori.csv')
### Step 2
df_ori['COLON']=df_ori.apply(fuc_ColonCnt, axis=1)
df_c=df_ori[df_ori.COLON>0].reset_index(drop=True)
newtab1 = tidy_split(df_c.iloc[:, :], 'REJECT',sep='#')
newtab1.to_csv(r'D:\newtab1.csv')
## 分割出来的空字符串过滤掉
newtab1_noEmp = (newtab1["REJECT"].apply(lambda x: x!=""))
newtab1=newtab1[newtab1_noEmp]
newtab1['REJECT'] = newtab1["REJECT"].map(lambda s: s.strip("`").strip("|").strip(" ") ) ## 这里空格处理
newtab1.to_csv(r'D:\newtab1_1.csv')
### Step 3 分两个dataframe单独处理
#newtab1_isApply = (newtab1["REJECT"].apply(lambda x: x.find("注册人:")>=0))
#couple_single(newtab1,newtab1_isApply,'Apply')
newtab1_isSpouse = (newtab1["REJECT"].apply(lambda x: x.find("推荐人:") >= 0))
couple_single(newtab1, newtab1_isSpouse, 'Spouse')
if __name__ == '__main__':
rejetct_split()
示例数据
ID |
SEQ |
TYPE |
REASON |
123456 |
10052500000000029871 |
sq |
注册者: 硬性条件-[非本地] [申请次数]<br/> |
923456 |
10052500000000029882 |
gq |
注册者: 其他原因 |
处理结果
AUTO_ID |
IDX_OLD |
ID |
SEQ |
TYPE |
SECOND |
COLON |
THIRD |
FIRST |
1 |
0 |
123456 |
10052500000000029871 |
sq |
硬性条件 |
1 |
非本地 |
注册者 |
2 |
0 |
123456 |
10052500000000029871 |
sq |
硬性条件 |
1 |
申请次数 |
注册者 |
3 |
1 |
923456 |
10052500000000029882 |
gq |
硬性条件 |
1 |
其它原因 |
推荐者 |
入库reason_V2表后结合mysql统计分析
SELECT CNT TYPE,COUNT(1) CNT,COUNT(DISTINCT A.ID) DISTINCT_P
FROM
(
SELECT A.SEQ,A.ID,ROW_NUMBER()OVER(PARTITION BY SEQ ORDER BY auto_ID) CNT
FROM reason_V2 A
)A
GROUP BY CNT
ORDER BY CNT
统计结果
TYPE |
CNT |
DISTINCT_P |
10 |
376 |
287 |
9 |
263 |
217 |
7 |
173 |
126 |