DataFrame使用笔记(持续更新中)
程序员文章站
2022-06-24 17:42:20
...
DataFrame使用笔记
元数据操作
>>> df = pd.DataFrame((np.random.rand(4, 4)*100), columns=list('ABCD'))
>>> df
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
重置index
>>> df.T.reset_index()
index 0 1 2 3
0 A 90.1097 26.8527 74.1789 72.2995
1 B 77.4622 44.3568 17.546 29.8467
2 C 49.7188 0.610779 47.6466 36.7566
3 D 97.7677 39.4645 34.3834 54.0056
列操作
增加列
>>> df['E']='S'
>>> df['F']=np.nan
>>> df
A B C D E F
0 90.109671 77.462236 49.718842 97.767686 S NaN
1 26.852667 44.356768 0.610779 39.464533 S NaN
2 74.178923 17.545973 47.646630 34.383420 S NaN
3 72.299508 29.846723 36.756602 54.005581 S NaN
删除列
drop方式
>>> df.drop(['E', 'F'], axis=1) #只生成的新数据块中实现删除效果
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
>>> df
A B C D E F
0 90.109671 77.462236 49.718842 97.767686 167.571908 T
1 26.852667 44.356768 0.610779 39.464533 71.209435 F
2 74.178923 17.545973 47.646630 34.383420 91.724897 F
3 72.299508 29.846723 36.756602 54.005581 102.146231 T
>>> df.drop(['E','F'] ,axis=1,inplace =True) #inplace =True能删除原有数据块的相应行
>>> df
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
del方式
>>> df['E']='S'
>>> df['F']=np.nan
>>> del df['E']
>>> del df.F #注意此种方式只释放了指针,不会生效
A B C D F
0 90.109671 77.462236 49.718842 97.767686 NaN
1 26.852667 44.356768 0.610779 39.464533 NaN
2 74.178923 17.545973 47.646630 34.383420 NaN
3 72.299508 29.846723 36.756602 54.005581 NaN
>>> del df['F']
重命名列
对columns赋值
>>> df.columns=['AA','BB','CC','DD']
>>> df
AA BB CC DD
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
>>> df.columns=df.columns.map(lambda x:x[0])
>>> df
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
使用rename函数
>>> df.rename(columns={'A':'AA','B':'BB','C':'CC','D':'DD'},inplace=True)
>>> df
AA BB CC DD
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
>>> df.rename(columns=lambda x:x[0], inplace=True)
>>> df
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
求列数
>>> len(df.columns)
4
行操作
增加行
>>> df.loc[4,:]=4 #也可用于修改行
>>> df
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
4 4.000000 4.000000 4.000000 4.000000
删除行
>>> df.drop(4,axis=0,inplace=True) #删除index=4的行
>>> df
A B C D
0 90.109671 77.462236 49.718842 97.767686
1 26.852667 44.356768 0.610779 39.464533
2 74.178923 17.545973 47.646630 34.383420
3 72.299508 29.846723 36.756602 54.005581
求行数
>>> len(df)
4
df
apply应用函数处理
lambda表达式
>>> df['E'] = df.apply(lambda row: row.A + row.B, axis=1)
>>> df
A B C D E
0 90.109671 77.462236 49.718842 97.767686 167.571908
1 26.852667 44.356768 0.610779 39.464533 71.209435
2 74.178923 17.545973 47.646630 34.383420 91.724897
3 72.299508 29.846723 36.756602 54.005581 102.146231
上面lamda表达式等价于
>>> df['E'] = df['A'] + df['B']
>>> df
A B C D E
0 90.109671 77.462236 49.718842 97.767686 167.571908
1 26.852667 44.356768 0.610779 39.464533 71.209435
2 74.178923 17.545973 47.646630 34.383420 91.724897
3 72.299508 29.846723 36.756602 54.005581 102.146231
查看dataframe的转置
>>> df.T
0 1 2 3
A 90.109671 26.852667 74.178923 72.299508
B 77.462236 44.356768 17.545973 29.846723
C 49.718842 0.610779 47.646630 36.756602
D 97.767686 39.464533 34.383420 54.005581
E 167.571908 71.209435 91.724897 102.146231
排序
单列排序
>>> df.sort_values(by = 'A',axis = 0,ascending = True)
A B C D E
1 26.852667 44.356768 0.610779 39.464533 71.209435
3 72.299508 29.846723 36.756602 54.005581 102.146231
2 74.178923 17.545973 47.646630 34.383420 91.724897
0 90.109671 77.462236 49.718842 97.767686 167.571908
多列排序
>>> df.sort_values(by = ['A','B','C'],axis = 0,ascending = True)
A B C D E
1 26.852667 44.356768 0.610779 39.464533 71.209435
3 72.299508 29.846723 36.756602 54.005581 102.146231
2 74.178923 17.545973 47.646630 34.383420 91.724897
0 90.109671 77.462236 49.718842 97.767686 167.571908
筛选
行选择
#### 逻辑运算选择
>>> df[ (df.A>70) & (df.B<50) ]
A B C D E
2 74.178923 17.545973 47.646630 34.383420 91.724897
3 72.299508 29.846723 36.756602 54.005581 102.146231
等价于
>>> df[ (df['A']>70) & (df['B']<50) ]
A B C D E
2 74.178923 17.545973 47.646630 34.383420 91.724897
3 72.299508 29.846723 36.756602 54.005581 102.146231
isin(list)选择
>>> df['F'] = df.apply(lambda row: 'T' if row.E>100 else 'F', axis=1)
>>> df
A B C D E F
0 90.109671 77.462236 49.718842 97.767686 167.571908 T
1 26.852667 44.356768 0.610779 39.464533 71.209435 F
2 74.178923 17.545973 47.646630 34.383420 91.724897 F
3 72.299508 29.846723 36.756602 54.005581 102.146231 T
>>> df[df.F.isin(['T'])]
A B C D E F
0 90.109671 77.462236 49.718842 97.767686 167.571908 T
3 72.299508 29.846723 36.756602 54.005581 102.146231 T
抽样筛选
>>> df.sample(n=2)
A B C D E F
0 90.109671 77.462236 49.718842 97.767686 167.571908 T
2 74.178923 17.545973 47.646630 34.383420 91.724897 F
Head和tail
>>> df.head(2)
A B C D E F
0 90.109671 77.462236 49.718842 97.767686 167.571908 T
1 26.852667 44.356768 0.610779 39.464533 71.209435 F
>>> df.tail(2)
A B C D E F
2 74.178923 17.545973 47.646630 34.383420 91.724897 F
3 72.299508 29.846723 36.756602 54.005581 102.146231 T
列选择
>>> df[['A','B']]
A B
0 90.109671 77.462236
1 26.852667 44.356768
2 74.178923 17.545973
3 72.299508 29.846723
注意:df[[‘A’]]是一个pandas.core.frame.DataFrame对象,而df[‘A’]是一个pandas.core.series.Series对象
切片
pandas的索引函数主要有三种:
+ loc 标签索引,行和列的名称
+ iloc 整型索引(绝对位置索引),绝对意义上的几行几列,起始索引为0
+ ix 是 iloc 和 loc的合体
+ at是loc的快捷方式
+ iat是iloc的快捷方式
切块
集合操作
关联操作
data=stk_data.merge(idx_data,how='inner', on=['tradeDate'])
分组统计groupby
>>> df.groupby(['F']).agg({'A':['min','max','mean','std','var','count'],'B':['min','max','mean','std','var','count']})
A B \
min max mean std var count min
F
F 26.852667 74.178923 50.515795 33.464717 1119.887275 2 17.545973
T 72.299508 90.109671 81.204590 12.593687 158.600951 2 29.846723
max mean std var count
F
F 44.356768 30.951371 18.958095 359.409352 2
T 77.462236 53.654480 33.669252 1133.618553 2
lag和lead实现
当shift函数中的数字为正数时,我们就实现了lag的功能,当数字为负数时,实现的是lead的功能。
df = pd.DataFrame({'A':[12,20,12,5,18,11,18],
'C':['A','B','A','B','B','A','A']})
df['lag'] = df.groupby('C')['A'].shift(1)
df['lead'] = df.groupby('C')['A'].shift(-1)
print(df)
分析函数
df = pd.DataFrame({'A':[12,20,12,5,18,11,18],
'C':['A','B','A','B','B','A','A']})
df['row_number'] = df['A'].groupby(df['C']).rank(ascending=True,method='first')
print(df)