概率论之pandas
程序员文章站
2024-03-25 20:40:10
...
快速入门
1
import numpy as np
import pandas as pd
s=pd.Series([1,3,5,np.nan,8,4])
Series
s=pd.Series([1,3,5,np.nan,8,4])
s
Out[6]:
0 1.0
1 3.0
2 5.0
3 NaN
4 8.0
5 4.0
dtype: float64
date_range
dates=pd.date_range('20190301',periods=6)
dates
Out[10]:
DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
'2019-03-05', '2019-03-06'],
dtype='datetime64[ns]', freq='D')
二维数组索引
data=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
data
Out[14]:
A B C D
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
data.shape
Out[15]: (6, 4)
data.values
Out[16]:
array([[ 1.86621859, -1.0030566 , 2.52689932, -0.56334339],
[-1.41362647, 1.11672695, -0.47005354, -0.56793016],
[ 0.11220208, -0.62181257, -0.65821206, -0.53731156],
[-0.5678393 , 2.17593279, 1.12604991, -0.412436 ],
[-1.85478576, 0.98009218, 0.79754332, -0.43238061],
[-0.28618627, 0.7689724 , 0.75578607, 0.15118955]])
字典时间戳
d={'A':1,'B':pd.Timestamp('20130301'),'C':range(4),'D':np.arange(4)}
d
Out[19]:
{'A': 1,
'B': Timestamp('2013-03-01 00:00:00'),
'C': range(0, 4),
'D': array([0, 1, 2, 3])}
构造二维形式
df=pd.DataFrame(d)
df
Out[21]:
A B C D
0 1 2013-03-01 0 0
1 1 2013-03-01 1 1
2 1 2013-03-01 2 2
查看数据和排序修改
data.head(2)
Out[22]:
A B C D
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
data.tail()
Out[23]:
A B C D
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
2019-03-06 -0.286186 0.768972 0.755786 0.151190
data.index
Out[24]:
DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
'2019-03-05', '2019-03-06'],
dtype='datetime64[ns]', freq='D')
data.columns
Out[25]: Index(['A', 'B', 'C', 'D'], dtype='object')
data.describe()
Out[26]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.357336 0.569476 0.679669 -0.393702
std 1.309362 1.181577 1.161292 0.275140
min -1.854786 -1.003057 -0.658212 -0.567930
25% -1.202180 -0.274116 -0.163594 -0.556835
50% -0.427013 0.874532 0.776665 -0.484846
75% 0.012605 1.082568 1.043923 -0.417422
max 1.866219 2.175933 2.526899 0.151190
data.T
Out[27]:
2019-03-01 2019-03-02 2019-03-03 2019-03-04 2019-03-05 2019-03-06
A 1.866219 -1.413626 0.112202 -0.567839 -1.854786 -0.286186
B -1.003057 1.116727 -0.621813 2.175933 0.980092 0.768972
C 2.526899 -0.470054 -0.658212 1.126050 0.797543 0.755786
D -0.563343 -0.567930 -0.537312 -0.412436 -0.432381 0.151190
按列排序
data.sort_index(axis=1)
Out[28]:
A B C D
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
2019-03-06 -0.286186 0.768972 0.755786 0.151190
data.sort_index(axis=1,ascending=False)
Out[29]:
D C B A
2019-03-01 -0.563343 2.526899 -1.003057 1.866219
2019-03-02 -0.567930 -0.470054 1.116727 -1.413626
2019-03-03 -0.537312 -0.658212 -0.621813 0.112202
2019-03-04 -0.412436 1.126050 2.175933 -0.567839
2019-03-05 -0.432381 0.797543 0.980092 -1.854786
2019-03-06 0.151190 0.755786 0.768972 -0.286186
data.sort_index(axis=1,ascending=True)
Out[30]:
A B C D
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
2019-03-06 -0.286186 0.768972 0.755786 0.151190
data.sort_values(by='A')
Out[31]:
A B C D
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-06 -0.286186 0.768972 0.755786 0.151190
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
data.A
Out[32]:
2019-03-01 1.866219
2019-03-02 -1.413626
2019-03-03 0.112202
2019-03-04 -0.567839
2019-03-05 -1.854786
2019-03-06 -0.286186
Freq: D, Name: A, dtype: float64
data[2:4]
Out[33]:
A B C D
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
data['20190302':'20190305']
Out[34]:
A B C D
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
loc效率较高
data.loc['20190302':'20190304']
Out[35]:
A B C D
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
data.iloc[2:4]
Out[36]:
A B C D
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
data.loc[:,['B','C']]
Out[37]:
B C
2019-03-01 -1.003057 2.526899
2019-03-02 1.116727 -0.470054
2019-03-03 -0.621813 -0.658212
2019-03-04 2.175933 1.126050
2019-03-05 0.980092 0.797543
2019-03-06 0.768972 0.755786
at效率更高访问特定值
data.at[pd.Timestamp('20190302'),'B']
Out[38]: 1.116726953479249
data.iat[1,1]
Out[39]: 1.116726953479249
布尔索引
data[data.A>0]
Out[40]:
A B C D
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
data[data>0]
Out[41]:
A B C D
2019-03-01 1.866219 NaN 2.526899 NaN
2019-03-02 NaN 1.116727 NaN NaN
2019-03-03 0.112202 NaN NaN NaN
2019-03-04 NaN 2.175933 1.126050 NaN
2019-03-05 NaN 0.980092 0.797543 NaN
2019-03-06 NaN 0.768972 0.755786 0.15119
data2=data.copy()
data2
Out[43]:
A B C D
2019-03-01 1.866219 -1.003057 2.526899 -0.563343
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
2019-03-06 -0.286186 0.768972 0.755786 0.151190
tag=['a']*2+['b']*2+['c']*2
data2['TAG']=tag
data2
Out[46]:
A B C D TAG
2019-03-01 1.866219 -1.003057 2.526899 -0.563343 a
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930 a
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312 b
2019-03-04 -0.567839 2.175933 1.126050 -0.412436 b
2019-03-05 -1.854786 0.980092 0.797543 -0.432381 c
2019-03-06 -0.286186 0.768972 0.755786 0.151190 c
data2[data2.TAG.isin(['a','c'])]
Out[48]:
A B C D TAG
2019-03-01 1.866219 -1.003057 2.526899 -0.563343 a
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930 a
2019-03-05 -1.854786 0.980092 0.797543 -0.432381 c
2019-03-06 -0.286186 0.768972 0.755786 0.151190 c
data.iat[0,0]=100
data
Out[50]:
A B C D
2019-03-01 100.000000 -1.003057 2.526899 -0.563343
2019-03-02 -1.413626 1.116727 -0.470054 -0.567930
2019-03-03 0.112202 -0.621813 -0.658212 -0.537312
2019-03-04 -0.567839 2.175933 1.126050 -0.412436
2019-03-05 -1.854786 0.980092 0.797543 -0.432381
2019-03-06 -0.286186 0.768972 0.755786 0.151190
data.A=range(6)
data
Out[53]:
A B C D
2019-03-01 0 -1.003057 2.526899 -0.563343
2019-03-02 1 1.116727 -0.470054 -0.567930
2019-03-03 2 -0.621813 -0.658212 -0.537312
2019-03-04 3 2.175933 1.126050 -0.412436
2019-03-05 4 0.980092 0.797543 -0.432381
2
import matplotlib.pyplot as plt
import numpy as np#矩阵运算
import pandas as pd
dates=pd.date_range('20190301',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
Out[6]:
A B C D
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108
2019-03-02 1.557134 -1.545051 -0.961491 0.515089
2019-03-03 1.331631 0.714138 1.231407 -0.422883
2019-03-04 0.719223 -0.260048 0.523958 1.172832
2019-03-05 0.293174 -0.045853 1.150185 0.631227
处理丢失数据
df1=df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1
Out[8]:
A B C D E
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108 NaN
2019-03-02 1.557134 -1.545051 -0.961491 0.515089 NaN
2019-03-03 1.331631 0.714138 1.231407 -0.422883 NaN
2019-03-04 0.719223 -0.260048 0.523958 1.172832 NaN
df1.loc[dates[1:3],'E']=2
df1
Out[10]:
A B C D E
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108 NaN
2019-03-02 1.557134 -1.545051 -0.961491 0.515089 2.0
2019-03-03 1.331631 0.714138 1.231407 -0.422883 2.0
2019-03-04 0.719223 -0.260048 0.523958 1.172832 NaN
丢掉空数据
df1.dropna()
Out[11]:
A B C D E
2019-03-02 1.557134 -1.545051 -0.961491 0.515089 2.0
2019-03-03 1.331631 0.714138 1.231407 -0.422883 2.0
填充数据
df1.fillna(value=5)
Out[12]:
A B C D E
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108 5.0
2019-03-02 1.557134 -1.545051 -0.961491 0.515089 2.0
2019-03-03 1.331631 0.714138 1.231407 -0.422883 2.0
2019-03-04 0.719223 -0.260048 0.523958 1.172832 5.0
判断是否有空数据
pd.isnull(df1)
Out[13]:
A B C D E
2019-03-01 False False False False True
2019-03-02 False False False False False
2019-03-03 False False False False False
2019-03-04 False False False False True
pd.isnull(df1).any()
Out[14]:
A False
B False
C False
D False
E True
dtype: bool
pd.isnull(df1).any().any()
Out[15]: True
shift平移数据
s=pd.Series((1,3,5,np.nan,6,8),index=dates).shift(2)
s
Out[17]:
2019-03-01 NaN
2019-03-02 NaN
2019-03-03 1.0
2019-03-04 3.0
2019-03-05 5.0
2019-03-06 NaN
Freq: D, dtype: float64
df
Out[18]:
A B C D
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108
2019-03-02 1.557134 -1.545051 -0.961491 0.515089
2019-03-03 1.331631 0.714138 1.231407 -0.422883
2019-03-04 0.719223 -0.260048 0.523958 1.172832
2019-03-05 0.293174 -0.045853 1.150185 0.631227
2019-03-06 -1.177185 -0.906234 2.267641 -0.571508
df.sub(s,axis='index')
Out[19]:
A B C D
2019-03-01 NaN NaN NaN NaN
2019-03-02 NaN NaN NaN NaN
2019-03-03 0.331631 -0.285862 0.231407 -1.422883
2019-03-04 -2.280777 -3.260048 -2.476042 -1.827168
2019-03-05 -4.706826 -5.045853 -3.849815 -4.368773
2019-03-06 NaN NaN NaN NaN
apply函数,传入函数
df.apply(np.cumsum)
Out[22]:
A B C D
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108
2019-03-02 -0.359533 -1.836306 -1.695351 2.949196
2019-03-03 0.972098 -1.122168 -0.463944 2.526314
2019-03-04 1.691322 -1.382216 0.060014 3.699146
2019-03-05 1.984496 -1.428069 1.210199 4.330373
2019-03-06 0.807311 -2.334303 3.477840 3.758864
df
Out[23]:
A B C D
2019-03-01 -1.916667 -0.291255 -0.733860 2.434108
2019-03-02 1.557134 -1.545051 -0.961491 0.515089
2019-03-03 1.331631 0.714138 1.231407 -0.422883
2019-03-04 0.719223 -0.260048 0.523958 1.172832
2019-03-05 0.293174 -0.045853 1.150185 0.631227
2019-03-06 -1.177185 -0.906234 2.267641 -0.571508
df.apply(lambda x:x.max()-x.min())
Out[25]:
A 3.473800
B 2.259189
C 3.229132
D 3.005616
dtype: float64
数据运算
s=pd.Series(np.random.randint(10,20,size=20))
s
Out[29]:
0 13
1 13
2 10
3 16
4 10
5 19
6 13
7 14
8 13
9 13
10 18
11 10
12 13
13 16
14 18
15 16
16 16
17 14
18 16
19 13
dtype: int32
s.value_counts()
Out[31]:
13 7
16 5
10 3
18 2
14 2
19 1
dtype: int64
s.mode(3)
Out[33]:
0 13
dtype: int32
数据合并
df=pd.DataFrame(np.random.randn(10,4),columns=list('ABCD'))
df
Out[35]:
A B C D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078 0.118621 0.359391 -0.427518
2 2.190398 -0.170339 -0.275725 0.184332
3 0.111006 2.263383 -1.164128 -1.653160
4 0.454094 -0.390870 0.181000 -0.713891
5 1.770307 -0.125938 -0.470755 0.073045
6 0.178509 -0.007117 -1.474438 1.280151
7 -1.074046 -1.068972 0.821342 -1.032382
8 0.767212 0.886415 -0.453329 -1.260979
9 -0.053771 2.024723 -0.387595 0.039338
df.iloc[:3]
Out[36]:
A B C D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078 0.118621 0.359391 -0.427518
2 2.190398 -0.170339 -0.275725 0.184332
df.iloc[3:7]
Out[37]:
A B C D
3 0.111006 2.263383 -1.164128 -1.653160
4 0.454094 -0.390870 0.181000 -0.713891
5 1.770307 -0.125938 -0.470755 0.073045
6 0.178509 -0.007117 -1.474438 1.280151
df.iloc[7:]
Out[38]:
A B C D
7 -1.074046 -1.068972 0.821342 -1.032382
8 0.767212 0.886415 -0.453329 -1.260979
9 -0.053771 2.024723 -0.387595 0.039338
pd.concat([df.iloc[:3],df.iloc[3:7],df.iloc[7:]])
Out[40]:
A B C D
0 -0.068485 -0.731070 -1.158196 -0.952469
1 -0.776078 0.118621 0.359391 -0.427518
2 2.190398 -0.170339 -0.275725 0.184332
3 0.111006 2.263383 -1.164128 -1.653160
4 0.454094 -0.390870 0.181000 -0.713891
5 1.770307 -0.125938 -0.470755 0.073045
6 0.178509 -0.007117 -1.474438 1.280151
7 -1.074046 -1.068972 0.821342 -1.032382
8 0.767212 0.886415 -0.453329 -1.260979
9 -0.053771 2.024723 -0.387595 0.039338
left=pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right=pd.DataFrame({'key':['foo','foo'],'lval':[4,5]})
pd.merge(left,right,on='key')
Out[45]:
key lval_x lval_y
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
等价于 select * from left INNER JOIN right ON left.key = right.key
df.append(s,ignore_index=True)
Out[46]:
A B C D 0 ... 15 16 17 18 19
0 -0.068485 -0.731070 -1.158196 -0.952469 NaN ... NaN NaN NaN NaN NaN
1 -0.776078 0.118621 0.359391 -0.427518 NaN ... NaN NaN NaN NaN NaN
2 2.190398 -0.170339 -0.275725 0.184332 NaN ... NaN NaN NaN NaN NaN
3 0.111006 2.263383 -1.164128 -1.653160 NaN ... NaN NaN NaN NaN NaN
4 0.454094 -0.390870 0.181000 -0.713891 NaN ... NaN NaN NaN NaN NaN
5 1.770307 -0.125938 -0.470755 0.073045 NaN ... NaN NaN NaN NaN NaN
6 0.178509 -0.007117 -1.474438 1.280151 NaN ... NaN NaN NaN NaN NaN
7 -1.074046 -1.068972 0.821342 -1.032382 NaN ... NaN NaN NaN NaN NaN
8 0.767212 0.886415 -0.453329 -1.260979 NaN ... NaN NaN NaN NaN NaN
9 -0.053771 2.024723 -0.387595 0.039338 NaN ... NaN NaN NaN NaN NaN
10 NaN NaN NaN NaN 13.0 ... 16.0 16.0 14.0 16.0 13.0
数据分组
df=pd.DataFrame({'A':['foo','bar','foo','bar','foo','bar','foo','foo'],
'B':['one','one','two','three','two','two','one','three'],
'C':np.random.randn(8),
'D':np.random.randn(8)})
df
Out[49]:
A B C D
0 foo one -0.985579 -1.184493
1 bar one 0.462434 2.005462
2 foo two -0.397243 -0.402288
3 bar three 1.273866 -0.170183
4 foo two 0.621279 0.342449
5 bar two 0.839009 0.278341
6 foo one -0.620062 1.062472
7 foo three -0.368640 -0.846278
df.groupby('A').sum()
Out[50]:
C D
A
bar 2.575308 2.113620
foo -1.750244 -1.028139
3
数据整形
tuples=list(zip(*[['bar','bar','baz','baz','foo','foo','qux','qux'],['one','two','one','two','one','two','one','two',]]))
tuples
Out[54]:
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
index=pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
Out[57]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
codes=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
df=pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
Out[59]:
A B
first second
bar one 0.811642 -1.801518
two -1.307845 -0.983155
baz one -1.939176 -0.270718
two 0.141660 -1.333879
foo one -0.311622 1.027429
two -1.819690 0.366389
qux one -0.714346 2.603627
two -1.325926 1.577150
列索引变行索引
stack=df.stack()
stack
Out[61]:
first second
bar one A 0.811642
B -1.801518
two A -1.307845
B -0.983155
baz one A -1.939176
B -0.270718
two A 0.141660
B -1.333879
foo one A -0.311622
B 1.027429
two A -1.819690
B 0.366389
qux one A -0.714346
B 2.603627
two A -1.325926
B 1.577150
dtype: float64
stack.index
Out[62]:
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two'], ['A', 'B']],
codes=[[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second', None])
stack.unstack()
Out[63]:
A B
first second
bar one 0.811642 -1.801518
two -1.307845 -0.983155
baz one -1.939176 -0.270718
two 0.141660 -1.333879
foo one -0.311622 1.027429
two -1.819690 0.366389
qux one -0.714346 2.603627
two -1.325926 1.577150
stack.unstack().unstack()
Out[64]:
A B
second one two one two
first
bar 0.811642 -1.307845 -1.801518 -0.983155
baz -1.939176 0.141660 -0.270718 -1.333879
foo -0.311622 -1.819690 1.027429 0.366389
qux -0.714346 -1.325926 2.603627 1.577150
数据透视
df=pd.DataFrame({'A':['one','one','two','three']*3,'B':['A','B','C']*4,'C':['foo','foo','foo','bar','bar','bar',]*2,'D':np.random.randn(12),'E':np.random.randn(12)})
df
Out[66]:
A B C D E
0 one A foo 1.242650 -2.186249
1 one B foo 0.390195 -0.003180
2 two C foo 0.572055 -0.556580
3 three A bar 1.125122 -0.182827
4 one B bar 0.518552 0.274675
5 one C bar -2.022590 -1.309944
6 two A foo 0.509303 0.574005
7 three B foo 0.204616 0.286539
8 one C foo 1.657085 0.699781
9 one A bar -0.502274 0.191885
10 two B bar 0.799141 0.494775
11 three C bar -0.355739 0.357618
df.pivot_table(values=['D'],index=['A','B'],columns=['C'])
Out[68]:
D
C bar foo
A B
one A -0.502274 1.242650
B 0.518552 0.390195
C -2.022590 1.657085
three A 1.125122 NaN
B NaN 0.204616
C -0.355739 NaN
two A NaN 0.509303
B 0.799141 NaN
C NaN 0.572055
多个数据求平均值
时间序列
rng=pd.date_range('20190301',periods=600,freq='s')
rng
Out[71]:
DatetimeIndex(['2019-03-01 00:00:00', '2019-03-01 00:00:01',
'2019-03-01 00:00:02', '2019-03-01 00:00:03',
'2019-03-01 00:00:04', '2019-03-01 00:00:05',
'2019-03-01 00:00:06', '2019-03-01 00:00:07',
'2019-03-01 00:00:08', '2019-03-01 00:00:09',
...
'2019-03-01 00:09:50', '2019-03-01 00:09:51',
'2019-03-01 00:09:52', '2019-03-01 00:09:53',
'2019-03-01 00:09:54', '2019-03-01 00:09:55',
'2019-03-01 00:09:56', '2019-03-01 00:09:57',
'2019-03-01 00:09:58', '2019-03-01 00:09:59'],
dtype='datetime64[ns]', length=600, freq='S')
a=pd.Series(np.random.randint(0,500,len(rng)),index=rng)
a
Out[73]:
2019-03-01 00:00:00 187
2019-03-01 00:00:01 140
2019-03-01 00:00:02 406
2019-03-01 00:00:03 398
2019-03-01 00:00:04 92
2019-03-01 00:00:05 387
2019-03-01 00:00:06 45
2019-03-01 00:00:07 305
2019-03-01 00:00:08 356
2019-03-01 00:00:09 252
2019-03-01 00:00:10 383
2019-03-01 00:00:11 343
2019-03-01 00:00:12 455
2019-03-01 00:00:13 70
2019-03-01 00:00:14 493
2019-03-01 00:00:15 308
2019-03-01 00:00:16 492
2019-03-01 00:00:17 49
2019-03-01 00:00:18 400
2019-03-01 00:00:19 90
2019-03-01 00:00:20 336
2019-03-01 00:00:21 50
2019-03-01 00:00:22 397
2019-03-01 00:00:23 85
2019-03-01 00:00:24 437
2019-03-01 00:00:25 448
2019-03-01 00:00:26 460
2019-03-01 00:00:27 395
2019-03-01 00:00:28 494
2019-03-01 00:00:29 463
...
2019-03-01 00:09:30 172
2019-03-01 00:09:31 60
2019-03-01 00:09:32 469
2019-03-01 00:09:33 73
2019-03-01 00:09:34 65
2019-03-01 00:09:35 41
2019-03-01 00:09:36 106
2019-03-01 00:09:37 140
2019-03-01 00:09:38 240
2019-03-01 00:09:39 454
2019-03-01 00:09:40 403
2019-03-01 00:09:41 380
2019-03-01 00:09:42 96
2019-03-01 00:09:43 343
2019-03-01 00:09:44 182
2019-03-01 00:09:45 246
2019-03-01 00:09:46 366
2019-03-01 00:09:47 283
2019-03-01 00:09:48 399
2019-03-01 00:09:49 278
2019-03-01 00:09:50 404
2019-03-01 00:09:51 424
2019-03-01 00:09:52 111
2019-03-01 00:09:53 126
2019-03-01 00:09:54 117
2019-03-01 00:09:55 120
2019-03-01 00:09:56 168
2019-03-01 00:09:57 92
2019-03-01 00:09:58 147
2019-03-01 00:09:59 455
Freq: S, Length: 600, dtype: int32
a.resample('2Min',how='sum')
D:\PyCharm\helpers\pydev\pydevconsole.py:1: FutureWarning: how in .resample() is deprecated
the new syntax is .resample(...).sum()
'''
Out[74]:
2019-03-01 00:00:00 32498
2019-03-01 00:02:00 32954
2019-03-01 00:04:00 29239
2019-03-01 00:06:00 30275
2019-03-01 00:08:00 27312
Freq: 2T, dtype: int32
rng=pd.period_range('2000Q1','2019Q1',freq='Q')
rng
Out[76]:
PeriodIndex(['2000Q1', '2000Q2', '2000Q3', '2000Q4', '2001Q1', '2001Q2',
'2001Q3', '2001Q4', '2002Q1', '2002Q2', '2002Q3', '2002Q4',
'2003Q1', '2003Q2', '2003Q3', '2003Q4', '2004Q1', '2004Q2',
'2004Q3', '2004Q4', '2005Q1', '2005Q2', '2005Q3', '2005Q4',
'2006Q1', '2006Q2', '2006Q3', '2006Q4', '2007Q1', '2007Q2',
'2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3', '2008Q4',
'2009Q1', '2009Q2', '2009Q3', '2009Q4', '2010Q1', '2010Q2',
'2010Q3', '2010Q4', '2011Q1', '2011Q2', '2011Q3', '2011Q4',
'2012Q1', '2012Q2', '2012Q3', '2012Q4', '2013Q1', '2013Q2',
'2013Q3', '2013Q4', '2014Q1', '2014Q2', '2014Q3', '2014Q4',
'2015Q1', '2015Q2', '2015Q3', '2015Q4', '2016Q1', '2016Q2',
'2016Q3', '2016Q4', '2017Q1', '2017Q2', '2017Q3', '2017Q4',
'2018Q1', '2018Q2', '2018Q3', '2018Q4', '2019Q1'],
dtype='period[Q-DEC]', freq='Q-DEC')
rng.to_timestamp()
Out[77]:
DatetimeIndex(['2000-01-01', '2000-04-01', '2000-07-01', '2000-10-01',
'2001-01-01', '2001-04-01', '2001-07-01', '2001-10-01',
'2002-01-01', '2002-04-01', '2002-07-01', '2002-10-01',
'2003-01-01', '2003-04-01', '2003-07-01', '2003-10-01',
'2004-01-01', '2004-04-01', '2004-07-01', '2004-10-01',
'2005-01-01', '2005-04-01', '2005-07-01', '2005-10-01',
'2006-01-01', '2006-04-01', '2006-07-01', '2006-10-01',
'2007-01-01', '2007-04-01', '2007-07-01', '2007-10-01',
'2008-01-01', '2008-04-01', '2008-07-01', '2008-10-01',
'2009-01-01', '2009-04-01', '2009-07-01', '2009-10-01',
'2010-01-01', '2010-04-01', '2010-07-01', '2010-10-01',
'2011-01-01', '2011-04-01', '2011-07-01', '2011-10-01',
'2012-01-01', '2012-04-01', '2012-07-01', '2012-10-01',
'2013-01-01', '2013-04-01', '2013-07-01', '2013-10-01',
'2014-01-01', '2014-04-01', '2014-07-01', '2014-10-01',
'2015-01-01', '2015-04-01', '2015-07-01', '2015-10-01',
'2016-01-01', '2016-04-01',
'2016-07-01', '2016-10-01',
'2017-01-01', '2017-04-01', '2017-07-01', '2017-10-01',
'2018-01-01', '2018-04-01', '2018-07-01', '2018-10-01',
'2019-01-01'],
dtype='datetime64[ns]', freq='QS-OCT')
时间运算
pd.Timestamp('20190302')-pd.Timestamp('20180817')
Out[78]: Timedelta('197 days 00:00:00')
pd.Timestamp('20190821')+pd.Timedelta(days=90)
Out[79]: Timestamp('2019-11-19 00:00:00')
数据可视化
插入操作
df=pd.DataFrame({'ID':(1,2,3,4,5,6),'raw_grade':('a','b','b','a','a','d')})
df
Out[82]:
ID raw_grade
0 1 a
1 2 b
2 3 b
3 4 a
4 5 a
5 6 d
df['grade']=df.raw_grade.astype('category')
df
Out[84]:
ID raw_grade grade
0 1 a a
1 2 b b
2 3 b b
3 4 a a
4 5 a a
5 6 d d
df.grade.cat.categories=['very good','good','bad']
df
Out[86]:
ID raw_grade grade
0 1 a very good
1 2 b good
2 3 b good
3 4 a very good
4 5 a very good
5 6 d bad
s=pd.Series(np.random.randn(1000),index=pd.date_range('20000101',periods=1000))
s
Out[88]:
2000-01-01 0.211737
2000-01-02 0.977688
2000-01-03 0.158780
2000-01-04 -0.608021
2000-01-05 -0.578980
2000-01-06 0.640827
2000-01-07 0.235984
2000-01-08 0.240308
2000-01-09 -0.868525
2000-01-10 0.330677
2000-01-11 -0.421135
2000-01-12 -0.673116
2000-01-13 -0.272406
2000-01-14 1.292067
2000-01-15 0.250142
2000-01-16 -1.780387
2000-01-17 0.510583
2000-01-18 -0.583681
2000-01-19 0.541223
2000-01-20 0.715070
2000-01-21 0.097076
2000-01-22 0.450696
2000-01-23 0.388371
2000-01-24 -0.122493
2000-01-25 -0.198185
2000-01-26 -1.314134
2000-01-27 -0.421204
2000-01-28 -0.847615
2000-01-29 0.104611
2000-01-30 -0.067303
...
2002-08-28 -0.581689
2002-08-29 0.533128
2002-08-30 1.263552
2002-08-31 -0.528198
2002-09-01 0.769058
2002-09-02 -0.737847
2002-09-03 0.602740
2002-09-04 -0.025594
2002-09-05 -1.756733
2002-09-06 0.711866
2002-09-07 0.472691
2002-09-08 0.007598
2002-09-09 -0.027224
2002-09-10 0.675931
2002-09-11 -1.979112
2002-09-12 1.464765
2002-09-13 -0.721414
2002-09-14 1.366840
2002-09-15 2.734268
2002-09-16 1.883177
2002-09-17 -0.289667
2002-09-18 0.846453
2002-09-19 1.358423
2002-09-20 -0.405080
2002-09-21 -0.233949
2002-09-22 -0.609962
2002-09-23 0.292490
2002-09-24 1.478842
2002-09-25 0.304181
2002-09-26 -0.369478
Freq: D, Length: 1000, dtype: float64
数据载入与保存
df=pd.DataFrame(np.random.randn(100,4),columns=list('ABCD'))
df
Out[94]:
A B C D
0 0.064095 0.909678 1.028639 -0.027457
1 -1.471539 -0.903830 -0.736624 -0.959836
2 -0.386030 0.435065 -0.550544 0.942760
3 -1.781232 0.709337 0.757159 -1.224160
4 3.036006 1.778238 -0.055653 2.688993
5 -0.052378 -0.917533 -0.215525 -0.355401
6 -1.154648 -0.130338 0.335879 -0.062172
7 -1.392447 1.605805 0.456320 0.434286
8 1.977163 1.002907 -0.272517 -0.718466
9 0.424313 -1.493273 0.015595 0.334600
10 -1.469211 -0.774529 0.642443 0.998211
11 -0.818350 0.756107 -0.830342 0.274861
12 0.337988 0.023247 -0.811474 2.679754
13 0.089583 0.088795 -0.804172 -1.085647
14 -1.257731 -1.034421 0.713171 -0.412326
15 -0.767504 -0.011130 0.906505 -0.306483
16 0.849606 -0.108564 -0.081838 -1.151810
17 -0.028719 -0.783104 -0.772919 -0.236640
18 0.421343 0.695579 0.293997 1.174231
19 -1.154045 0.530179 -0.300266 0.351877
20 0.075423 -0.602728 0.229882 0.334207
21 -0.097215 -1.332227 -2.146795 -0.385589
22 -0.300275 -2.516605 -0.925743 -1.656152
23 1.398598 0.286433 -0.524563 1.695352
24 -0.748971 0.807054 0.909789 -0.293069
25 -2.221560 0.081356 0.161429 0.175381
26 0.329723 -1.212668 0.102878 1.645397
27 1.695421 1.404585 -1.787636 1.679410
28 0.166341 1.225791 -0.852990 0.330605
29 -0.539736 -0.807441 -0.140344 -0.018509
.. ... ... ... ...
70 -2.225945 -2.131682 0.127904 -0.648249
71 -3.111897 -0.408759 1.322142 2.635054
72 -0.776495 -1.335791 -0.090325 1.978786
73 0.574853 0.394074 -0.064251 -1.413256
74 0.111574 -1.042979 0.320441 1.229795
75 -0.775243 -0.125456 0.191114 0.440073
76 -0.539562 -3.155613 -0.371731 -0.396655
77 -1.613776 -1.332295 0.590957 -0.183335
78 1.682681 -1.148102 -2.044948 -0.867742
79 -0.330760 0.907191 -0.909002 -0.110943
80 1.913363 0.008444 -0.809529 -1.403463
81 0.850681 0.357707 -0.042104 0.667657
82 -0.467310 -0.123689 0.616570 -1.746645
83 0.004234 0.000694 1.493874 -0.497544
84 -0.230444 -1.056287 -1.514723 1.024147
85 -1.380981 1.905973 -0.096171 -0.151536
86 -0.276202 -1.549406 -1.069241 1.181221
87 1.637849 -0.077122 0.338336 -2.048566
88 1.656047 0.279875 -1.131464 1.264856
89 -1.151340 -0.575027 -0.517369 -1.076379
90 -0.410812 -0.302089 -0.157782 0.087979
91 0.094909 -1.238561 -0.757331 -1.236590
92 1.164323 -0.615039 1.863732 0.741732
93 -0.145811 -0.470317 0.773096 -0.760261
94 1.095176 1.654621 1.456416 0.188866
95 -0.675272 -1.333038 0.605275 -0.821463
96 0.403931 0.528502 -0.123698 0.941321
97 1.000746 0.519680 -1.481362 -0.051457
98 -0.388573 -0.994269 0.170004 -0.851865
99 2.851694 0.458063 0.691514 -0.507752
[100 rows x 4 columns]
df.to_csv(data.csv)
Traceback (most recent call last):
File "C:\Users\HP\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3296, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-95-7a5c562dad46>", line 1, in <module>
df.to_csv(data.csv)
NameError: name 'data' is not defined
df.to_csv('data.csv')
电影评分数据
合并
不同性别平均得分
热门电影
核心数据结构
Series
一维带标签数组,数组里可以放任意数据(整数,浮点数,字符串,python object)
基本创建函数是
s=pd.Series(data,index=index)
其中index是一个列表 用来作为数据标签,data可以是不同数据类型
python字典
ndarray对象
标量值
创建
[4]:
s=pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
s
Out[4]:
a 0.643028
b -0.006442
c 1.466846
d -0.828449
e 1.953144
dtype: float64
[6]:
d={'a':0,'b':1,'d':3}
s=pd.Series(d,index=list('abcd'))
s
Out[6]:
a 0.0
b 1.0
c NaN
d 3.0
dtype: float64
[7]:
s=pd.Series(5,index=list('abcd'))
s
Out[7]:
a 5
b 5
c 5
d 5
dtype: int64
特性
Series对象性质
类ndarray对象
标签对齐操作
[9]:
np.sin(s)
Out[9]:
a -0.958924
b -0.958924
c -0.958924
d -0.958924
dtype: float64
[11]:
s['g']=100
s
Out[11]:
a 5
b 5
c 5
d 5
g 100
dtype: int64
s1=pd.Series(np.random.randn(3),index=['a','c','e'])
s2=pd.Series(np.random.randn(3),index=['a','d','e'])
s1+s2
Out[12]:
a 1.054570
c NaN
d NaN
e -0.658003
dtype: float64
DataFrame
1DataFrame是二维带行标签和列标签的数组,可以把DataFrame想象成一个Excel列表或一个sql数据库表格,还可以想象成一个Series对象字典,它是pandas里最常用的数据结构
2创建DataFrame基本格式是
df=pd.DataFrame(Data,index=index,columns=columns)
其中index是行标签,columns是列标签
data可以是下面数据:
有一维numpy数组,list,Series构成的字典
二维numpy数组
一个Series
另外的DataFrame
创建
d={'one':pd.Series([1,2,3],index=['a','b','c']),
'two':pd.Series([1,2,3,4],index=['a','b','c','d'])}
df=pd.DataFrame(d)
df
Out[14]:
   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4
d={'one':[1,2,3,4],
'two':[21,22,23,24]}
df=pd.DataFrame(d)
df
Out[15]:
   one  two
0    1   21
1    2   22
2    3   23
3    4   24
特性
列选择/增加/删除
使用assign()来插入新列
索引选择
选择一列->df[col]->Series
Series
根据行标签选择一行->df.loc[label]->Series
根据行位置选择一行->df.iloc[loc]->Series
选择多行->df[5:10]->DataFrame
根据布尔向量选择多行->df[bool_vector]->DataFrame
数据对齐
使用numpy函数
Panel
panel是三维带标签数组,实际上pandas名称就是由panel演进而来的,即pan(el)da(ta)s
panel比较少用,但依然是重要数据结构之一
items:坐标轴为0索引对应元素DataFrame
major_axis:坐标轴为1,DataFrame里行标签
minor_axis坐标轴2DataFrame列标签
基础运算
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
s=pd.Series([1,3,5,6,8],index=list('acefh'))
s
Out[6]:
a 1
c 3
e 5
f 6
h 8
dtype: int64
重新索引
s.reindex(list('abcdefgh'))
Out[7]:
a 1.0
b NaN
c 3.0
d NaN
e 5.0
f 6.0
g NaN
h 8.0
dtype: float64
s.reindex(list('abcdefgh'),fill_value=0)
Out[8]:
a 1
b 0
c 3
d 0
e 5
f 6
g 0
h 8
dtype: int64
s.reindex(list('abcdefgh'),method='ffill')
Out[9]:
a 1
b 1
c 3
d 3
e 5
f 6
g 6
h 8
dtype: int64
DataFrame
df=pd.DataFrame(np.random.randn(4,6),index=list('ADFH'),columns=['one','two','three','four','five','six'])
df
Out[11]:
one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
df2=df.reindex(index=list('ABCDEFGH'))
df2
Out[14]:
one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B NaN NaN NaN NaN NaN NaN
C NaN NaN NaN NaN NaN NaN
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E NaN NaN NaN NaN NaN NaN
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G NaN NaN NaN NaN NaN NaN
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
向前填充
df.reindex(index=list('ABCDEFGH'),method='ffill')
Out[15]:
one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
C -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
向后填充
df.reindex(index=list('ABCDEFGH'),method='bfill')
Out[16]:
one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
B -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
C -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
E 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
G -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
丢弃部分数据
df
Out[17]:
one two three four five six
A -0.636050 0.706831 0.037713 -0.618195 0.146753 0.227147
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
df.drop('A')
Out[18]:
one two three four five six
D -0.407534 -0.400880 -0.477477 0.404516 0.036828 0.095793
F 1.562102 0.717782 0.353708 1.315581 -0.209741 0.081160
H -0.936974 0.458558 -2.331884 0.200938 1.599978 0.010540
丢弃列
df.drop(['two','four'],axis=1)
Out[23]:
one three five six
A -0.636050 0.037713 0.146753 0.227147
D -0.407534 -0.477477 0.036828 0.095793
F 1.562102 0.353708 -0.209741 0.081160
H -0.936974 -2.331884 1.599978 0.010540
映射函数
df=pd.DataFrame(np.arange(12).reshape(4,3),index=['one','two','three','four'],columns=list('ABC'))
df
Out[25]:
A B C
one 0 1 2
two 3 4 5
three 6 7 8
four 9 10 11
apply
df.apply(lambda x:x.max()-x.min())
Out[27]:
A 9
B 9
C 9
dtype: int64
df.apply(lambda x:x.max()-x.min(),axis=1)
Out[28]:
one 2
two 2
three 2
four 2
dtype: int64
def min_max(x):
return pd.Series([x.min(),x.max()],index=['min','max'])
df.apply(min_max)
Out[29]:
A B C
min 0 1 2
max 9 10 11
applymap
formater='{0:.03f}'.format
df.applymap(formater)
Out[39]:
           A       B       C
one    0.000   1.000   2.000
two    3.000   4.000   5.000
three  6.000   7.000   8.000
four   9.000  10.000  11.000
排名
s
Out[42]:
0 3
1 6
2 2
3 6
4 4
dtype: int64
s.rank()
Out[44]:
0 2.0
1 4.5
2 1.0
3 4.5
4 3.0
dtype: float64
s.rank(method='first')
Out[45]:
0 2.0
1 4.0
2 1.0
3 5.0
4 3.0
dtype: float64
数据唯一性及成员
s=pd.Series(list('ABCDEFGHDFDFDFD'))
s.value_counts()
Out[47]:
D 5
F 4
B 1
C 1
E 1
A 1
H 1
G 1
dtype: int64
s.unique()
Out[48]: array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], dtype=object)
s.isin(['A','B','C'])
Out[50]:
0 True
1 True
2 True
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
dtype: bool
s.isin(s.unique())
Out[51]:
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
dtype: bool