pandas向量化字符串处理以及数据透视表笔记
import numpy as np
import pandas as pd
import re
df=pd.DataFrame(np.random.randint(2,15,(5,4)),columns=['data1','data2','data3','data4'])
df
   data1  data2  data3  data4
0     10     11      5      4
1     10     13     11      7
2     10      8      7     10
3     12      7     14      6
4     12     12      6      9
df.groupby(['data1','data2'])[['data3']].mean()
             data3
data1 data2
10    8          7
      11         5
      13        11
12    7         14
      12         6
df.groupby(['data1','data2'])[['data3','data4']].mean()
             data3  data4
data1 data2
10    8          7     10
      11         5      4
      13        11      7
12    7         14      6
      12         6      9
df.groupby(['data1','data2']).mean().unstack(level=1).fillna('x')
      data3                 data4
data2     7   8  11  12  13     7   8  11  12  13
data1
10        x   7   5   x  11     x  10   4   x   7
12       14   x   x   6   x     6   x   x   9   x
df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
"bar", "bar", "bar", "bar"],
"B": ["one", "one", "one", "two", "two",
"one", "one", "two", "two"],
"C": ["small", "large", "large", "small",
"small", "large", "small", "small",
"large"],
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
df
     A    B      C  D  E
0  foo  one  small  1  2
1  foo  one  large  2  4
2  foo  one  large  2  5
3  foo  two  small  3  5
4  foo  two  small  3  6
5  bar  one  large  4  6
6  bar  one  small  5  8
7  bar  two  small  6  9
8  bar  two  large  7  9
df.pivot_table(index='A',columns=['B','C'])
        D                       E
B     one         two         one         two
C   large small large small large small large small
A
bar   4.0   5.0   7.0   6.0   6.0   8.0   9.0   9.0
foo   2.0   1.0   NaN   3.0   4.5   2.0   NaN   5.5
df.pivot_table(index='A',columns=['B','C'],aggfunc={'D':'mean','E':'sum'},margins=True,margins_name='YY')
            D                                  E
B         one         two              YY    one         two          YY
C       large small large small            large small large small
A
bar  4.000000   5.0   7.0   6.0  5.500000    6.0   8.0   9.0   9.0    32
foo  2.000000   1.0   NaN   3.0  2.200000    9.0   2.0   NaN  11.0    22
YY   2.666667   3.0   7.0   4.0  3.666667   15.0  10.0   9.0  20.0    54
向量化字符串方法
data=pd.Series(['peter','paul','mary','gUIDO'])
data.str.capitalize()
0 Peter
1 Paul
2 Mary
3 Guido
dtype: object
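L={'peter':'pp'}
data.str.translate(L)
0 peter
1 paul
2 mary
3 gUIDO
dtype: object
注意:str.translate 按单个字符查表,映射表的键必须是字符的 Unicode 码点(通常用 str.maketrans 构造);上面以整个字符串 'peter' 为键,匹配不到任何字符,所以结果原样返回。若要把整个值 'peter' 换成 'pp',应使用 replace:
data.replace(L)
0 pp
1 paul
2 mary
3 gUIDO
dtype: object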
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte
0 Graham Chapman
1 John Cleese
2 Terry Gilliam
3 Eric Idle
4 Terry Jones
5 Michael Palin
dtype: object
monte.str.extract('([a-zA-Z]+)')
         0
0   Graham
1     John
2    Terry
3     Eric
4    Terry
5  Michael
text='Graham Chapman'
m=re.match('[a-zA-Z]+',text)
m.group(0)
'Graham'
monte.str.extract(r'(^[^AEIOU].*[^aeiou]$)')
                0
0  Graham Chapman
1             NaN
2   Terry Gilliam
3             NaN
4     Terry Jones
5   Michael Palin
下面两种写法等价,都是取每个字符串的前 3 个字符:
monte.str[0:3]
monte.str.slice(0,3)
0 Gra
1 Joh
2 Ter
3 Eri
4 Ter
5 Mic
dtype: object
monte.str.wrap(3)
0 Gra\nham\nCha\npma\nn
1 Joh\nn C\nlee\nse
2 Ter\nry \nGil\nlia\nm
3 Eri\nc I\ndle
4 Ter\nry \nJon\nes
5 Mic\nhae\nl P\nali\nn
dtype: object
monte.str.repeat(3)
0 Graham ChapmanGraham ChapmanGraham Chapman
1 John CleeseJohn CleeseJohn Cleese
2 Terry GilliamTerry GilliamTerry Gilliam
3 Eric IdleEric IdleEric Idle
4 Terry JonesTerry JonesTerry Jones
5 Michael PalinMichael PalinMichael Palin
dtype: object
方法分为以下三类:
- pandas向量化字符串处理与re库API:
- pandas其他字符串处理方法: