Pandas实用小功能
import pandas as pd
import numpy as np
1、读取时抽样百分比
data = pd.read_csv(r'..\test.csv',encoding='utf-8',
skiprows = lambda x: x>0 and np.random.rand() > 0.5)
data.head()
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
1 |
896 |
3 |
Hirvonen, Mrs. Alexander (Helga E Lindqvist) |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
2 |
897 |
3 |
Svensson, Mr. Johan Cervin |
male |
14.0 |
0 |
0 |
7538 |
9.2250 |
NaN |
S |
3 |
898 |
3 |
Connolly, Miss. Kate |
female |
30.0 |
0 |
0 |
330972 |
7.6292 |
NaN |
Q |
4 |
900 |
3 |
Abrahim, Mrs. Joseph (Sophie Halaut Easu) |
female |
18.0 |
0 |
0 |
2657 |
7.2292 |
NaN |
C |
2、Pandas 空值检查
data.isnull().any()
PassengerId False
Pclass False
Name False
Sex False
Age True
SibSp False
Parch False
Ticket False
Fare True
Cabin True
Embarked False
dtype: bool
data.isnull().sum()
PassengerId 0
Pclass 0
Name 0
Sex 0
Age 39
SibSp 0
Parch 0
Ticket 0
Fare 1
Cabin 146
Embarked 0
dtype: int64
3、replace 做清洗
data['Name'] = data['Name'].replace('[()]','',regex=True).astype('str')
data.head()
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
1 |
896 |
3 |
Hirvonen, Mrs. Alexander Helga E Lindqvist |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
2 |
897 |
3 |
Svensson, Mr. Johan Cervin |
male |
14.0 |
0 |
0 |
7538 |
9.2250 |
NaN |
S |
3 |
898 |
3 |
Connolly, Miss. Kate |
female |
30.0 |
0 |
0 |
330972 |
7.6292 |
NaN |
Q |
4 |
900 |
3 |
Abrahim, Mrs. Joseph Sophie Halaut Easu |
female |
18.0 |
0 |
0 |
2657 |
7.2292 |
NaN |
C |
4、使用 apply(type) 做类型检查
data['Age'].apply(type)
0 <class 'float'>
1 <class 'float'>
2 <class 'float'>
3 <class 'float'>
4 <class 'float'>
...
189 <class 'float'>
190 <class 'float'>
191 <class 'float'>
192 <class 'float'>
193 <class 'float'>
Name: Age, Length: 194, dtype: object
5、重新排序列
data1 = data[['Name','Age','Sex']]
data1.head()
|
Name |
Age |
Sex |
0 |
Myles, Mr. Thomas Francis |
62.0 |
male |
1 |
Hirvonen, Mrs. Alexander Helga E Lindqvist |
22.0 |
female |
2 |
Svensson, Mr. Johan Cervin |
14.0 |
male |
3 |
Connolly, Miss. Kate |
30.0 |
female |
4 |
Abrahim, Mrs. Joseph Sophie Halaut Easu |
18.0 |
female |
cols = data.columns[[2,4,3]]
data2 = data[cols]
data2.head()
|
Name |
Age |
Sex |
0 |
Myles, Mr. Thomas Francis |
62.0 |
male |
1 |
Hirvonen, Mrs. Alexander Helga E Lindqvist |
22.0 |
female |
2 |
Svensson, Mr. Johan Cervin |
14.0 |
male |
3 |
Connolly, Miss. Kate |
30.0 |
female |
4 |
Abrahim, Mrs. Joseph Sophie Halaut Easu |
18.0 |
female |
6、标签和位置选择数据
data.iloc[:2]
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
1 |
896 |
3 |
Hirvonen, Mrs. Alexander Helga E Lindqvist |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
data.iloc[:,1:3]
|
Pclass |
Name |
0 |
2 |
Myles, Mr. Thomas Francis |
1 |
3 |
Hirvonen, Mrs. Alexander Helga E Lindqvist |
2 |
3 |
Svensson, Mr. Johan Cervin |
3 |
3 |
Connolly, Miss. Kate |
4 |
3 |
Abrahim, Mrs. Joseph Sophie Halaut Easu |
... |
... |
... |
189 |
3 |
Riordan, Miss. Johanna Hannah"" |
190 |
3 |
Peacock, Miss. Treasteall |
191 |
3 |
Naughton, Miss. Hannah |
192 |
1 |
Minahan, Mrs. William Edward Lillian E Thorpe |
193 |
3 |
Henriksson, Miss. Jenny Lovisa |
194 rows × 2 columns
data.iloc[2:5,2:5]
|
Name |
Sex |
Age |
2 |
Svensson, Mr. Johan Cervin |
male |
14.0 |
3 |
Connolly, Miss. Kate |
female |
30.0 |
4 |
Abrahim, Mrs. Joseph Sophie Halaut Easu |
female |
18.0 |
7、找到出现频率最高的值TOPn–nlargest(n)
data['Title'] = data['Name'].str.split(',',expand=True)[1].str.split('.',expand=True)[0]
data['Title'].value_counts().nlargest(3)
Mr 107
Miss 40
Mrs 34
Name: Title, dtype: int64
8、结合使用 where 和 isin
称谓名称top3
top3 = data['Title'].value_counts().nlargest(3).index
top3
Index([' Mr', ' Miss', ' Mrs'], dtype='object')
其他的改为【others】
data1 = data.where(data['Title'].isin(top3),other='others')
data1['Title'].value_counts()
Mr 107
Miss 40
Mrs 34
others 13
Name: Title, dtype: int64
9、小分类值的替换
dt = data['Title'].value_counts(normalize=True)
dt
Mr 0.551546
Miss 0.206186
Mrs 0.175258
Master 0.051546
Ms 0.005155
Rev 0.005155
Dr 0.005155
Name: Title, dtype: float64
flag = 0.05
dt_others = dt[dt<flag].index
dt_others
Index([' Ms', ' Rev', ' Dr'], dtype='object')
data['Title'] = data['Title'].replace(dt_others,'others')
data['Title'].value_counts()
Mr 107
Miss 40
Mrs 34
Master 10
others 3
Name: Title, dtype: int64
10、生成时间序列的数据集
pd.util.testing.makeTimeDataFrame(10)
|
A |
B |
C |
D |
2000-01-03 |
-1.640177 |
2.286963 |
0.663883 |
0.730500 |
2000-01-04 |
-0.962963 |
0.468783 |
0.420607 |
-2.049439 |
2000-01-05 |
1.176148 |
0.336326 |
0.275701 |
-1.546738 |
2000-01-06 |
-0.346942 |
0.988084 |
0.638690 |
2.542611 |
2000-01-07 |
1.464977 |
-0.425312 |
0.452835 |
0.555940 |
2000-01-10 |
1.468564 |
0.102388 |
-1.694136 |
0.758442 |
2000-01-11 |
-0.741469 |
-0.037244 |
0.422585 |
-0.300642 |
2000-01-12 |
-0.749106 |
0.286984 |
-0.023601 |
0.093908 |
2000-01-13 |
-2.053304 |
0.408734 |
-0.602061 |
2.245386 |
2000-01-14 |
0.287270 |
1.010302 |
1.319785 |
0.667926 |
11、时间数据下采样
df = pd.DataFrame(np.random.randint(1,10,size=(100000,3)),columns = ['商品编码','商品销量','商品库存'])
df.index = pd.util.testing.makeDateIndex(100000,freq='H')
df.head()
|
商品编码 |
商品销量 |
商品库存 |
2000-01-01 00:00:00 |
7 |
9 |
1 |
2000-01-01 01:00:00 |
9 |
6 |
7 |
2000-01-01 02:00:00 |
6 |
8 |
1 |
2000-01-01 03:00:00 |
7 |
6 |
1 |
2000-01-01 04:00:00 |
3 |
2 |
5 |
使用 resample 方法,合并为天(D):
day_df = df.resample("D")["商品销量"].sum().to_frame()
day_df.head()
|
商品销量 |
2000-01-01 |
135 |
2000-01-02 |
117 |
2000-01-03 |
124 |
2000-01-04 |
106 |
2000-01-05 |
121 |
m_df = df.resample("M")['商品销量'].sum().to_frame()
m_df.head()
|
商品销量 |
2000-01-31 |
3767 |
2000-02-29 |
3473 |
2000-03-31 |
3818 |
2000-04-30 |
3538 |
2000-05-31 |
3674 |
y_df = df.resample("Y")['商品销量'].sum().to_frame()
y_df.head()
|
商品销量 |
2000-12-31 |
43720 |
2001-12-31 |
43705 |
2002-12-31 |
43642 |
2003-12-31 |
44268 |
2004-12-31 |
43867 |
12、转 datetime
df['datetime'] = pd.to_datetime(df.index,format='%Y/%m/%d')
df.head()
|
商品编码 |
商品销量 |
商品库存 |
datetime |
2000-01-01 00:00:00 |
7 |
9 |
1 |
2000-01-01 00:00:00 |
2000-01-01 01:00:00 |
9 |
6 |
7 |
2000-01-01 01:00:00 |
2000-01-01 02:00:00 |
6 |
8 |
1 |
2000-01-01 02:00:00 |
2000-01-01 03:00:00 |
7 |
6 |
1 |
2000-01-01 03:00:00 |
2000-01-01 04:00:00 |
3 |
2 |
5 |
2000-01-01 04:00:00 |
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100000 entries, 2000-01-01 00:00:00 to 2011-05-29 15:00:00
Freq: H
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 商品编码 100000 non-null int32
1 商品销量 100000 non-null int32
2 商品库存 100000 non-null int32
3 datetime 100000 non-null datetime64[ns]
dtypes: datetime64[ns](1), int32(3)
memory usage: 2.7 MB
13、map 做特征工程
d = {
"sex":["male", "female", "male","female"],
"color":["red", "green", "blue","green"],
"age":[25, 30, 15, 32]
}
df = pd.DataFrame(d)
df
|
sex |
color |
age |
0 |
male |
red |
25 |
1 |
female |
green |
30 |
2 |
male |
blue |
15 |
3 |
female |
green |
32 |
d = {"red": 0, "green": 1,'blue':2}
d
{'red': 0, 'green': 1, 'blue': 2}
df['color2'] = df['color'].map(d)
df
|
sex |
color |
age |
color2 |
0 |
male |
red |
25 |
0 |
1 |
female |
green |
30 |
1 |
2 |
male |
blue |
15 |
2 |
3 |
female |
green |
32 |
1 |