Pandas:时间序列数据基本操作和分组
程序员文章站
2024-03-25 13:34:40
...
随机生成一个数组并转换为DataFrame对象
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 生成一个随机数组,服从标准正态分布
a = np.random.standard_normal((9, 5))
# 保留6位小数
a = a.round(6)
print a
[[-0.57746 -0.589821 0.052449 -2.033341 0.668837]
[-0.664741 -1.135394 1.304165 -0.786955 -0.531138]
[-0.463654 -0.652154 -0.376531 0.485559 0.904629]
[-0.191037 -1.591801 -0.765786 -0.507762 -0.350944]
[-0.710072 0.275955 -0.360209 0.118086 -0.994722]
[ 0.812942 -1.152795 0.752262 0.320789 0.082105]
[-0.722696 0.569946 -0.024101 -0.116255 -1.648291]
[ 0.321042 -1.067299 -1.296154 -0.444176 0.159482]
[-1.929412 0.539893 0.186463 -1.293188 -0.303421]]
# 通过数组构造DataFrame 对象
df_a = pd.DataFrame(a)
print df_a
0 1 2 3 4
0 -0.577460 -0.589821 0.052449 -2.033341 0.668837
1 -0.664741 -1.135394 1.304165 -0.786955 -0.531138
2 -0.463654 -0.652154 -0.376531 0.485559 0.904629
3 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944
4 -0.710072 0.275955 -0.360209 0.118086 -0.994722
5 0.812942 -1.152795 0.752262 0.320789 0.082105
6 -0.722696 0.569946 -0.024101 -0.116255 -1.648291
7 0.321042 -1.067299 -1.296154 -0.444176 0.159482
8 -1.929412 0.539893 0.186463 -1.293188 -0.303421
# 给DataFrame对象指定一个列名
df_a.columns = [['col1','col2','col3','col4','col5']]
print df_a
col1 col2 col3 col4 col5
0 -0.577460 -0.589821 0.052449 -2.033341 0.668837
1 -0.664741 -1.135394 1.304165 -0.786955 -0.531138
2 -0.463654 -0.652154 -0.376531 0.485559 0.904629
3 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944
4 -0.710072 0.275955 -0.360209 0.118086 -0.994722
5 0.812942 -1.152795 0.752262 0.320789 0.082105
6 -0.722696 0.569946 -0.024101 -0.116255 -1.648291
7 0.321042 -1.067299 -1.296154 -0.444176 0.159482
8 -1.929412 0.539893 0.186463 -1.293188 -0.303421
访问DataFrame中的任意元素
# 访问数据列表中的元素
df_a.iloc[3][0]
-0.19103700000000001
df_a.loc[3][0]
-0.19103700000000001
df_a['col1'][3]
-0.19103700000000001
df_a[3:4]['col1']
3 -0.191037
Name: col1, dtype: float64
date_range()方法的参数
date_range(start='',end='',periods=,freq='',tz='',normalize=bool,name='')
periods:期数,当start或end缺失时必须指定
freq :日期间隔,M表示一个月,5D表示5天
tz :本地化索引的时区名称,默认为None
normalize:把start,end日期规范为午夜,默认为False
name :结果索引名称,默认为None
date_range(),freq接受的参数值:
B:交易日,W:每周,M:月底,BM:每月最后一个交易日,MS:月初,BMS:每月第一个交易日,Q:季度末,BQ:季度最后一个交易日
QS:季度初,BQS:每季度第一个交易日,A:年底,BA:每年最后一个交易日,AS:年初,BAS:每年第一个交易日,H:每小时,T:每分钟
S:每秒,L:毫秒,U:微秒
# 金融数据大都包含时间索引,date_range生成时间索引
dates = pd.date_range('2017-7-10',periods=9,freq='D')
print dates
DatetimeIndex(['2017-07-10', '2017-07-11', '2017-07-12', '2017-07-13',
'2017-07-14', '2017-07-15', '2017-07-16', '2017-07-17',
'2017-07-18'],
dtype='datetime64[ns]', freq='D')
# 将日期索引作为新的索引
df_a.index = dates
print df_a
col1 col2 col3 col4 col5
2017-07-10 -0.577460 -0.589821 0.052449 -2.033341 0.668837
2017-07-11 -0.664741 -1.135394 1.304165 -0.786955 -0.531138
2017-07-12 -0.463654 -0.652154 -0.376531 0.485559 0.904629
2017-07-13 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944
2017-07-14 -0.710072 0.275955 -0.360209 0.118086 -0.994722
2017-07-15 0.812942 -1.152795 0.752262 0.320789 0.082105
2017-07-16 -0.722696 0.569946 -0.024101 -0.116255 -1.648291
2017-07-17 0.321042 -1.067299 -1.296154 -0.444176 0.159482
2017-07-18 -1.929412 0.539893 0.186463 -1.293188 -0.303421
# ndarray可以生成DataFrame对象,DataFrame也可以转换为ndarray
np.array(df_a).round(6)
array([[-0.57746 , -0.589821, 0.052449, -2.033341, 0.668837],
[-0.664741, -1.135394, 1.304165, -0.786955, -0.531138],
[-0.463654, -0.652154, -0.376531, 0.485559, 0.904629],
[-0.191037, -1.591801, -0.765786, -0.507762, -0.350944],
[-0.710072, 0.275955, -0.360209, 0.118086, -0.994722],
[ 0.812942, -1.152795, 0.752262, 0.320789, 0.082105],
[-0.722696, 0.569946, -0.024101, -0.116255, -1.648291],
[ 0.321042, -1.067299, -1.296154, -0.444176, 0.159482],
[-1.929412, 0.539893, 0.186463, -1.293188, -0.303421]])
基本操作
# DataFrame有一些内建方法用于简单的计算
# 每列求和
df_a.sum()
col1 -4.125088
col2 -4.803470
col3 -0.527442
col4 -4.257243
col5 -2.013463
dtype: float64
# 每行求和
df_a.sum(axis=1)
2017-07-10 -2.479336
2017-07-11 -1.814063
2017-07-12 -0.102151
2017-07-13 -3.407330
2017-07-14 -1.670962
2017-07-15 0.815303
2017-07-16 -1.941397
2017-07-17 -2.327105
2017-07-18 -2.799665
Freq: D, dtype: float64
# 平均值,可以通过参数改变计算的维度
df_a.mean(axis=1)
2017-07-10 -0.495867
2017-07-11 -0.362813
2017-07-12 -0.020430
2017-07-13 -0.681466
2017-07-14 -0.334192
2017-07-15 0.163061
2017-07-16 -0.388279
2017-07-17 -0.465421
2017-07-18 -0.559933
Freq: D, dtype: float64
# 累计求和
print df_a.cumsum(axis=0)
col1 col2 col3 col4 col5
2017-07-10 -0.577460 -0.589821 0.052449 -2.033341 0.668837
2017-07-11 -1.242201 -1.725215 1.356614 -2.820296 0.137699
2017-07-12 -1.705855 -2.377369 0.980083 -2.334737 1.042328
2017-07-13 -1.896892 -3.969170 0.214297 -2.842499 0.691384
2017-07-14 -2.606964 -3.693215 -0.145912 -2.724413 -0.303338
2017-07-15 -1.794022 -4.846010 0.606350 -2.403624 -0.221233
2017-07-16 -2.516718 -4.276064 0.582249 -2.519879 -1.869524
2017-07-17 -2.195676 -5.343363 -0.713905 -2.964055 -1.710042
2017-07-18 -4.125088 -4.803470 -0.527442 -4.257243 -2.013463
# 每行最大值
df_a.max(axis=1)
2017-07-10 0.668837
2017-07-11 1.304165
2017-07-12 0.904629
2017-07-13 -0.191037
2017-07-14 0.275955
2017-07-15 0.812942
2017-07-16 0.569946
2017-07-17 0.321042
2017-07-18 0.539893
Freq: D, dtype: float64
# describe方法
print df_a.describe()
col1 col2 col3 col4 col5
count 9.000000 9.000000 9.000000 9.000000 9.000000
mean -0.458343 -0.533719 -0.058605 -0.473027 -0.223718
std 0.762330 0.805681 0.775557 0.808824 0.794598
min -1.929412 -1.591801 -1.296154 -2.033341 -1.648291
25% -0.710072 -1.135394 -0.376531 -0.786955 -0.531138
50% -0.577460 -0.652154 -0.024101 -0.444176 -0.303421
75% -0.191037 0.275955 0.186463 0.118086 0.159482
max 0.812942 0.569946 1.304165 0.485559 0.904629
NumPy的一些适用于ndarray的函数,也可以用在DataFrame对象上
print np.sqrt(df_a)
col1 col2 col3 col4 col5
2017-07-10 NaN NaN 0.229017 NaN 0.817825
2017-07-11 NaN NaN 1.142000 NaN NaN
2017-07-12 NaN NaN NaN 0.696821 0.951120
2017-07-13 NaN NaN NaN NaN NaN
2017-07-14 NaN 0.525314 NaN 0.343636 NaN
2017-07-15 0.901633 NaN 0.867330 0.566382 0.286540
2017-07-16 NaN 0.754948 NaN NaN NaN
2017-07-17 0.566606 NaN NaN NaN 0.399352
2017-07-18 NaN 0.734774 0.431814 NaN NaN
# pandas有很强的容错能力,对于不完整的数据,可以忽略NaN值
print np.sqrt(df_a).sum()
col1 1.468239
col2 2.015036
col3 2.670162
col4 1.606839
col5 2.454836
dtype: float64
# Pandas提供了Matplotlib的封装器,专门对DataFrame对象所设计
# 一行代码即可画图
%matplotlib inline
df_a.cumsum().plot(grid=True)
# plot方法的一些参数
subplots :布尔值,默认False; 在子图中分别绘制各列
sharex :布尔值,默认True ; 共享x轴
sharey :布尔值,默认False; 共享y轴
use_index :布尔值,默认True; DataFrame对象的索引作为x轴
stacked :布尔值,默认False; 堆叠(柱状图)
sort_columns:布尔值,默认False; 按字母顺序,排序列名
title :字符串, 标题
grid :布尔值,默认False; 网格线
legend :布尔值,默认True; 图例
ax :matplotlib绘图对象
style :字符串,列表,字典; 每列的线型样式
kind :line,bar,barh,kde,density;图表类型
logx,logy :布尔值,默认False; 对数刻度
xticks :序列,默认index; x轴刻度
xlim :二元组, x轴范围
rot :int,默认None; 旋转x轴刻度
secondary_y:布尔值/序列,默认False; 第二个y轴
mark_right :布尔值,默认True; 使用第二个y轴时自动在图例中标注列标签
colormap :字符串或颜色映射对象; 颜色映射
kwds :关键字 传递给matplotlib选项
Series
# Series
# 从DataFrame对象中选择单一的列时,就得到了一个Series对象
print df_a['col1']
2017-07-10 -0.577460
2017-07-11 -0.664741
2017-07-12 -0.463654
2017-07-13 -0.191037
2017-07-14 -0.710072
2017-07-15 0.812942
2017-07-16 -0.722696
2017-07-17 0.321042
2017-07-18 -1.929412
Freq: D, Name: col1, dtype: float64
type(df_a['col1'])
pandas.core.series.Series
# DataFrame对象的一些方法也可以用在Series对象上
%matplotlib inline
df_a['col2'].cumsum().plot(style='r')
GroupBy分组
类似于SQL中的分组和excel中的透视表
# 进行分组前对df_a进行列扩充
df_a['class'] = ['a','a','a','b','b','b','c','c','d']
print df_a
col1 col2 col3 col4 col5 class
2017-07-10 -0.577460 -0.589821 0.052449 -2.033341 0.668837 a
2017-07-11 -0.664741 -1.135394 1.304165 -0.786955 -0.531138 a
2017-07-12 -0.463654 -0.652154 -0.376531 0.485559 0.904629 a
2017-07-13 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944 b
2017-07-14 -0.710072 0.275955 -0.360209 0.118086 -0.994722 b
2017-07-15 0.812942 -1.152795 0.752262 0.320789 0.082105 b
2017-07-16 -0.722696 0.569946 -0.024101 -0.116255 -1.648291 c
2017-07-17 0.321042 -1.067299 -1.296154 -0.444176 0.159482 c
2017-07-18 -1.929412 0.539893 0.186463 -1.293188 -0.303421 d
# 根据class对df_a进行分组
groups = df_a.groupby('class')
# 计算每组数据的平均值,最大值,以及每组的大小
print groups.mean()
col1 col2 col3 col4 col5
class
a -0.568618 -0.792456 0.326694 -0.778246 0.347443
b -0.029389 -0.822880 -0.124578 -0.022962 -0.421187
c -0.200827 -0.248677 -0.660127 -0.280216 -0.744404
d -1.929412 0.539893 0.186463 -1.293188 -0.303421
print groups.max()
col1 col2 col3 col4 col5
class
a -0.463654 -0.589821 1.304165 0.485559 0.904629
b 0.812942 0.275955 0.752262 0.320789 0.082105
c 0.321042 0.569946 -0.024101 -0.116255 0.159482
d -1.929412 0.539893 0.186463 -1.293188 -0.303421
print groups.size()
class
a 3
b 3
c 2
d 1
dtype: int64
# 分组可以在多个列上进行,对df_a再进行列扩充
df_a['class2'] = ['A','A','A','A','B','B','B','B','C']
print df_a.groupby(['class2','class']).size()
class2 class
A a 3
b 1
B b 2
c 2
C d 1
dtype: int64
上一篇: 《SQLite》在SQLite数据库中创建一张“表”
下一篇: 爬取微博
推荐阅读