欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Pandas:时间序列数据基本操作和分组

程序员文章站 2024-03-25 13:34:40
...

随机生成一个数组并转换为DataFrame对象

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 生成一个随机数组,服从标准正态分布
a = np.random.standard_normal((9, 5))
# 保留6位小数
a = a.round(6)
print a
[[-0.57746  -0.589821  0.052449 -2.033341  0.668837]
 [-0.664741 -1.135394  1.304165 -0.786955 -0.531138]
 [-0.463654 -0.652154 -0.376531  0.485559  0.904629]
 [-0.191037 -1.591801 -0.765786 -0.507762 -0.350944]
 [-0.710072  0.275955 -0.360209  0.118086 -0.994722]
 [ 0.812942 -1.152795  0.752262  0.320789  0.082105]
 [-0.722696  0.569946 -0.024101 -0.116255 -1.648291]
 [ 0.321042 -1.067299 -1.296154 -0.444176  0.159482]
 [-1.929412  0.539893  0.186463 -1.293188 -0.303421]]
# 通过数组构造DataFrame 对象
df_a = pd.DataFrame(a)
print df_a
          0         1         2         3         4
0 -0.577460 -0.589821  0.052449 -2.033341  0.668837
1 -0.664741 -1.135394  1.304165 -0.786955 -0.531138
2 -0.463654 -0.652154 -0.376531  0.485559  0.904629
3 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944
4 -0.710072  0.275955 -0.360209  0.118086 -0.994722
5  0.812942 -1.152795  0.752262  0.320789  0.082105
6 -0.722696  0.569946 -0.024101 -0.116255 -1.648291
7  0.321042 -1.067299 -1.296154 -0.444176  0.159482
8 -1.929412  0.539893  0.186463 -1.293188 -0.303421
# 给DataFrame对象指定一个列名
df_a.columns = [['col1','col2','col3','col4','col5']]
print df_a
       col1      col2      col3      col4      col5
0 -0.577460 -0.589821  0.052449 -2.033341  0.668837
1 -0.664741 -1.135394  1.304165 -0.786955 -0.531138
2 -0.463654 -0.652154 -0.376531  0.485559  0.904629
3 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944
4 -0.710072  0.275955 -0.360209  0.118086 -0.994722
5  0.812942 -1.152795  0.752262  0.320789  0.082105
6 -0.722696  0.569946 -0.024101 -0.116255 -1.648291
7  0.321042 -1.067299 -1.296154 -0.444176  0.159482
8 -1.929412  0.539893  0.186463 -1.293188 -0.303421

访问DataFrame中的任意元素

# 访问数据列表中的元素
df_a.iloc[3][0]
-0.19103700000000001
df_a.loc[3][0]
-0.19103700000000001
df_a['col1'][3]
-0.19103700000000001
df_a[3:4]['col1']
3   -0.191037
Name: col1, dtype: float64

date_range()方法的参数

date_range(start='',end='',periods=,freq='',tz='',normalize=bool,name='')
periods:期数(生成的时间点个数),当start或end缺失时需要指定
freq   :日期间隔,M表示一个月,5D表示5天
tz     :本地化索引的时区名称,默认为None
normalize:把start,end日期规范为午夜,默认为False
name   :结果索引名称,默认为None
date_range(),freq接受的参数值:
    B:交易日,W:每周,M:月底,BM:每月最后一个交易日,MS:月初,BMS:每月第一个交易日,Q:季度末,BQ:季度最后一个交易日
    QS:季度初,BQS:每季度第一个交易日,A:年底,BA:每年最后一个交易日,AS:年初,BAS:每年第一个交易日,H:每小时,T:每分钟
    S:每秒,L:毫秒,U:微秒
# 金融数据大都包含时间索引,可以用date_range生成时间索引
dates = pd.date_range('2017-7-10',periods=9,freq='D')
print dates
DatetimeIndex(['2017-07-10', '2017-07-11', '2017-07-12', '2017-07-13',
               '2017-07-14', '2017-07-15', '2017-07-16', '2017-07-17',
               '2017-07-18'],
              dtype='datetime64[ns]', freq='D')
# 将日期索引作为新的索引
df_a.index = dates
print df_a
                col1      col2      col3      col4      col5
2017-07-10 -0.577460 -0.589821  0.052449 -2.033341  0.668837
2017-07-11 -0.664741 -1.135394  1.304165 -0.786955 -0.531138
2017-07-12 -0.463654 -0.652154 -0.376531  0.485559  0.904629
2017-07-13 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944
2017-07-14 -0.710072  0.275955 -0.360209  0.118086 -0.994722
2017-07-15  0.812942 -1.152795  0.752262  0.320789  0.082105
2017-07-16 -0.722696  0.569946 -0.024101 -0.116255 -1.648291
2017-07-17  0.321042 -1.067299 -1.296154 -0.444176  0.159482
2017-07-18 -1.929412  0.539893  0.186463 -1.293188 -0.303421
# ndarray可以生成DataFrame对象,DataFrame也可以转换为ndarray
np.array(df_a).round(6)
array([[-0.57746 , -0.589821,  0.052449, -2.033341,  0.668837],
       [-0.664741, -1.135394,  1.304165, -0.786955, -0.531138],
       [-0.463654, -0.652154, -0.376531,  0.485559,  0.904629],
       [-0.191037, -1.591801, -0.765786, -0.507762, -0.350944],
       [-0.710072,  0.275955, -0.360209,  0.118086, -0.994722],
       [ 0.812942, -1.152795,  0.752262,  0.320789,  0.082105],
       [-0.722696,  0.569946, -0.024101, -0.116255, -1.648291],
       [ 0.321042, -1.067299, -1.296154, -0.444176,  0.159482],
       [-1.929412,  0.539893,  0.186463, -1.293188, -0.303421]])

基本操作

# DataFrame有一些内建方法用于简单的计算
# 每列求和
df_a.sum()
col1   -4.125088
col2   -4.803470
col3   -0.527442
col4   -4.257243
col5   -2.013463
dtype: float64
# 每行求和
df_a.sum(axis=1)
2017-07-10   -2.479336
2017-07-11   -1.814063
2017-07-12   -0.102151
2017-07-13   -3.407330
2017-07-14   -1.670962
2017-07-15    0.815303
2017-07-16   -1.941397
2017-07-17   -2.327105
2017-07-18   -2.799665
Freq: D, dtype: float64
# 平均值,可以通过axis参数改变计算的维度(axis=1表示按行计算)
df_a.mean(axis=1)
2017-07-10   -0.495867
2017-07-11   -0.362813
2017-07-12   -0.020430
2017-07-13   -0.681466
2017-07-14   -0.334192
2017-07-15    0.163061
2017-07-16   -0.388279
2017-07-17   -0.465421
2017-07-18   -0.559933
Freq: D, dtype: float64
# 累计求和
print df_a.cumsum(axis=0)
                col1      col2      col3      col4      col5
2017-07-10 -0.577460 -0.589821  0.052449 -2.033341  0.668837
2017-07-11 -1.242201 -1.725215  1.356614 -2.820296  0.137699
2017-07-12 -1.705855 -2.377369  0.980083 -2.334737  1.042328
2017-07-13 -1.896892 -3.969170  0.214297 -2.842499  0.691384
2017-07-14 -2.606964 -3.693215 -0.145912 -2.724413 -0.303338
2017-07-15 -1.794022 -4.846010  0.606350 -2.403624 -0.221233
2017-07-16 -2.516718 -4.276064  0.582249 -2.519879 -1.869524
2017-07-17 -2.195676 -5.343363 -0.713905 -2.964055 -1.710042
2017-07-18 -4.125088 -4.803470 -0.527442 -4.257243 -2.013463
# 每行最大值
df_a.max(axis=1)
2017-07-10    0.668837
2017-07-11    1.304165
2017-07-12    0.904629
2017-07-13   -0.191037
2017-07-14    0.275955
2017-07-15    0.812942
2017-07-16    0.569946
2017-07-17    0.321042
2017-07-18    0.539893
Freq: D, dtype: float64
# describe方法
print df_a.describe()
           col1      col2      col3      col4      col5
count  9.000000  9.000000  9.000000  9.000000  9.000000
mean  -0.458343 -0.533719 -0.058605 -0.473027 -0.223718
std    0.762330  0.805681  0.775557  0.808824  0.794598
min   -1.929412 -1.591801 -1.296154 -2.033341 -1.648291
25%   -0.710072 -1.135394 -0.376531 -0.786955 -0.531138
50%   -0.577460 -0.652154 -0.024101 -0.444176 -0.303421
75%   -0.191037  0.275955  0.186463  0.118086  0.159482
max    0.812942  0.569946  1.304165  0.485559  0.904629

NumPy的一些适用于ndarray的函数,也可以用在DataFrame对象上

print np.sqrt(df_a)
                col1      col2      col3      col4      col5
2017-07-10       NaN       NaN  0.229017       NaN  0.817825
2017-07-11       NaN       NaN  1.142000       NaN       NaN
2017-07-12       NaN       NaN       NaN  0.696821  0.951120
2017-07-13       NaN       NaN       NaN       NaN       NaN
2017-07-14       NaN  0.525314       NaN  0.343636       NaN
2017-07-15  0.901633       NaN  0.867330  0.566382  0.286540
2017-07-16       NaN  0.754948       NaN       NaN       NaN
2017-07-17  0.566606       NaN       NaN       NaN  0.399352
2017-07-18       NaN  0.734774  0.431814       NaN       NaN
# pandas有很强的容错能力,对于不完整的数据,sum等方法默认会忽略NaN值
print np.sqrt(df_a).sum()
col1    1.468239
col2    2.015036
col3    2.670162
col4    1.606839
col5    2.454836
dtype: float64
# Pandas提供了Matplotlib的封装器,专门为DataFrame对象设计
# 一行代码即可画图
%matplotlib inline
df_a.cumsum().plot(grid=True)

Pandas:时间序列数据基本操作和分组Pandas:时间序列数据基本操作和分组

# plot方法的一些参数
subplots :布尔值,默认False;   为每一列单独绘制一个子图
sharex   :布尔值,默认True ;   共享x轴
sharey   :布尔值,默认False;   共享y轴
use_index :布尔值,默认True;   DataFrame对象的索引作为x轴
stacked   :布尔值,默认False;   堆叠(柱状图)
sort_columns:布尔值,默认False;  按字母顺序,排序列名
title     :字符串,         标题
grid      :布尔值,默认False;    网格线
legend    :布尔值,默认True;    图例
ax        :matplotlib绘图对象
style     :字符串,列表,字典,
kind      :line,bar,barh,kde,density;图表类型
logx,logy :布尔值,默认False;    对数刻度
xticks    :序列,默认index;         x轴刻度
xlim      :二元组,         x轴范围
rot       :int,默认None;          旋转x轴刻度
secondary_y:布尔值/序列,默认False; 第二个y轴
mark_right :布尔值,默认True;       第二个y轴自动设置标签
colormap   :字符串或颜色映射对象;    颜色映射
kwds       :关键字          传递给matplotlib选项

Series

# Series
# 从DataFrame对象中选择单一的列时,就得到了一个Series对象
print df_a['col1']
2017-07-10   -0.577460
2017-07-11   -0.664741
2017-07-12   -0.463654
2017-07-13   -0.191037
2017-07-14   -0.710072
2017-07-15    0.812942
2017-07-16   -0.722696
2017-07-17    0.321042
2017-07-18   -1.929412
Freq: D, Name: col1, dtype: float64
type(df_a['col1'])
pandas.core.series.Series
# DataFrame对象的一些方法也可以用在Series对象上
%matplotlib inline
df_a['col2'].cumsum().plot(style='r')

Pandas:时间序列数据基本操作和分组Pandas:时间序列数据基本操作和分组

GroupBy分组
类似于SQL中的分组和Excel中的透视表

# 进行分组前对df_a进行列扩充
df_a['class'] = ['a','a','a','b','b','b','c','c','d']
print df_a
                col1      col2      col3      col4      col5 class
2017-07-10 -0.577460 -0.589821  0.052449 -2.033341  0.668837     a
2017-07-11 -0.664741 -1.135394  1.304165 -0.786955 -0.531138     a
2017-07-12 -0.463654 -0.652154 -0.376531  0.485559  0.904629     a
2017-07-13 -0.191037 -1.591801 -0.765786 -0.507762 -0.350944     b
2017-07-14 -0.710072  0.275955 -0.360209  0.118086 -0.994722     b
2017-07-15  0.812942 -1.152795  0.752262  0.320789  0.082105     b
2017-07-16 -0.722696  0.569946 -0.024101 -0.116255 -1.648291     c
2017-07-17  0.321042 -1.067299 -1.296154 -0.444176  0.159482     c
2017-07-18 -1.929412  0.539893  0.186463 -1.293188 -0.303421     d
# 根据class对df_a进行分组
groups = df_a.groupby('class')
# 计算每组数据的平均值、最大值
print groups.mean()
           col1      col2      col3      col4      col5
class                                                  
a     -0.568618 -0.792456  0.326694 -0.778246  0.347443
b     -0.029389 -0.822880 -0.124578 -0.022962 -0.421187
c     -0.200827 -0.248677 -0.660127 -0.280216 -0.744404
d     -1.929412  0.539893  0.186463 -1.293188 -0.303421
print groups.max()
           col1      col2      col3      col4      col5
class                                                  
a     -0.463654 -0.589821  1.304165  0.485559  0.904629
b      0.812942  0.275955  0.752262  0.320789  0.082105
c      0.321042  0.569946 -0.024101 -0.116255  0.159482
d     -1.929412  0.539893  0.186463 -1.293188 -0.303421
print groups.size()
class
a    3
b    3
c    2
d    1
dtype: int64
# 分组可以在多个列上进行,对df_a再进行列扩充
df_a['class2'] = ['A','A','A','A','B','B','B','B','C']
print df_a.groupby(['class2','class']).size()
class2  class
A       a        3
        b        1
B       b        2
        c        2
C       d        1
dtype: int64
相关标签: groupby