欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

pandas 数据归整化 清理,转换,合并,重塑

程序员文章站 2024-02-11 21:29:40
...
from __future__ import division
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from scipy.interpolate import lagrange #导入拉格朗日插值函数
from pandas import Series,DataFrame

inputfile_path='E:/python/wangyiPython/the sixth week/data/catering_sale.xls'
outputfile_path='E:/python/wangyiPython/the sixth week/data/done_sale'

data=pd.read_excel(inputfile_path)
data['销量'][(data['销量']<400)|(data['销量']>5000)]= None #过滤异常值,将其变为空
data

移除重复数据
data=DataFrame({'k1':['one']*3+['two']*4,
               'k2':[1,1,2,3,3,4,4]})
data.duplicated() #返回true或false,重复则为true
data.drop_duplicates()#移除重复值
data['v1']=range(7)
data.drop_duplicates(['k1'])#移除K1列出现重复值的记录
data.drop_duplicates(['k1','k2'],take_last=True)#保留重复值的最后一个,take_lasst=true

自定义列向量插值函数 

自定义列向量插值函数
s 为列向量,n为被插值的位置,k为取取前后的数据个数,默认为5

def ployinterp_column(s,n,k=5):
    y=s[list(range(n-k,n))+list(range(n+1,n+1+k))]#取数
    y=y[y.notnull()]#剔除空值
    return lagrange(y.index,list(y))(n)#插值并返回插值结果

for i in data.columns:
    for j in range(len(data)):
        if(data[i].isnull())[j]:#如果为空则插值
            data[i][j]=ployinterp_column(data[i],j)
data.to_excel('E:/python/wangyiPython/the sixth week/data/out_lagrange_file.xls')

 利用函数或映射进行数据转换

data=DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'],
                  'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
meat_to_animal={ 
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
data['animal']=data['food'].map(str.lower).map(meat_to_animal)
data

#相当于

data['food'].map(lambda x:meat_to_animal[x.lower()])

 数据标准化

datafile='E:/python/wangyiPython/the sixth week/data/normalization_data.xls'
data=pd.read_excel(datafile,header=None)
data

(data-data.min())/(data.max()-data.min())#最小-最大规范化
data/10**np.ceil(np.log10(data.abs().max()))# #小数定标规范化
(data-data.mean())/data.std() #零-均值规范化

替换值 

data=Series([1.,-999., 2., -999., -1000., 3.])
data.replace(-999,np.nan) #将-999替换为nan

#将-999替换为nan,-1000替换为0
data.replace([-999, -1000], [np.nan,0]) 
data.replace({-999:np.nan,-1000:0})

重命名轴索引

data=DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])

data.index=data.index.map(str.upper)

data.rename(index=str.title,columns=str.upper)

data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

_=data.rename(index={'OHTO':'INDIANA'},inplace=True)#总是返回DataFrame的引用

离散化与面元划分 



#cut将根据值本身来选择箱子均匀间隔,qcut是根据这些值的频率来选择箱子的均匀间隔

in:ages=[20,22,25,27,21,23,37,31,61,45,41,32]
in:bins=[18,25,35,60,100]
in:cats=pd.cut(ages,bins)#bins 分组依据,pd.cut()默认左开右闭
in:cats

out:[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

in:cats.codes

out:array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

in:cats.categories

out:IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

in:pd.value_counts(cats)
out:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

in:pd.cut(ages,[18,26,36,61,100],right=False)
out:[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

in:group_names=['Youth','YoungAdult', 'MiddleAged', 'Senior']
in:pd.cut(ages,bins,labels=group_names)  #将分组类别命名为group_names
out:[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

in:data=np.random.rand(20) #随机产生20个位于0~1的数字
in:pd.cut(data,4,precision=2) #将data分为4个类别,类别保留两位小数
out:[(0.05, 0.29], (0.29, 0.52], (0.05, 0.29], (0.76, 0.99], (0.76, 0.99], ..., (0.76, 0.99], (0.52, 0.76], (0.29, 0.52], (0.76, 0.99], (0.76, 0.99]]
Length: 20
Categories (4, interval[float64]): [(0.05, 0.29] < (0.29, 0.52] < (0.52, 0.76] < (0.76, 0.99]]

in:data=np.random.randn(1000)# Normally distributed
in:cats=pd.qcut(data,4)  # Cut into quartiles
in:cats
out:[(0.631, 3.38], (0.631, 3.38], (-3.0309999999999997, -0.642], (-3.0309999999999997, -0.642], (-0.02, 0.631], ..., (0.631, 3.38], (0.631, 3.38], (0.631, 3.38], (-0.642, -0.02], (-3.0309999999999997, -0.642]]
Length: 1000
Categories (4, interval[float64]): [(-3.0309999999999997, -0.642] < (-0.642, -0.02] < (-0.02, 0.631] < (0.631, 3.38]]

检测和过滤异常值

in:np.random.seed(12345)
in:data=DataFrame(np.random.randn(1000,4))
in:data.describe()

out:         0	         1	               2 	    3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067684	0.067924	0.025598	-0.002298
std	0.998035	0.992106	1.006835	0.996794
min	-3.428254	-3.548824	-3.184377	-3.745356
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.366626	2.653656	3.260383	3.927528

in:col=data[3]
in:col[np.abs(col)>3]
out:
97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

in:data[(np.abs(data)>3).any(1)]#any(1)每一列
out:	   0	  1	             2	           3
5	-0.539741	0.476985	3.248944	-1.021228
97	-0.774363	0.552936	0.106061	3.927528
102	-0.655054	-0.565230	3.176873	0.959533
305	-2.315555	0.457246	-0.025907	-3.399312
324	0.050188	1.951312	3.260383	0.963301
400	0.146326	0.508391	-0.196713	-3.745356
499	-0.293333	-0.242459	-3.056990	1.918403
523	-3.428254	-0.296336	-0.439938	-0.867165
586	0.275144	1.179227	-3.184377	1.369891
808	-0.362528	-3.548824	1.553205	-2.186301
900	3.366626	-2.372214	0.851010	1.332846

#将绝对值大于3的值全部用3或-3代替
in:data[np.abs(data)>3]=np.sign(data)*3  #数组元素的符号:np.sign(a),返回数组中各元素的正负in:符号,用1和-1表示
in:data.describe()
out:       0	          1           	2	       3
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	-0.067623	0.068473	0.025153	-0.002081
std	0.995485	0.990253	1.003977	0.989736
min	-3.000000	-3.000000	-3.000000	-3.000000
25%	-0.774890	-0.591841	-0.641675	-0.644144
50%	-0.116401	0.101143	0.002073	-0.013611
75%	0.616366	0.780282	0.680391	0.654328
max	3.000000	2.653656	3.000000	3.000000

排列与随机采样

df=DataFrame(np.arange(5*4).reshape((5,4)))
sampler=np.random.permutation(5) #随机排列一个序列,返回一个排列的序列。
df.take(sampler)#按sampler的序列重新排列df

df.take(np.random.permutation(len(df))[:3])

#重新排列bag,每个数字排列两次
bag=np.array([5,7,-1,6,4])
sampler=np.random.randint(0,len(bag),size=10)
sampler
draws=bag.take(sampler)
draws

计算指标与哑变量

pandas 数据归整化 清理,转换,合并,重塑

#Using the get_dummies will create a new column for every unique string in a certain column:使用get_dummies进行one-hot编码
df=DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
pd.get_dummies(df['key'])

dummies=pd.get_dummies(df['key'],prefix='key')
df_with_dummy=df[['data1']].join(dummies)#显示具体数据

 对于DataFrame中某一列已包含多个类别的处理

#以处理movies类型为例

mnames=['movie_id','title','genres']
movies=pd.read_table('E:/python/wangyiPython/the sixth week/data/movies.dat',sep='::',header=None,names=mnames)
movies[:10]

输出数据前10行 

pandas 数据归整化 清理,转换,合并,重塑

#首先提取所有genres
genre_iter=(set(x.split('|')) for x in movies.genres)
genres=sorted(set.union(*genre_iter))
dummies=DataFrame(np.zeros((len(movies),len(genres))),columns=genres)

#接下来,迭代每一部电影并将dummies各行的项设置为1
for i,gen in enumerate(movies.genres):
    dummies.loc[i,gen.split('|')]=1

#然后与电影连接起来
movies_windic=movies.join(dummies.add_prefix('Genre_'))
movies_windic.loc[0]

 pandas 数据归整化 清理,转换,合并,重塑

pd.cut()与get_dummies()的联合使用

np.random.seed(12345)
values=np.random.rand(10)
bins=[0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values,bin))

线损率属性构造

#参数初始化
inputfile= 'E:/python/wangyiPython/the sixth week/data/electricity_data.xls' #供入供出电量数据
outputfile = 'E:/python/wangyiPython/the sixth week/data/electricity_data.xls' #属性构造后数据文件

data = pd.read_excel(inputfile) #读入数据
data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量'])/data[u'供入电量']

data.to_excel(outputfile, index = False) #保存结果