pandas 数据归整化 清理,转换,合并,重塑
程序员文章站
2024-02-11 21:29:40
...
from __future__ import division
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from scipy.interpolate import lagrange #导入拉格朗日插值函数
from pandas import Series,DataFrame
inputfile_path='E:/python/wangyiPython/the sixth week/data/catering_sale.xls'
outputfile_path='E:/python/wangyiPython/the sixth week/data/done_sale'
data=pd.read_excel(inputfile_path)
# Mark sales outside the plausible range [400, 5000] as missing so they can be
# interpolated later.  Fix: use .loc instead of chained indexing
# (data['销量'][mask] = None), which may silently assign into a copy.
data.loc[(data['销量']<400)|(data['销量']>5000),'销量']=None
data
移除重复数据
data=DataFrame({'k1':['one']*3+['two']*4,
'k2':[1,1,2,3,3,4,4]})
data.duplicated() #返回true或false,重复则为true
data.drop_duplicates()#移除重复值
data['v1']=range(7)
data.drop_duplicates(['k1'])#移除K1列出现重复值的记录
data.drop_duplicates(['k1','k2'],take_last=True)#保留重复值的最后一个,take_lasst=true
自定义列向量插值函数
s 为列向量,n为被插值的位置,k为取前后的数据个数,默认为5
def ployinterp_column(s, n, k=5):
    """Fill the missing value at position ``n`` of column ``s`` by Lagrange
    interpolation over up to ``k`` neighbours on each side.

    s : pandas Series indexed 0..len(s)-1 (assumes a default RangeIndex)
    n : position (label) of the value being interpolated
    k : neighbours taken before and after n, default 5
    Returns the interpolated scalar value.
    """
    # Fix: clamp the neighbour window to the valid index range; the original
    # range(n-k, n) produced negative / out-of-range labels near the ends of
    # the column.
    lo = max(n - k, 0)
    hi = min(n + 1 + k, len(s))
    y = s[list(range(lo, n)) + list(range(n + 1, hi))]  # surrounding values
    y = y[y.notnull()]  # drop neighbours that are themselves missing
    # Build the Lagrange polynomial through (index, value) pairs, evaluate at n.
    return lagrange(y.index, list(y))(n)
# Scan every cell; wherever a value is missing, replace it with the Lagrange
# interpolation of its neighbours, then save the completed table.
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:  # only interpolate missing cells
            # Fix: .loc avoids chained assignment (data[i][j] = ...), which
            # may write to a temporary copy.  Assumes a default RangeIndex so
            # label j equals position j — consistent with the isnull()[j] test.
            data.loc[j, i] = ployinterp_column(data[i], j)
data.to_excel('E:/python/wangyiPython/the sixth week/data/out_lagrange_file.xls')
利用函数或映射进行数据转换
# Add a derived column by translating each food name through a lookup table.
data = DataFrame(
    {'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',
              'corned beef', 'Bacon', 'pastrami', 'honey ham',
              'nova lox'],
     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
# Lookup table from lower-cased food name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
    'pastrami': 'cow',
    'pulled pork': 'pig',
}
# Normalise case first, then translate through the table.
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
# Equivalent single-pass formulation:
data['food'].map(lambda food: meat_to_animal[food.lower()])
数据标准化
# Demonstrate three common normalisation schemes on the same table.
datafile = 'E:/python/wangyiPython/the sixth week/data/normalization_data.xls'
data = pd.read_excel(datafile, header=None)
data
(data - data.min()) / (data.max() - data.min())    # min-max scaling into [0, 1]
data / 10 ** np.ceil(np.log10(data.abs().max()))   # decimal scaling
(data - data.mean()) / data.std()                  # z-score standardisation
替换值
# Sentinel values -999 / -1000 stand in for missing data; show three
# equivalent ways to replace them (none of these calls mutates `data`).
data = Series([1., -999., 2., -999., -1000., 3.])
data.replace(-999, np.nan)                # single sentinel -> NaN
data.replace([-999, -1000], [np.nan, 0])  # parallel lists: -999 -> NaN, -1000 -> 0
data.replace({-999: np.nan, -1000: 0})    # mapping form of the same thing
重命名轴索引
# Rename axis labels by function and by explicit mapping.
data=DataFrame(np.arange(12).reshape((3,4)),
               index=['Ohio','Colorado','New York'],
               columns=['one','two','three','four'])
data.index=data.index.map(str.upper)            # labels become OHIO, COLORADO, NEW YORK
data.rename(index=str.title,columns=str.upper)  # returns a renamed copy
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
# Fix: the mapping key must be 'OHIO' (the upper-cased label), not the typo
# 'OHTO', which silently matched nothing.  With inplace=True the frame is
# modified in place (the call returns None in modern pandas).
_=data.rename(index={'OHIO':'INDIANA'},inplace=True)
离散化与面元划分
#cut将根据值本身来选择箱子均匀间隔,qcut是根据这些值的频率来选择箱子的均匀间隔
in:ages=[20,22,25,27,21,23,37,31,61,45,41,32]
in:bins=[18,25,35,60,100]
in:cats=pd.cut(ages,bins)#bins 分组依据,pd.cut()默认左开右闭
in:cats
out:[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
in:cats.codes
out:array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
in:cats.categories
out:IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
in:pd.value_counts(cats)
out:
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
in:pd.cut(ages,[18,26,36,61,100],right=False)
out:[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
in:group_names=['Youth','YoungAdult', 'MiddleAged', 'Senior']
in:pd.cut(ages,bins,labels=group_names) #将分组类别命名为group_names
out:[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
in:data=np.random.rand(20) #随机产生20个位于0~1的数字
in:pd.cut(data,4,precision=2) #将data分为4个类别,类别保留两位小数
out:[(0.05, 0.29], (0.29, 0.52], (0.05, 0.29], (0.76, 0.99], (0.76, 0.99], ..., (0.76, 0.99], (0.52, 0.76], (0.29, 0.52], (0.76, 0.99], (0.76, 0.99]]
Length: 20
Categories (4, interval[float64]): [(0.05, 0.29] < (0.29, 0.52] < (0.52, 0.76] < (0.76, 0.99]]
in:data=np.random.randn(1000)# Normally distributed
in:cats=pd.qcut(data,4) # Cut into quartiles
in:cats
out:[(0.631, 3.38], (0.631, 3.38], (-3.0309999999999997, -0.642], (-3.0309999999999997, -0.642], (-0.02, 0.631], ..., (0.631, 3.38], (0.631, 3.38], (0.631, 3.38], (-0.642, -0.02], (-3.0309999999999997, -0.642]]
Length: 1000
Categories (4, interval[float64]): [(-3.0309999999999997, -0.642] < (-0.642, -0.02] < (-0.02, 0.631] < (0.631, 3.38]]
检测和过滤异常值
in:np.random.seed(12345)
in:data=DataFrame(np.random.randn(1000,4))
in:data.describe()
out: 0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
in:col=data[3]
in:col[np.abs(col)>3]
out:
97 3.927528
305 -3.399312
400 -3.745356
Name: 3, dtype: float64
in:data[(np.abs(data)>3).any(1)]#any(1)沿axis=1聚合,即逐行检测该行是否含有绝对值大于3的值
out: 0 1 2 3
5 -0.539741 0.476985 3.248944 -1.021228
97 -0.774363 0.552936 0.106061 3.927528
102 -0.655054 -0.565230 3.176873 0.959533
305 -2.315555 0.457246 -0.025907 -3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391 -0.196713 -3.745356
499 -0.293333 -0.242459 -3.056990 1.918403
523 -3.428254 -0.296336 -0.439938 -0.867165
586 0.275144 1.179227 -3.184377 1.369891
808 -0.362528 -3.548824 1.553205 -2.186301
900 3.366626 -2.372214 0.851010 1.332846
#将绝对值大于3的值全部用3或-3代替
in:data[np.abs(data)>3]=np.sign(data)*3 #np.sign(a)返回数组中各元素的正负符号,用1和-1表示
in:data.describe()
out: 0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067623 0.068473 0.025153 -0.002081
std 0.995485 0.990253 1.003977 0.989736
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.000000 2.653656 3.000000 3.000000
排列与随机采样
# Permutation-based reordering and random sampling.
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)  # a random ordering of the row positions 0..4
df.take(sampler)                    # rows of df rearranged into that order
df.take(np.random.permutation(len(df))[:3])  # 3 rows sampled without replacement
# Sampling WITH replacement: draw 10 random positions into `bag`.
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler
draws = bag.take(sampler)
draws
计算指标与哑变量
#Using the get_dummies will create a new column for every unique string in a certain column:使用get_dummies进行one-hot编码
# One-hot encode the 'key' column: one indicator column per unique value.
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
pd.get_dummies(df['key'])
# Prefix the indicator columns, then attach them next to the numeric data.
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
对于DataFrame中某一列已包含多个类别的处理
#以处理movies类型为例
# Expand the pipe-separated 'genres' column into one indicator column per genre.
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('E:/python/wangyiPython/the sixth week/data/movies.dat',
                       sep='::', header=None, names=mnames)
movies[:10]  # first 10 rows
# Collect every genre occurring anywhere in the table.
all_genres = set()
for genre_string in movies.genres:
    all_genres.update(genre_string.split('|'))
genres = sorted(all_genres)
# Start from an all-zero indicator matrix, then flag each movie's genres.
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for row, genre_string in enumerate(movies.genres):
    dummies.loc[row, genre_string.split('|')] = 1
# Join the (prefixed) indicator columns back onto the movie table.
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.loc[0]
pd.cut()与get_dummies()的联合使用
# Combine pd.cut with get_dummies: bucket 10 random values into 5 intervals
# and one-hot encode the resulting categories.
np.random.seed(12345)
values = np.random.rand(10)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
# Fix: the original passed the builtin `bin` function instead of the `bins`
# list defined above, which raises a TypeError.
bin_dummies = pd.get_dummies(pd.cut(values, bins))
bin_dummies
线损率属性构造
#参数初始化
inputfile= 'E:/python/wangyiPython/the sixth week/data/electricity_data.xls' #供入供出电量数据
outputfile = 'E:/python/wangyiPython/the sixth week/data/electricity_data.xls' #属性构造后数据文件
data = pd.read_excel(inputfile) #读入数据
data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量'])/data[u'供入电量']
data.to_excel(outputfile, index = False) #保存结果