数据分析-数据处理-pandas处理特征值重复的数据
程序员文章站
2022-04-15 14:10:46
...
import pandas as pd
# Load the sample data set (comma-separated, GBK-encoded).
df = pd.read_csv('H:/JupyterWork/数据分析/数据分析概述/datafile/data4.csv', sep=',', encoding='gbk')
# Forward-fill missing values. DataFrame.ffill() is the supported spelling;
# fillna(method='ffill') is deprecated in recent pandas releases.
df = df.ffill()
df.loc[0, 'k3'] = 11
# Insert a copy of column 'k4' at position 4 under the name 'k5'.
# DataFrame.insert() mutates df in place and returns None, so its result
# is intentionally not bound to a name.
df.insert(4, 'k5', df['k4'])
# Pearson correlation between sales volume 'k3' and selling price 'k4'.
corr_info = df[['k3', 'k4']].corr(method='pearson')
# Build the matrix that tells which feature columns are exactly identical.
def FeatureEqual(df):
    """Return a square frame marking which columns of *df* are identical.

    Args:
        df: The DataFrame whose columns are compared pairwise.

    Returns:
        An object-dtype DataFrame whose index and columns are both
        ``df.columns``. Cell ``[i, j]`` is ``True`` when column ``i`` and
        column ``j`` compare equal under ``Series.equals`` and ``False``
        otherwise.
    """
    labels = df.columns
    width = len(labels)
    # The diagonal starts out True: every column is compared with itself.
    same = [[True] * width for _ in range(width)]
    for a in range(width):
        # Positional access keeps each operand a single Series even when
        # column labels repeat.
        left = df.iloc[:, a]
        for b in range(a + 1, width):
            # The comparison is symmetric, so one call fills both cells.
            same[a][b] = same[b][a] = left.equals(df.iloc[:, b])
    return pd.DataFrame(same, index=labels, columns=labels, dtype=object)
# Pairwise column-equality matrix for the prepared frame.
detEquals = FeatureEqual(df)
# Walk the strict upper triangle so every pair of columns is checked once.
len_num = detEquals.shape[0]  # number of columns being compared
dupCol = []  # labels of columns that duplicate an earlier column
for k in range(len_num):
    for j in range(k + 1, len_num):
        # Record the later column of the matching pair. The original code
        # indexed columns[1] (digit one) instead of the loop variable, so it
        # always flagged the second column regardless of which pair matched.
        later = detEquals.columns[j]
        if detEquals.iloc[k, j] and later not in dupCol:
            dupCol.append(later)
# Remove the duplicated columns in place, keeping the first of each group.
df.drop(dupCol, axis=1, inplace=True)
上一篇: python中如何比较两个时间点
下一篇: python中dict是什么