欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识!
您现在的位置是: 首页

离散特征值处理案例

程序员文章站 2024-03-08 09:32:10
...

离散特征值处理案例

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, Binarizer

# 1. Load the score sheet and preview its first rows.
data = pd.read_excel('score.xlsx')
print(data.head())

# LabelEncoder: numbers the distinct values automatically, ordered
# alphabetically for strings or by magnitude for numbers.
le = LabelEncoder()
print(data['GENDER'])
le.fit(data['GENDER'])
gender = le.transform(data['GENDER'])
data['GENDER_ID'] = gender
# The original GENDER column is deliberately kept (no `del data['GENDER']`).
print(data)

# OneHotEncoder: one-hot encoding -- exactly one position per row is 1,
# all others are 0.
# The encoder keeps its default sparse output and the result is densified
# with .toarray(). The former `sparse=False` keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed, so passing it
# fails on current releases; this form works on old and new versions.
ohe = OneHotEncoder()
print(data['GENDER_ID'])
# OneHotEncoder requires 2-D input, hence the double brackets [[ ]].
gender = ohe.fit_transform(data[['GENDER_ID']]).toarray()
# One column name per encoded category (ohe1, ohe2, ...) rather than
# assuming exactly two categories; reuse data's index so that concat
# lines the new columns up row for row.
df_gender = pd.DataFrame(
    gender,
    columns=['ohe{}'.format(i + 1) for i in range(gender.shape[1])],
    index=data.index,
)
data = pd.concat([data, df_gender], axis=1)  # concat joins the two DataFrames column-wise
print(data)

# Alternatively, pd.get_dummies turns the string column straight into
# one-hot columns -- the equivalent of LabelEncoder + OneHotEncoder.
# NOTE(review): the original note says OneHotEncoder cannot take strings
# directly; that may no longer hold for the installed scikit-learn -- verify.
a = pd.get_dummies(data['GENDER'])
print(a)
with_dummies = pd.concat((data, a), axis='columns')
print(with_dummies)

# Binarizer: split values into 0 and 1 around a threshold.
# Draw one random integer score in [1, 100] per row. Sizing the draw with
# len(data) instead of a hard-coded 6 keeps the assignment valid for a
# sheet with any number of rows.
data['SCORE'] = np.random.randint(1, 101, len(data))
print(data)

# Named `binarizer` rather than `bin` so the builtin bin() is not shadowed.
binarizer = Binarizer(threshold=60)
# Input must be 2-D, hence the double brackets; scores above the threshold
# map to 1 and the rest to 0.
pass_unpass = binarizer.fit_transform(data[['SCORE']])
data['pass/unpass'] = pass_unpass
print(data)

# 2. The same encoders applied to a small in-memory DataFrame.
data = pd.DataFrame({'name': ['joe', 'john', 'bob', 'kevin'],
                     'age': [10, 20, 30, 10],
                     'sex': ['male', 'male', 'female', 'male']})
data['age'] = LabelEncoder().fit_transform(data['age'])
print(data)

# LabelEncoder numbers the distinct values automatically, ordered
# alphabetically for strings or by magnitude for numbers.
data['name_LE'] = LabelEncoder().fit_transform(data['name'])
data['sex_LE'] = LabelEncoder().fit_transform(data['sex'])
print(data)
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
data.info()

# LabelBinarizer: converts the labels into 0/1 indicators.
data['sex_LB'] = LabelBinarizer().fit_transform(data['sex'])
print(data)
data.info()

相关标签: 机器学习