import pandas as pd
from sklearn import datasets

# Iris: numeric feature matrix wrapped in a DataFrame with named columns.
iris = datasets.load_iris()
data = iris.data
target = iris.target
feature_name = iris.feature_names
target_names = iris.target_names
data1 = pd.DataFrame(data, columns=feature_name)
print(data1.head(2))

# Titanic-style CSV with categorical columns; keep only complete rows.
data2 = pd.read_csv('Narrativedata.csv', index_col=[0])
data2.dropna(inplace=True)
print(data2.head(2))
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
    Age     Sex Embarked Survived
0  22.0    male        S       No
1  38.0  female        C      Yes
归一化 (Normalization / min-max scaling)
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: squeeze every feature into [0, 1].
scaler = MinMaxScaler()
result = scaler.fit(data1).transform(data1)
result[:5]
array([[0.22222222, 0.625 , 0.06779661, 0.04166667],
[0.16666667, 0.41666667, 0.06779661, 0.04166667],
[0.11111111, 0.5 , 0.05084746, 0.04166667],
[0.08333333, 0.45833333, 0.08474576, 0.04166667],
[0.19444444, 0.66666667, 0.06779661, 0.04166667]])
# Equivalent shortcut on the raw numpy array: fitting then transforming in
# one chained expression gives the same result as fit_transform.
result_ = scaler.fit(data).transform(data)
result_[:5]
array([[0.22222222, 0.625 , 0.06779661, 0.04166667],
[0.16666667, 0.41666667, 0.06779661, 0.04166667],
[0.11111111, 0.5 , 0.05084746, 0.04166667],
[0.08333333, 0.45833333, 0.08474576, 0.04166667],
[0.19444444, 0.66666667, 0.06779661, 0.04166667]])
# Map the scaled values back to the original measurement units.
scaler.inverse_transform(result)[:5]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
# A custom output interval: scale each feature into [5, 10] instead of [0, 1].
scaler = MinMaxScaler(feature_range=[5,10])
result = scaler.fit(data1).transform(data1)
result[:5]
array([[6.11111111, 8.125 , 5.33898305, 5.20833333],
[5.83333333, 7.08333333, 5.33898305, 5.20833333],
[5.55555556, 7.5 , 5.25423729, 5.20833333],
[5.41666667, 7.29166667, 5.42372881, 5.20833333],
[5.97222222, 8.33333333, 5.33898305, 5.20833333]])
# partial_fit updates the scaler's statistics incrementally (useful for data
# that arrives in chunks) and returns the scaler itself, so the reassignment
# in the original was optional.
scaler.partial_fit(data1)
scaler.transform(data1)[:5]
array([[6.11111111, 8.125 , 5.33898305, 5.20833333],
[5.83333333, 7.08333333, 5.33898305, 5.20833333],
[5.55555556, 7.5 , 5.25423729, 5.20833333],
[5.41666667, 7.29166667, 5.42372881, 5.20833333],
[5.97222222, 8.33333333, 5.33898305, 5.20833333]])
标准化 (Standardization / z-score scaling)
from sklearn.preprocessing import StandardScaler

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler().fit(data1)
x_std = scaler.transform(data1)
scaler.mean_  # per-feature means learned from data1
array([5.84333333, 3.05733333, 3.758 , 1.19933333])
# Per-feature variances learned during fit.
scaler.var_
array([0.68112222, 0.18871289, 3.09550267, 0.57713289])
# After standardization the pooled mean is ~0 and the std is 1.
print('mean: ',x_std.mean())
print('std: ',x_std.std())
mean: -1.4684549872375404e-15
std: 1.0
# fit_transform: fit and transform in a single call.
scaler.fit_transform(data1)[:5]
array([[-0.90068117, 1.01900435, -1.34022653, -1.3154443 ],
[-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
[-1.38535265, 0.32841405, -1.39706395, -1.3154443 ],
[-1.50652052, 0.09821729, -1.2833891 , -1.3154443 ],
[-1.02184904, 1.24920112, -1.34022653, -1.3154443 ]])
# Recover the original feature values from the standardized ones.
scaler.inverse_transform(x_std)[:5]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
标准化和归一化的选择 (Choosing between standardization and normalization)
sklearn中的其他库 (Other preprocessing utilities in sklearn)
特征编码 (Feature encoding)
离散型数据-针对标签label (Categorical data — encoding the label)
from sklearn.preprocessing import LabelEncoder

# Encode the target column's string labels as integers 0..n_classes-1.
y = data2.iloc[:,-1]
le = LabelEncoder().fit(y)
label = le.transform(y)
label[:3]
array([0, 2, 2])
# fit_transform combines the fit and transform steps above.
label = le.fit_transform(y)
label[:3]
array([0, 2, 2])
# The distinct labels seen during fit, in encoded order (0, 1, 2).
le.classes_
array(['No', 'Unknown', 'Yes'], dtype=object)
# Decode the integer codes back to the original string labels.
le.inverse_transform(label)[:3]
array(['No', 'Yes', 'Yes'], dtype=object)
# Overwrite the last column (the target) with its encoded values.
data2.iloc[:,-1] = label
data2.head(3)
    Age     Sex Embarked  Survived
0  22.0    male        S         0
1  38.0  female        C         2
2  26.0  female        S         2
离散型数据-针对特征 (Categorical data — encoding the features)
from sklearn.preprocessing import OrdinalEncoder
# Work on a copy so the original data2 stays untouched.
data_ = data2.copy()
data_.head(3)
    Age     Sex Embarked  Survived
0  22.0    male        S         0
1  38.0  female        C         2
2  26.0  female        S         2
# Fit the ordinal encoder on the middle (categorical feature) columns.
model = OrdinalEncoder().fit(data_.iloc[:,1:-1])
# Categories discovered per column during fit.
model.categories_
[array([0., 1.]), array([0., 1., 2.])]
# The encoder was already fitted on these exact columns above, so a plain
# transform() suffices — the original's fit_transform() refit it redundantly.
data_.iloc[:, 1:-1] = model.transform(data_.iloc[:, 1:-1])
data_.head(3)
    Age  Sex  Embarked  Survived
0  22.0  1.0       2.0         0
1  38.0  0.0       0.0         2
2  26.0  0.0       2.0         2
独热编码,创建哑变量 (One-hot encoding: creating dummy variables)
# Inspect data2 before one-hot encoding its categorical columns.
data2.head(3)
    Age     Sex Embarked  Survived
0  22.0    male        S         0
1  38.0  female        C         2
2  26.0  female        S         2
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the middle categorical columns (Sex, Embarked) into 0/1
# dummy columns.
X = data2.iloc[:,1:-1]
model = OneHotEncoder(categories='auto').fit(X)
result = model.transform(X).toarray()  # toarray(): sparse matrix -> dense ndarray
result[:3]
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
# One-step equivalent: fit and transform in a single call.
OneHotEncoder(categories='auto').fit_transform(X).toarray()[:3]
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
# Bug fix: the fitted encoder is named `model`, not the undefined `enc`.
# Map the dummy columns back to the original string categories.
pd.DataFrame(model.inverse_transform(result))
# Names of the generated dummy columns; NOTE(review): get_feature_names()
# was removed in sklearn >= 1.2 — use get_feature_names_out() there.
model.get_feature_names()
result[:3]
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
# 712 rows x 5 dummy columns (2 for Sex + 3 for Embarked).
result.shape
(712, 5)
# pd.concat aligns on the index. data2's index has gaps left by dropna(),
# while pd.DataFrame(result) carries a fresh 0..711 RangeIndex, so the
# original call produced misaligned NaN rows; reset data2's index first.
newdata = pd.concat([data2.reset_index(drop=True), pd.DataFrame(result)], axis=1)
newdata.head(3)
    Age     Sex Embarked Survived    0    1    2    3    4
0  22.0    male        S       No  0.0  1.0  0.0  0.0  1.0
1  38.0  female        C      Yes  1.0  0.0  1.0  0.0  0.0
2  26.0  female        S      Yes  1.0  0.0  0.0  0.0  1.0
# Drop the raw string columns, then give the dummy columns readable names.
newdata = newdata.drop(columns=["Sex","Embarked"])
newdata.columns = ["Age","Survived","Female","Male","Embarked_C","Embarked_Q","Embarked_S"]
newdata.head(3)
    Age Survived  Female  Male  Embarked_C  Embarked_Q  Embarked_S
0  22.0       No     0.0   1.0         0.0         0.0         1.0
1  38.0      Yes     1.0   0.0         1.0         0.0         0.0
2  26.0      Yes     1.0   0.0         0.0         0.0         1.0
连续型数据-二分 (Continuous data — binarization)
from sklearn.preprocessing import Binarizer

# Threshold the first (Age) column: values above 30 become 1, the rest 0.
data_2 = data2.copy()
X = data_2.iloc[:,0].values.reshape(-1,1)  # Binarizer expects a 2-D array
transformer = Binarizer(threshold=30).fit_transform(X)
transformer[:3]
array([[0.],
[1.],
[0.]])
连续型数据-分箱 (Continuous data — binning / discretization)
from sklearn.preprocessing import KBinsDiscretizer

# Cut the first (Age) column into 3 equal-width bins, encoded as one
# ordinal column.
X = data2.iloc[:,0].values.reshape(-1,1)
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
# Fit once and reuse the result — the original called fit_transform twice,
# discarding the first result and refitting for the second.
binned = est.fit_transform(X)
binned
set(binned.ravel())  # the distinct bin codes produced
# Same binning, but one-hot encoded: one dummy column per bin (sparse -> dense).
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
est.fit_transform(X).toarray()[:3]
array([[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.]])