决策树之实现
程序员文章站
2022-05-21 23:29:25
...
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Dataset source (per the original note):
# http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt
# It is read here from a local CSV file named 'aaa.txt'
# (presumably a saved copy of that dataset -- confirm).
data = pd.read_csv('aaa.txt')

# Features: the 'pclass', 'age' and 'sex' columns; target: 'survived'.
# .copy() yields an independent frame, so the fill below modifies `x` itself
# rather than a temporary object derived from `data`.
x = data[['pclass', 'age', 'sex']].copy()
y = data['survived']

# Data cleaning: replace missing ages with the mean age. The result is
# assigned back instead of calling fillna(inplace=True) on x['age']; that
# chained in-place form triggers pandas warnings and has no effect on `x`
# when copy-on-write is active.
x['age'] = x['age'].fillna(x['age'].mean())
#one-hot编码: —— 替代类别号
y1= x.to_dict(orient='records') ##转换为字典类型
# print(y1)
#转化为数组类型
dict1 = DictVectorizer(sparse=False)
# print(dict1)
x = dict1.fit_transform(y1)
# print(x)
print(dict1.get_feature_names())
# Train/test split: 70% of the rows go to the training set.
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

# Decision tree that splits on entropy and is limited to depth 5.
dc = DecisionTreeClassifier(criterion='entropy', max_depth=5)
# fit() returns the estimator itself; the return value is intentionally not
# bound to `y`, so the label Series is not overwritten.
dc.fit(x_train, y_train)

# Compare the predictions for the held-out rows with the true labels.
y_pred = dc.predict(x_test)
print('测试结果:\n', y_pred)
print('真实结果:\n', y_test)

# Mean accuracy on the test set.
print(dc.score(x_test, y_test))
# Write the fitted tree to 'tree.dot' in Graphviz format. The column labels are
# taken from the fitted DictVectorizer so that they always match, in number
# and order, the features the tree was trained on; a hand-written list of a
# different length makes export_graphviz raise ValueError.
export_graphviz(dc, out_file='tree.dot', feature_names=dict1.feature_names_)
推荐阅读