python决策树DecisionTreeClassifier模型
程序员文章站
2024-02-16 12:59:58
...
运行环境:Windows 10 64位,Python 3.6,PyCharm 2018.1.1
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the data
def load_data():
    """Load the iris dataset and split it into training and test parts.

    The split holds out 25% of the samples for testing, is stratified on the
    class labels, and uses ``random_state=0`` so repeated calls use the same
    seed.

    :return: the value of ``train_test_split``; callers in this file unpack
        it as ``X_train, X_test, y_train, y_test``.
    """
    # ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and
    # removed in 0.20; ``sklearn.model_selection`` provides the same
    # ``train_test_split`` function. Imported locally so this function does
    # not depend on the legacy module-level import.
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    # These are the full feature matrix and label vector (not yet split).
    features = iris.data
    labels = iris.target
    return train_test_split(
        features,
        labels,
        test_size=0.25,
        random_state=0,
        stratify=labels,
    )
# Classify with a decision tree
def test_DecisionTreeClassifier(*data):
    """Fit a default ``DecisionTreeClassifier`` and print its accuracy.

    :param data: four positional values, unpacked as
        ``X_train, X_test, y_train, y_test``.
    :return: None; the training and testing scores are printed.
    """
    X_train, X_test, y_train, y_test = data

    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)

    train_accuracy = tree.score(X_train, y_train)
    print("Traing score:%f" % train_accuracy)
    test_accuracy = tree.score(X_test, y_test)
    print("Testing score:%f" % test_accuracy)
# Script driver: build the train/test split with load_data() and run the
# default-parameter decision tree experiment, which prints both scores.
X_train, X_test, y_train, y_test = load_data()
test_DecisionTreeClassifier(X_train, X_test, y_train, y_test)
# Examine how the split-quality criterion affects classification performance
def test_DecisionTreeClassifier_criterion(*data):
    """Compare the ``gini`` and ``entropy`` split criteria.

    For each criterion a new tree is fitted on the training data, and the
    criterion name, training score and testing score are printed.

    :param data: four positional values, unpacked as
        ``X_train, X_test, y_train, y_test``.
    :return: None.
    """
    X_train, X_test, y_train, y_test = data

    for criterion in ('gini', 'entropy'):
        tree = DecisionTreeClassifier(criterion=criterion)
        tree.fit(X_train, y_train)

        print('criterion:%s' % criterion)
        train_accuracy = tree.score(X_train, y_train)
        print("Traing score:%f" % train_accuracy)
        test_accuracy = tree.score(X_test, y_test)
        print("Testing score:%f" % test_accuracy)
# Script driver: rebuild the split with load_data() and run the
# criterion comparison (gini vs. entropy), which prints its results.
X_train, X_test, y_train, y_test = load_data()
test_DecisionTreeClassifier_criterion(X_train, X_test, y_train, y_test)
# Compare the effect of random splitting versus best splitting
def test_DecisionTreeClassifier_splitter(*data):
    """Compare the ``best`` and ``random`` splitter strategies.

    For each strategy a new tree is fitted on the training data, and the
    splitter name, training score and testing score are printed.

    :param data: four positional values, unpacked as
        ``X_train, X_test, y_train, y_test``.
    :return: None.
    """
    X_train, X_test, y_train, y_test = data

    for splitter in ('best', 'random'):
        tree = DecisionTreeClassifier(splitter=splitter)
        tree.fit(X_train, y_train)

        print("splitter:%s" % splitter)
        train_accuracy = tree.score(X_train, y_train)
        print("Traing score:%f" % train_accuracy)
        test_accuracy = tree.score(X_test, y_test)
        print("Testing score:%f" % test_accuracy)
# Script driver: rebuild the split with load_data() and run the
# splitter comparison (best vs. random), which prints its results.
X_train, X_test, y_train, y_test = load_data()
test_DecisionTreeClassifier_splitter(X_train, X_test, y_train, y_test)
# Examine how tree depth affects the decision tree classifier
def test_DecisionTreeClassifiter_depth(*data, maxdepth):
    """Plot training and testing accuracy against the tree's ``max_depth``.

    One tree is fitted for every depth in ``np.arange(1, maxdepth)``, i.e.
    depths ``1 .. maxdepth - 1`` (the upper bound is exclusive). Both score
    curves are drawn on a single axes and displayed with ``plt.show()``.

    :param data: four positional values, unpacked as
        ``X_train, X_test, y_train, y_test``.
    :param maxdepth: keyword-only, exclusive upper bound of the depths tried.
    :return: None.
    """
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth)

    def fit_and_score(depth):
        # Fit one tree limited to ``depth`` and return (train, test) scores.
        tree = DecisionTreeClassifier(max_depth=depth)
        tree.fit(X_train, y_train)
        return tree.score(X_train, y_train), tree.score(X_test, y_test)

    score_pairs = [fit_and_score(depth) for depth in depths]
    training_scores = [train for train, _ in score_pairs]
    testing_scores = [test for _, test in score_pairs]

    # Plot both curves on one axes
    ax = plt.figure().add_subplot(1, 1, 1)
    ax.plot(depths, training_scores, label='traing score', marker='o')
    ax.plot(depths, testing_scores, label='testing score', marker='*')
    ax.set_xlabel('maxdepth')
    ax.set_ylabel('score')
    ax.set_title('Decision Tree Classification')
    ax.legend(framealpha=0.5, loc='best')
    plt.show()
# Script driver: rebuild the split with load_data() and plot accuracy versus
# tree depth. With maxdepth=20 the function evaluates np.arange(1, 20),
# i.e. depths 1 through 19.
X_train, X_test, y_train, y_test = load_data()
test_DecisionTreeClassifiter_depth(X_train, X_test, y_train, y_test,maxdepth=20)