随机森林(一):分类树
程序员文章站
2022-07-14 14:49:09
...
from sklearn.datasets import load_wine
from scipy.special import comb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
# Load the wine dataset and hold out 30% of the samples as a test set.
wine_datas = load_wine()
x = wine_datas.data
y = wine_datas.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

# Build and train a single decision tree and a random forest on the same split.
clf = DecisionTreeClassifier(random_state=2).fit(x_train, y_train)
rfc = RandomForestClassifier(random_state=3).fit(x_train, y_train)

# Score both fitted models on the held-out test set and print them side by side.
score_clf = clf.score(x_test, y_test)
score_rfc = rfc.score(x_test, y_test)
print(f"clf :{score_clf}", f"rfc:{score_rfc}")
# Compare the two models fold by fold using 10-fold cross-validation on the
# full dataset.
clf_cross = cross_val_score(clf, x, y, cv=10)
rfc_cross = cross_val_score(rfc, x, y, cv=10)

# One line per model: x is the fold index, y is that fold's score, and the
# label is what the legend shows.
fold_ids = range(10)
for model_label, fold_scores in (("DecisionTree", clf_cross), ("RandomForest", rfc_cross)):
    plt.plot(fold_ids, fold_scores, label=model_label)
plt.legend()
plt.show()
# Repeat 10-fold cross-validation for 10 rounds (100 fits per model) and
# compare the mean score of each round. Neither model fixes random_state here,
# so every round gives a slightly different result.
clf_l = []
rfc_l = []
for i in range(10):
    # Mean 10-fold score of a fresh decision tree for this round.
    clf = DecisionTreeClassifier()
    clf_score = cross_val_score(clf, x, y, cv=10).mean()
    clf_l.append(clf_score)
    # Mean 10-fold score of a fresh 25-tree random forest for this round.
    rfc = RandomForestClassifier(n_estimators=25)
    rfc_score = cross_val_score(rfc, x, y, cv=10).mean()
    rfc_l.append(rfc_score)
# Plot round index against the per-round mean score of each model.
plt.plot(range(0, 10), clf_l, label='DecisionTree')
plt.plot(range(0, 10), rfc_l, label='RandomForest')
plt.legend()
plt.show()
# Learning curve for n_estimators: find the best number of base trees by
# recording the mean 10-fold cross-validation score for forests of 1..20 trees.
mylist = []
for i in range(20):
    rfc = RandomForestClassifier(n_estimators=i + 1, n_jobs=-1)
    rfc_cross = cross_val_score(rfc, x, y, cv=10).mean()
    mylist.append(rfc_cross)
# mylist[k] holds the score for n_estimators == k + 1, so shift the list index
# by one to report the actual number of trees that achieved the best score.
best_score = max(mylist)
print(best_score, mylist.index(best_score) + 1)
plt.figure(figsize=[20, 5])
# The x axis is the number of trees (1..20), matching the values that were fitted.
plt.plot(range(1, 21), mylist)
plt.show()
# The book's formula: probability that a 25-tree forest is wrong under majority
# voting, i.e. that 13 or more trees err, assuming each tree errs independently
# with probability 0.2.
a = np.array([comb(25, i) * (0.2 ** i) * (1 - 0.2) ** (25 - i) for i in range(13, 26)]).sum()
# A random forest gets its randomness from giving every tree its own
# random_state; printing the fitted trees shows this.
# cross_val_score only fits clones of the estimator it is given, so a forest
# that was only passed to cross_val_score has no estimators_ attribute.
# Fit one explicitly before inspecting its trees.
rfc = RandomForestClassifier(n_estimators=25)
rfc = rfc.fit(x_train, y_train)
print(rfc.estimators_)
# A random forest has out-of-bag samples, so it can be evaluated without a
# separate train/test split. Either split the data (or cross-validate), or
# rely on the out-of-bag score as done here -- both approaches work.
rfc = RandomForestClassifier(oob_score=True).fit(x, y)  # trained on the full dataset
print(rfc.oob_score_)