Random Forest Implementation in Python
from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt
from queue import Queue
epsilon = 1e-5  # smoothing term so np.log2 never takes the log of zero
judge_forest = []
data_forest = []
type_forest = []
depth_forest = []
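# Each tree is stored across four parallel dicts (one per tree, appended by
# RandomForestClassifier below) keyed by heap-style node indices: the root is
# node 1 and the children of node j are 2*j and 2*j + 1.
#   judge_forest[t][j] -> (threshold, feature axis) of split node j in tree t
#   data_forest[t][j]  -> samples that reached node j
#   type_forest[t][j]  -> their labels (collapsed to a single label at leaves)
#   depth_forest[t][j] -> depth of node j (the root has depth 1)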
def train_test_split(X, y, test_ratio=0.25, seed=None):  # split samples X and labels y; test_ratio is the test fraction (scikit-learn also defaults to 0.25)
    if seed is not None:
        np.random.seed(seed)
    shuffled_indexes = np.random.permutation(len(X))  # randomly permute the sample order
    test_size = int(len(X) * test_ratio)  # number of test samples
    train_index = shuffled_indexes[test_size:]  # the remaining indices form the training set
    test_index = shuffled_indexes[:test_size]  # the first test_size indices form the test set
    return X[train_index], X[test_index], y[train_index], y[test_index]  # NumPy fancy indexing with index arrays
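# A quick illustrative check of the splitter (example values, not from the
# original script): with 10 samples and test_ratio=0.3, int(10 * 0.3) = 3
# samples land in the test set and the remaining 7 in the training set.
#   X_demo = np.arange(20).reshape(10, 2)
#   y_demo = np.arange(10)
#   Xa, Xb, ya, yb = train_test_split(X_demo, y_demo, test_ratio=0.3, seed=42)
#   # len(Xa) == 7, len(Xb) == 3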
# max_features is the number of candidate features examined at each split
def information_gain(X, y, max_features):
    X = np.array(X)
    (len_X, len_axis) = X.shape
    choosen_axis = np.random.choice(len_axis, size=max_features, replace=False)  # pick max_features feature axes without replacement
min_value = 1e10
min_feature = 0
min_axis = 0
all_type_count = {}
for i in range(len_X):
all_type_count[int(y[i])] = all_type_count.get(int(y[i]), 0) + 1
    if len(all_type_count) == 1:  # only one class present: the node is pure
return -1, -1
    for axis in choosen_axis:
        sorted_X = X[:, axis].argsort()  # sample indices ordered by this feature
        type_count = {}
        min_value1 = 1e10
        min_feature1 = 0
        for i in range(len(sorted_X) - 1):  # candidate threshold between sorted positions i and i + 1
            type_count[y[sorted_X[i]]] = type_count.get(y[sorted_X[i]], 0) + 1
            if X[sorted_X[i]][axis] == X[sorted_X[i + 1]][axis]:
                continue  # equal feature values cannot be separated by this threshold
            min_value0 = 0
            n_left = i + 1
            n_right = len_X - n_left
            others = []  # classes that appear only on the right-hand side so far
            for j in all_type_count.keys():
                if j not in type_count.keys():
                    others += [j]
            for j in type_count.keys():
                p_left = type_count[j] / n_left
                p_right = (all_type_count[j] - type_count[j]) / n_right
                min_value0 += -(n_left / len_X) * p_left * np.log2(p_left + epsilon) \
                              - (n_right / len_X) * p_right * np.log2(p_right + epsilon)
            for j in others:
                p_right = all_type_count[j] / n_right
                min_value0 += -(n_right / len_X) * p_right * np.log2(p_right + epsilon)
if min_value0 <= min_value1:
min_value1 = min_value0
min_feature1 = X[sorted_X[i]][axis]
        if min_value > min_value1:  # strict comparison: on ties, the axis examined first wins
min_value = min_value1
min_feature = min_feature1
min_axis = axis
return min_feature, min_axis
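# The score minimized above is the size-weighted entropy of the two sides of a
# candidate split:
#   H_split = (n_left / n) * H(left) + (n_right / n) * H(right)
# with H(S) = -sum_k p_k * log2(p_k). A perfectly separating split scores 0,
# and since H(parent) is fixed, minimizing H_split maximizes the information
# gain H(parent) - H_split; epsilon only keeps log2 away from zero.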
def DecisionTreeClassifier(X, y, max_depth, max_features, times):
    q = Queue(maxsize=0)  # FIFO queue of node indices still to be split (breadth-first growth)
    i = 1  # the root is node 1
    q.put(i)
    judge_forest[times][i] = judge_forest[times].get(i, ()) + information_gain(X, y, max_features)
    if judge_forest[times][i] == (-1, -1):  # the root is already pure
        data_forest[times][i] = data_forest[times].get(i, []) + list(X)
        type_forest[times][i] = type_forest[times].get(i, []) + [int(y[0])]  # one-element list so solve() can take its len()
        return
    data_forest[times][i] = data_forest[times].get(i, []) + list(X)
    depth_forest[times][i] = depth_forest[times].get(i, 0) + 1
    type_forest[times][i] = type_forest[times].get(i, []) + list(y)
while not q.empty():
j = q.get()
(boundary, axis) = judge_forest[times][j]
left_child = []
right_child = []
left_type = []
right_type = []
for i in range(len(data_forest[times][j])):
            if data_forest[times][j][i][axis] <= boundary:  # route the sample to the left child
left_child += [data_forest[times][j][i]]
left_type += [type_forest[times][j][i]]
else:
right_child += [data_forest[times][j][i]]
right_type += [type_forest[times][j][i]]
        left_j = 2 * j  # heap-style child indices of node j
        right_j = 2 * j + 1
data_forest[times][left_j] = data_forest[times].get(left_j, []) + left_child
data_forest[times][right_j] = data_forest[times].get(right_j, []) + right_child
depth_forest[times][left_j] = depth_forest[times].get(left_j, 0) + depth_forest[times][j] + 1
depth_forest[times][right_j] = depth_forest[times].get(right_j, 0) + depth_forest[times][j] + 1
type_forest[times][left_j] = type_forest[times].get(left_j, []) + left_type
type_forest[times][right_j] = type_forest[times].get(right_j, []) + right_type
        if depth_forest[times][j] + 1 == max_depth:  # children would reach max_depth: collapse both into leaves
            left_type_count = {}
            right_type_count = {}
            for label in type_forest[times][left_j]:
                left_type_count[label] = left_type_count.get(label, 0) + 1
            type_forest[times][left_j] = [int(max(left_type_count, key=left_type_count.get))]  # majority class
            for label in type_forest[times][right_j]:
                right_type_count[label] = right_type_count.get(label, 0) + 1
            type_forest[times][right_j] = [int(max(right_type_count, key=right_type_count.get))]  # majority class
        else:
            (boundary_left, axis_left) = information_gain(data_forest[times][left_j], type_forest[times][left_j],
                                                          max_features)
            if axis_left == -1:  # left child is pure: collapse it into a leaf
                left_type_count = {}
                for label in type_forest[times][left_j]:
                    left_type_count[label] = left_type_count.get(label, 0) + 1
                type_forest[times][left_j] = [int(max(left_type_count, key=left_type_count.get))]
            else:
                judge_forest[times][left_j] = judge_forest[times].get(left_j, ()) + (boundary_left, axis_left)
                q.put(left_j)
            (boundary_right, axis_right) = information_gain(data_forest[times][right_j], type_forest[times][right_j],
                                                            max_features)
            if axis_right == -1:  # right child is pure: collapse it into a leaf
                right_type_count = {}
                for label in type_forest[times][right_j]:
                    right_type_count[label] = right_type_count.get(label, 0) + 1
                type_forest[times][right_j] = [int(max(right_type_count, key=right_type_count.get))]
            else:
                judge_forest[times][right_j] = judge_forest[times].get(right_j, ()) + (boundary_right, axis_right)
                q.put(right_j)
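# Classify one sample with tree `times`: starting from the root (node 1),
# apply the stored (threshold, axis) test and descend to child 2*node or
# 2*node + 1 until reaching a leaf, i.e. a node whose label list was
# collapsed to length 1.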
def solve(X, node, times):
if len(type_forest[times][node]) == 1:
return type_forest[times][node][0]
else:
if X[judge_forest[times][node][1]] <= judge_forest[times][node][0]:
return solve(X, 2 * node, times)
else:
return solve(X, 2 * node + 1, times)
def RandomForestClassifier(X_train, y_train, n_estimators=100, max_features=0, max_depth=1):
    # note: node depths start at 1, so with the default max_depth=1 the depth check in
    # DecisionTreeClassifier never fires and every tree grows until its leaves are pure
    (r, c) = X_train.shape
    if max_features <= 0:
        max_features = c  # default: consider every feature at each split
    global judge_forest, data_forest, type_forest, depth_forest
    for times in range(n_estimators):
        judge_forest = judge_forest + [{}]
        data_forest = data_forest + [{}]
        type_forest = type_forest + [{}]
        depth_forest = depth_forest + [{}]
        choose = np.random.randint(0, r, r)  # bootstrap sample: r indices drawn with replacement
        times_X = X_train[choose]
        times_y = y_train[choose]
        # grow one decision tree on this bootstrap sample
        DecisionTreeClassifier(X=times_X, y=times_y, max_depth=max_depth, max_features=max_features, times=times)
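# Aggregate the forest: every tree classifies X independently and the class
# with the most votes wins.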
def predict(X):
    forest_shape = len(judge_forest)
    votes = {}
    for i in range(forest_shape):
        kind = solve(X, 1, i)  # ask tree i for its class
        votes[kind] = votes.get(kind, 0) + 1
    return max(votes, key=votes.get)  # majority vote
def forest_score(X_test, y_test):  # classification accuracy of the forest on a labelled set
len_test = X_test.shape[0]
correct = 0
for i in range(len_test):
temp = predict(X_test[i, :])
if temp == y_test[i]:
correct += 1
return correct / len_test
X, y = make_moons(n_samples=1000, noise=0.25, random_state=3)  # generate two-moons data
X_train, X_test, y_train, y_test = train_test_split(X, y)  # split into training and test sets
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)  # scatter plot of the training set
plt.show()
n_estimators = int(input("n_estimators:"))
RandomForestClassifier(X_train, y_train, n_estimators)
print("Accuracy on training set: {:.3f}".format(forest_score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest_score(X_test, y_test)))
Sample output:
n_estimators:39
Accuracy on training set: 0.999
Accuracy on test set: 0.972
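As a point of reference, the same split can be fed to scikit-learn's built-in random forest. The snippet below is only a sketch for comparison, not part of the script above: it reuses the X_train/X_test variables from it, and the import alias avoids clashing with the hand-written RandomForestClassifier function.

from sklearn.ensemble import RandomForestClassifier as SklearnRF
clf = SklearnRF(n_estimators=39)  # same number of trees as the sample run
clf.fit(X_train, y_train)
print("sklearn test accuracy: {:.3f}".format(clf.score(X_test, y_test)))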