Random Forest Implementation in Python

from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt
from queue import Queue

epsilon = 1e-5  # small constant that keeps np.log2() finite when a class probability is zero

judge_forest = []
data_forest = []
type_forest = []
depth_forest = []
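# Each tree is stored as an implicit binary heap across four parallel dicts,
# indexed first by tree number `times` and then by node number: the root is
# node 1 and node j's children are 2j and 2j+1. judge_forest holds each
# internal node's (threshold, feature) pair, data_forest the samples that
# reached the node, type_forest their labels (collapsed to a single label at
# leaves), and depth_forest the node's depth.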


def train_test_split(X, y, test_ratio=0.25, seed=None):  # split X/y into train and test parts; a hand-rolled stand-in for sklearn.model_selection.train_test_split, whose default test fraction is also 0.25
    if seed is not None:
        np.random.seed(seed)
    shuffled_indexes = np.random.permutation(len(X))  # random permutation of the sample indices
    test_size = int(len(X) * test_ratio)  # number of test samples
    train_index = shuffled_indexes[test_size:]  # tail of the permutation becomes the training set
    test_index = shuffled_indexes[:test_size]  # head of the permutation becomes the test set
    return X[train_index], X[test_index], y[train_index], y[test_index]  # NumPy arrays accept arrays as indices


# max_features is the number of features considered at each split
def information_gain(X, y, max_features):
    X = np.array(X)
    (len_X, len_axis) = X.shape
    chosen_axis = np.random.choice(len_axis, size=max_features, replace=False)  # random subset of max_features features
    min_value = 1e10
    min_feature = 0
    min_axis = 0
    all_type_count = {}
    for i in range(len_X):
        all_type_count[int(y[i])] = all_type_count.get(int(y[i]), 0) + 1
    if len(all_type_count) == 1:  # only one class present: the node is pure
        return -1, -1
    for axis in chosen_axis:
        sorted_X = X[:, axis].argsort()
        type_count = {}
        min_value1 = 1e10
        min_feature1 = 0
        for i in range(len(sorted_X) - 1):
            type_count[y[sorted_X[i]]] = type_count.get(y[sorted_X[i]], 0) + 1
            if X[sorted_X[i]][axis] == X[sorted_X[i + 1]][axis]:
                continue  # equal feature values cannot be split apart; skipping them avoids producing an empty child
            min_value0 = 0  # weighted entropy of this candidate split
            others = []  # classes absent from the left child
            for j in all_type_count.keys():
                if j not in type_count.keys():
                    others += [j]
            # classes present in the left child contribute entropy on both sides
            for j in type_count.keys():
                min_value0 += -((i + 1) / len_X) * (type_count[j] / (i + 1)) * np.log2(
                    type_count[j] / (i + 1) + epsilon) - (
                                      (len_X - i - 1) / len_X) * (
                                      (all_type_count[j] - type_count[j]) / (len_X - i - 1)) * np.log2(
                    (all_type_count[j] - type_count[j]) / (len_X - i - 1) + epsilon)
            # classes absent from the left child contribute only on the right side
            for j in others:
                min_value0 += -((len_X - i - 1) / len_X) * ((all_type_count[j]) / (len_X - i - 1)) * np.log2(
                    (all_type_count[j]) / (len_X - i - 1) + epsilon)
            if min_value0 <= min_value1:
                min_value1 = min_value0
                min_feature1 = X[sorted_X[i]][axis]
        if min_value > min_value1:  # strict comparison: on ties, the feature examined first wins
            min_value = min_value1
            min_feature = min_feature1
            min_axis = axis
    return min_feature, min_axis
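# The quantity minimized above is the weighted entropy of the two halves of a
# candidate threshold split:
#     H(split) = (n_L / n) * H(left) + (n_R / n) * H(right),  where
#     H(S)     = -sum_k p_k * log2(p_k)
# Since the parent's entropy H(parent) is fixed for a given node, minimizing
# H(split) is equivalent to maximizing the information gain
# H(parent) - H(split), which is what the function's name refers to.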


def DecisionTreeClassifier(X, y, max_depth, max_features, times):
    q = Queue(maxsize=0)
    i = 1  # the root is node 1
    q.put(i)
    judge_forest[times][i] = information_gain(X, y, max_features)
    if judge_forest[times][i] == (-1, -1):  # the whole bootstrap sample is pure: a single-leaf tree
        data_forest[times][i] = list(X)
        type_forest[times][i] = [int(y[0])]  # a one-element label list marks a leaf for solve()
        return
    data_forest[times][i] = list(X)
    depth_forest[times][i] = 1
    type_forest[times][i] = list(y)
    while not q.empty():
        j = q.get()
        (boundary, axis) = judge_forest[times][j]
        left_child = []
        right_child = []
        left_type = []
        right_type = []
        for i in range(len(data_forest[times][j])):
            if data_forest[times][j][i][axis] <= boundary:
                left_child += [data_forest[times][j][i]]
                left_type += [type_forest[times][j][i]]
            else:
                right_child += [data_forest[times][j][i]]
                right_type += [type_forest[times][j][i]]
        left_j = 2 * j
        right_j = 2 * j + 1
        data_forest[times][left_j] = left_child
        data_forest[times][right_j] = right_child
        depth_forest[times][left_j] = depth_forest[times][j] + 1
        depth_forest[times][right_j] = depth_forest[times][j] + 1
        type_forest[times][left_j] = left_type
        type_forest[times][right_j] = right_type
        if depth_forest[times][j] + 1 == max_depth:  # the children reach the depth cap: collapse them into leaves
            left_type_count = {}
            right_type_count = {}
            for element in type_forest[times][left_j]:
                left_type_count[element] = left_type_count.get(element, 0) + 1
            # majority label: max by count, not by label value
            type_forest[times][left_j] = [int(max(left_type_count, key=left_type_count.get))]
            for element in type_forest[times][right_j]:
                right_type_count[element] = right_type_count.get(element, 0) + 1
            type_forest[times][right_j] = [int(max(right_type_count, key=right_type_count.get))]
        else:
            (boundary_left, axis_left) = information_gain(data_forest[times][left_j], type_forest[times][left_j],
                                                          max_features)
            if axis_left == -1:  # the left child is pure: make it a leaf
                type_forest[times][left_j] = [int(type_forest[times][left_j][0])]
            else:
                judge_forest[times][left_j] = (boundary_left, axis_left)
                q.put(left_j)
            (boundary_right, axis_right) = information_gain(data_forest[times][right_j], type_forest[times][right_j],
                                                            max_features)
            if axis_right == -1:  # the right child is pure: make it a leaf
                type_forest[times][right_j] = [int(type_forest[times][right_j][0])]
            else:
                judge_forest[times][right_j] = (boundary_right, axis_right)
                q.put(right_j)
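# Construction is breadth-first: the queue holds internal nodes whose children
# still need splitting. A node becomes a leaf either because it is pure
# (information_gain returns (-1, -1)) or because it hits the depth cap; in both
# cases its label list is collapsed to a single element, which is exactly the
# stopping test solve() uses when classifying.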


def solve(X, node, times):  # classify sample X by descending tree number `times` from `node`
    if len(type_forest[times][node]) == 1:  # a one-element label list marks a leaf
        return type_forest[times][node][0]
    else:
        if X[judge_forest[times][node][1]] <= judge_forest[times][node][0]:  # feature <= threshold goes left
            return solve(X, 2 * node, times)
        else:
            return solve(X, 2 * node + 1, times)


def RandomForestClassifier(X_train, y_train, n_estimators=100, max_features=0, max_depth=1):
    global judge_forest, data_forest, type_forest, depth_forest
    judge_forest, data_forest, type_forest, depth_forest = [], [], [], []  # reset state so repeated calls don't accumulate trees
    (r, c) = X_train.shape
    if max_features <= 0:  # a non-positive value means "use every feature"
        max_features = c
    for times in range(n_estimators):
        judge_forest.append({})
        data_forest.append({})
        type_forest.append({})
        depth_forest.append({})
        choose = np.random.randint(0, r, r)  # bootstrap sample: r indices drawn with replacement
        times_X = X_train[choose]
        times_y = y_train[choose]
        # build one decision tree on the bootstrap sample
        # (with the default max_depth=1 the depth cap never triggers, so trees grow until every leaf is pure)
        DecisionTreeClassifier(X=times_X, y=times_y, max_depth=max_depth, max_features=max_features, times=times)
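# Together the two sources of randomness make this a random forest in the
# usual sense: bagging (each tree sees a bootstrap resample of the training
# set) plus a random feature subset at every split (the max_features features
# drawn inside information_gain).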


def predict(X):  # majority vote over all trees in the forest
    forest_shape = len(judge_forest)
    analyse = {}
    for i in range(forest_shape):
        kind = solve(X, 1, i)  # each tree casts one vote
        analyse[kind] = analyse.get(kind, 0) + 1
    return max(analyse, key=analyse.get)  # the most common prediction wins


def forest_score(X_test, y_test):  # accuracy: fraction of correctly classified samples
    len_test = X_test.shape[0]
    correct = 0
    for i in range(len_test):
        temp = predict(X_test[i, :])
        if temp == y_test[i]:
            correct += 1
    return correct / len_test


X, y = make_moons(n_samples=1000, noise=0.25, random_state=3)  # generate the two-moons toy dataset
X_train, X_test, y_train, y_test = train_test_split(X, y)  # split into training and test sets
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)  # scatter plot of the training set, colored by class
plt.show()
n_estimators = int(input("n_estimators:"))
RandomForestClassifier(X_train, y_train, n_estimators)
print("Accuracy on training set: {:.3f}".format(forest_score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest_score(X_test, y_test)))

Output:

n_estimators:39
Accuracy on training set: 0.999
Accuracy on test set: 0.972
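
As a sanity check, the same split can be fed to scikit-learn's built-in random forest. A minimal sketch, assuming scikit-learn is available and reusing the X_train/X_test variables from above (the hyperparameters are illustrative, not tuned; the alias avoids colliding with our own RandomForestClassifier):

from sklearn.ensemble import RandomForestClassifier as SklearnForest

sk_forest = SklearnForest(n_estimators=39, random_state=0)  # same tree count as the run above
sk_forest.fit(X_train, y_train)
print("sklearn accuracy on training set: {:.3f}".format(sk_forest.score(X_train, y_train)))
print("sklearn accuracy on test set: {:.3f}".format(sk_forest.score(X_test, y_test)))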