Random Forest Implementation in Python
from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt
from queue import Queue
epsilon = 1e-5  # smoothing term so np.log2 never takes the log of zero
judge_forest = []
data_forest = []
type_forest = []
depth_forest = []
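# Each tree is stored across four parallel dicts (one per tree, appended by
# RandomForestClassifier below) keyed by heap-style node indices: the root is
# node 1 and the children of node j are 2*j and 2*j + 1.
#   judge_forest[t][j] -> (threshold, feature axis) of split node j in tree t
#   data_forest[t][j]  -> samples that reached node j
#   type_forest[t][j]  -> their labels (collapsed to a single label at leaves)
#   depth_forest[t][j] -> depth of node j (the root has depth 1)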
def train_test_split(X, y, test_ratio=0.25, seed=None):  # split samples X and labels y; test_ratio is the test fraction (scikit-learn also defaults to 0.25)
    if seed is not None:
        np.random.seed(seed)
    shuffled_indexes = np.random.permutation(len(X))  # randomly permute the sample order
    test_size = int(len(X) * test_ratio)  # number of test samples
    train_index = shuffled_indexes[test_size:]  # the remaining indices form the training set
    test_index = shuffled_indexes[:test_size]  # the first test_size indices form the test set
    return X[train_index], X[test_index], y[train_index], y[test_index]  # NumPy fancy indexing with index arrays
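# A quick illustrative check of the splitter (example values, not from the
# original script): with 10 samples and test_ratio=0.3, int(10 * 0.3) = 3
# samples land in the test set and the remaining 7 in the training set.
#   X_demo = np.arange(20).reshape(10, 2)
#   y_demo = np.arange(10)
#   Xa, Xb, ya, yb = train_test_split(X_demo, y_demo, test_ratio=0.3, seed=42)
#   # len(Xa) == 7, len(Xb) == 3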
# max_features is the number of candidate features examined at each split
def information_gain(X, y, max_features):
    X = np.array(X)
    (len_X, len_axis) = X.shape
    choosen_axis = np.random.choice(len_axis, size=max_features, replace=False)  # pick max_features feature axes without replacement
min_value = 1e10
min_feature = 0
min_axis = 0
all_type_count = {}
for i in range(len_X):
all_type_count[int(y[i])] = all_type_count.get(int(y[i]), 0) + 1
    if len(all_type_count) == 1:  # only one class present: the node is pure
return -1, -1
    for axis in choosen_axis:
        sorted_X = X[:, axis].argsort()  # sample indices ordered by this feature
        type_count = {}
        min_value1 = 1e10
        min_feature1 = 0
        for i in range(len(sorted_X) - 1):  # candidate threshold between sorted positions i and i + 1
            type_count[y[sorted_X[i]]] = type_count.get(y[sorted_X[i]], 0) + 1
            if X[sorted_X[i]][axis] == X[sorted_X[i + 1]][axis]:
                continue  # equal feature values cannot be separated by this threshold
            min_value0 = 0
            n_left = i + 1
            n_right = len_X - n_left
            others = []  # classes that appear only on the right-hand side so far
            for j in all_type_count.keys():
                if j not in type_count.keys():
                    others += [j]
            for j in type_count.keys():
                p_left = type_count[j] / n_left
                p_right = (all_type_count[j] - type_count[j]) / n_right
                min_value0 += -(n_left / len_X) * p_left * np.log2(p_left + epsilon) \
                              - (n_right / len_X) * p_right * np.log2(p_right + epsilon)
            for j in others:
                p_right = all_type_count[j] / n_right
                min_value0 += -(n_right / len_X) * p_right * np.log2(p_right + epsilon)
if min_value0 <= min_value1:
min_value1 = min_value0
min_feature1 = X[sorted_X[i]][axis]
        if min_value > min_value1:  # strict comparison: on ties, the axis examined first wins
min_value = min_value1
min_feature = min_feature1
min_axis = axis
return min_feature, min_axis
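# The score minimized above is the size-weighted entropy of the two sides of a
# candidate split:
#   H_split = (n_left / n) * H(left) + (n_right / n) * H(right)
# with H(S) = -sum_k p_k * log2(p_k). A perfectly separating split scores 0,
# and since H(parent) is fixed, minimizing H_split maximizes the information
# gain H(parent) - H_split; epsilon only keeps log2 away from zero.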
def DecisionTreeClassifier(X, y, max_depth, max_features, times):
    q = Queue(maxsize=0)  # FIFO queue of node indices still to be split (breadth-first growth)
    i = 1  # the root is node 1
    q.put(i)
    judge_forest[times][i] = judge_forest[times].get(i, ()) + information_gain(X, y, max_features)
    if judge_forest[times][i] == (-1, -1):  # the root is already pure
        data_forest[times][i] = data_forest[times].get(i, []) + list(X)
        type_forest[times][i] = type_forest[times].get(i, []) + [int(y[0])]  # one-element list so solve() can take its len()
        return
    data_forest[times][i] = data_forest[times].get(i, []) + list(X)
    depth_forest[times][i] = depth_forest[times].get(i, 0) + 1
    type_forest[times][i] = type_forest[times].get(i, []) + list(y)
while not q.empty():
j = q.get()
(boundary, axis) = judge_forest[times][j]
left_child = []
right_child = []
left_type = []
right_type = []
for i in range(len(data_forest[times][j])):
            if data_forest[times][j][i][axis] <= boundary:  # route the sample to the left child
left_child += [data_forest[times][j][i]]
left_type += [type_forest[times][j][i]]
else:
right_child += [data_forest[times][j][i]]
right_type += [type_forest[times][j][i]]
        left_j = 2 * j  # heap-style child indices of node j
        right_j = 2 * j + 1
data_forest[times][left_j] = data_forest[times].get(left_j, []) + left_child
data_forest[times][right_j] = data_forest[times].get(right_j, []) + right_child
depth_forest[times][left_j] = depth_forest[times].get(left_j, 0) + depth_forest[times][j] + 1
depth_forest[times][right_j] = depth_forest[times].get(right_j, 0) + depth_forest[times][j] + 1
type_forest[times][left_j] = type_forest[times].get(left_j, []) + left_type
type_forest[times][right_j] = type_forest[times].get(right_j, []) + right_type
        if depth_forest[times][j] + 1 == max_depth:  # children would reach max_depth: collapse both into leaves
            left_type_count = {}
            right_type_count = {}
            for label in type_forest[times][left_j]:
                left_type_count[label] = left_type_count.get(label, 0) + 1
            type_forest[times][left_j] = [int(max(left_type_count, key=left_type_count.get))]  # majority class
            for label in type_forest[times][right_j]:
                right_type_count[label] = right_type_count.get(label, 0) + 1
            type_forest[times][right_j] = [int(max(right_type_count, key=right_type_count.get))]  # majority class
        else:
            (boundary_left, axis_left) = information_gain(data_forest[times][left_j], type_forest[times][left_j],
                                                          max_features)
            if axis_left == -1:  # left child is pure: collapse it into a leaf
                left_type_count = {}
                for label in type_forest[times][left_j]:
                    left_type_count[label] = left_type_count.get(label, 0) + 1
                type_forest[times][left_j] = [int(max(left_type_count, key=left_type_count.get))]
            else:
                judge_forest[times][left_j] = judge_forest[times].get(left_j, ()) + (boundary_left, axis_left)
                q.put(left_j)
            (boundary_right, axis_right) = information_gain(data_forest[times][right_j], type_forest[times][right_j],
                                                            max_features)
            if axis_right == -1:  # right child is pure: collapse it into a leaf
                right_type_count = {}
                for label in type_forest[times][right_j]:
                    right_type_count[label] = right_type_count.get(label, 0) + 1
                type_forest[times][right_j] = [int(max(right_type_count, key=right_type_count.get))]
            else:
                judge_forest[times][right_j] = judge_forest[times].get(right_j, ()) + (boundary_right, axis_right)
                q.put(right_j)
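# Classify one sample with tree `times`: starting from the root (node 1),
# apply the stored (threshold, axis) test and descend to child 2*node or
# 2*node + 1 until reaching a leaf, i.e. a node whose label list was
# collapsed to length 1.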
def solve(X, node, times):
if len(type_forest[times][node]) == 1:
return type_forest[times][node][0]
else:
if X[judge_forest[times][node][1]] <= judge_forest[times][node][0]:
return solve(X, 2 * node, times)
else:
return solve(X, 2 * node + 1, times)
def RandomForestClassifier(X_train, y_train, n_estimators=100, max_features=0, max_depth=1):
    # note: node depths start at 1, so with the default max_depth=1 the depth check in
    # DecisionTreeClassifier never fires and every tree grows until its leaves are pure
    (r, c) = X_train.shape
    if max_features <= 0:
        max_features = c  # default: consider every feature at each split
    global judge_forest, data_forest, type_forest, depth_forest
    for times in range(n_estimators):
        judge_forest = judge_forest + [{}]
        data_forest = data_forest + [{}]
        type_forest = type_forest + [{}]
        depth_forest = depth_forest + [{}]
        choose = np.random.randint(0, r, r)  # bootstrap sample: r indices drawn with replacement
        times_X = X_train[choose]
        times_y = y_train[choose]
        # grow one decision tree on this bootstrap sample
        DecisionTreeClassifier(X=times_X, y=times_y, max_depth=max_depth, max_features=max_features, times=times)
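# Aggregate the forest: every tree classifies X independently and the class
# with the most votes wins.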
def predict(X):
    forest_shape = len(judge_forest)
    votes = {}
    for i in range(forest_shape):
        kind = solve(X, 1, i)  # ask tree i for its class
        votes[kind] = votes.get(kind, 0) + 1
    return max(votes, key=votes.get)  # majority vote
def forest_score(X_test, y_test):  # classification accuracy of the forest on a labelled set
len_test = X_test.shape[0]
correct = 0
for i in range(len_test):
temp = predict(X_test[i, :])
if temp == y_test[i]:
correct += 1
return correct / len_test
X, y = make_moons(n_samples=1000, noise=0.25, random_state=3)  # generate two-moons data
X_train, X_test, y_train, y_test = train_test_split(X, y)  # split into training and test sets
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)  # scatter plot of the training set
plt.show()
n_estimators = int(input("n_estimators:"))
RandomForestClassifier(X_train, y_train, n_estimators)
print("Accuracy on training set: {:.3f}".format(forest_score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest_score(X_test, y_test)))
Sample output:
n_estimators:39
Accuracy on training set: 0.999
Accuracy on test set: 0.972
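As a point of reference, the same split can be fed to scikit-learn's built-in random forest. The snippet below is only a sketch for comparison, not part of the script above: it reuses the X_train/X_test variables from it, and the import alias avoids clashing with the hand-written RandomForestClassifier function.

from sklearn.ensemble import RandomForestClassifier as SklearnRF
clf = SklearnRF(n_estimators=39)  # same number of trees as the sample run
clf.fit(X_train, y_train)
print("sklearn test accuracy: {:.3f}".format(clf.score(X_test, y_test)))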