python sklearn 随机森林
程序员文章站
2022-07-14 13:46:40
...
python sklearn 随机森林
文件下载地址
链接: https://pan.baidu.com/s/1dAnxc-6EaP9sUfyXtnUy7A 提取码: 9pf8
"""
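Random forest regression on bike_day.csv and bike_hour.csv, with a feature-importance bar chart for each.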
"""
import numpy as np
import sklearn.utils as su
import sklearn.ensemble as se
import matplotlib.pyplot as mp
import sklearn.metrics as sm
def train_day(csv_path="bike_day.csv"):
    """Train a random-forest regressor on the daily CSV and report its fit.

    The file is read as text. Row 0 is taken as the header, columns 2..12
    as the features and the last column as the regression target. Rows are
    shuffled with a fixed seed; the first 90% are used for fitting and the
    remaining 10% for evaluation. The shapes and dtypes of the parsed
    arrays, the header names, the R2 score and the mean absolute error on
    the held-out rows are printed.

    Args:
        csv_path: Location of the comma-separated input file. Defaults to
            ``bike_day.csv`` in the current working directory. The earlier
            hard-coded Windows-style relative path did not resolve on
            POSIX systems, where a backslash is part of the file name.

    Returns:
        A ``(day_feature, header_name)`` tuple: the fitted model's
        ``feature_importances_`` and the matching header names.
    """
    # 1. Load the data. Everything is read as text first because the header
    #    row is not numeric.
    lines = np.loadtxt(csv_path, delimiter=',', dtype='str')
    # Index the header row directly instead of reshaping to a fixed length.
    header_name = lines[0, 2:13].astype('str')
    x = lines[1:, 2:13].astype('float')
    y = lines[1:, -1].astype('float')
    print(header_name.shape, header_name.dtype)
    print(x.shape, x.dtype)
    print(y.shape, y.dtype)
    print(header_name)

    # 2. Split into training and validation sets (seeded shuffle, 90/10).
    x, y = su.shuffle(x, y, random_state=7)
    train_size = int(len(x) * 0.9)
    train_x, train_y = x[:train_size], y[:train_size]
    test_x, test_y = x[train_size:], y[train_size:]

    # 3. Fit the random forest.
    # NOTE(review): no random_state is passed to the forest, so the scores
    # and importances can differ slightly between runs.
    model = se.RandomForestRegressor(
        max_depth=10, n_estimators=1000, min_samples_split=2)
    model.fit(train_x, train_y)
    predict_test_y = model.predict(test_x)

    # 4. Evaluate on the held-out rows.
    print(sm.r2_score(test_y, predict_test_y))
    print(sm.mean_absolute_error(test_y, predict_test_y))

    day_feature = model.feature_importances_
    return day_feature, header_name
def train_hour(csv_path="bike_hour.csv"):
    """Train a random-forest regressor on the hourly CSV and report its fit.

    The file is read as text. Row 0 is taken as the header, columns 2..13
    as the features and the last column as the regression target. Rows are
    shuffled with a fixed seed; the first 90% are used for fitting and the
    remaining 10% for evaluation. The shapes and dtypes of the parsed
    arrays, the header names, the R2 score and the mean absolute error on
    the held-out rows are printed.

    Args:
        csv_path: Location of the comma-separated input file. Defaults to
            ``bike_hour.csv`` in the current working directory. The earlier
            hard-coded Windows-style relative path did not resolve on
            POSIX systems, where a backslash is part of the file name.

    Returns:
        A ``(hour_feature, header_name)`` tuple: the fitted model's
        ``feature_importances_`` and the matching header names.
    """
    # 1. Load the data. Everything is read as text first because the header
    #    row is not numeric.
    lines = np.loadtxt(csv_path, delimiter=',', dtype='str')
    # Index the header row directly instead of reshaping to a fixed length.
    header_name = lines[0, 2:14].astype('str')
    x = lines[1:, 2:14].astype('float')
    y = lines[1:, -1].astype('float')
    print(header_name.shape, header_name.dtype)
    print(x.shape, x.dtype)
    print(y.shape, y.dtype)
    print(header_name)

    # 2. Split into training and validation sets (seeded shuffle, 90/10).
    x, y = su.shuffle(x, y, random_state=7)
    train_size = int(len(x) * 0.9)
    train_x, train_y = x[:train_size], y[:train_size]
    test_x, test_y = x[train_size:], y[train_size:]

    # 3. Fit the random forest.
    # NOTE(review): no random_state is passed to the forest, so the scores
    # and importances can differ slightly between runs.
    model = se.RandomForestRegressor(
        max_depth=10, n_estimators=1000, min_samples_split=2)
    model.fit(train_x, train_y)
    predict_test_y = model.predict(test_x)

    # 4. Evaluate on the held-out rows.
    print(sm.r2_score(test_y, predict_test_y))
    print(sm.mean_absolute_error(test_y, predict_test_y))

    hour_feature = model.feature_importances_
    return hour_feature, header_name
def draw_result(day_feature, day_header_name, hour_feature, hour_header_name):
    """Show the day and hour feature importances as two stacked bar charts.

    Each panel sorts its importances in descending order and labels the
    bars with the correspondingly reordered header names. The call blocks
    in ``mp.show()`` until the figure window is closed.

    Args:
        day_feature: NumPy array of importances for the daily model.
        day_header_name: NumPy array of names aligned with ``day_feature``.
        hour_feature: NumPy array of importances for the hourly model.
        hour_header_name: NumPy array of names aligned with ``hour_feature``.
    """

    def _plot_ranked(position, title, importances, names, color, label):
        # Draw one panel with bars ordered from most to least important.
        mp.subplot(position)
        mp.title(title, fontsize=16)
        mp.ylabel("Feature importance", fontsize=14)
        slots = np.arange(importances.size)
        # argsort is ascending, so reverse it to rank largest first.
        ranking = importances.argsort()[::-1]
        mp.bar(slots, importances[ranking], 0.8, color=color, label=label)
        mp.grid(linestyle=":", axis="y")
        mp.xticks(slots, names[ranking])
        mp.legend()
        mp.tight_layout()

    mp.figure("Random Forest", facecolor="lightgray")
    _plot_ranked(
        211, "Day Feature importance",
        day_feature, day_header_name,
        "dodgerblue", "day Feature importance",
    )
    _plot_ranked(
        212, "Hour Feature importance",
        hour_feature, hour_header_name,
        "orangered", "Hour Feature importance",
    )
    mp.show()
if __name__ == '__main__':
    # Fit the daily model first, then the hourly one; each call returns an
    # (importances, header names) pair.
    day_result = train_day()
    hour_result = train_hour()
    # Plot both results in one figure.
    draw_result(*day_result, *hour_result)