机器学习 - 随机森林算法RandomForestRegressor
程序员文章站
2022-03-22 20:22:59
...
前言
随机森林在Python中有很多可以调用的库,使用随机森林非常方便,主要用到以下的库
sklearn
Scikit learn 也简称 sklearn, 是机器学习领域当中最知名的 python 模块之一.
Sklearn 包含了很多种机器学习的方式:
- Classification 分类
- Regression 回归
- Clustering 非监督分类
- Dimensionality reduction 数据降维
- Model Selection 模型选择
- Preprocessing 数据预处理
快速入门:传送门
numpy
numpy(Numerical Python)提供了python对多维数组对象的支持:ndarray,具有矢量运算能力,快速、节省空间。numpy支持高级大量的维度数组与矩阵运算,此外也针对数组运算提供大量的数学函数库。
numpy快速入门: 传送门
pandas
pandas 是基于 Numpy 构建的含有更高级数据结构和工具的数据分析包 类似于 Numpy 的核心是 ndarray,pandas 也是围绕着 Series 和 DataFrame 两个核心数据结构展开的 。Series 和 DataFrame 分别对应于一维的序列和二维的表结构。
pandas使用教程:传送门
RandomForestRegressor 基础
1.导入模块,创建模型
import matplotlib.pyplot as plt #可视化图形库
import numpy as np #numpy多维数值操作库
import pandas as pd #pandas数据分析库
from sklearn import datasets, cross_validation, ensemble #sklearn机器学习库
2.引入数据,对数据进行分集
'''
加载用于回归问题的数据集
:return: 一个元组,用于回归问题。元组元素依次为:训练样本集、测试样本集、训练样本集对应的值、测试样本集对应的值
'''
diabetes = datasets.load_diabetes() # 使用 scikit-learn 自带的一个糖尿病病人的数据集
return cross_validation.train_test_split(diabetes.data, diabetes.target,
test_size=0.25, random_state=0) # 拆分成训练集和测试集,测试集大小为原始数据集大小的 1/4
3.模型预测
'''
测试 RandomForestRegressor 的用法
:param data: 可变参数。它是一个元组,这里要求其元素依次为:训练样本集、测试样本集、训练样本的 值、测试样本的值
:return: None
'''
X_train, X_test, y_train, y_test = data
regr = ensemble.RandomForestRegressor()
regr.fit(X_train, y_train)
print("Traing Score:%f" % regr.score(X_train, y_train))
print("Testing Score:%f" % regr.score(X_test, y_test))
训练集:0.89 测试集 :0.24
更多使用参考:链接
Demo
自定义数据拓展模型
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.externals import joblib
# 各指数计算
# PCA降维
def pcachange(pcadata):
global MType
pcadata1 = pcadata.fillna(0)
pca = PCA(n_components=1)
data_new = pca.fit_transform(pcadata1)
print pcadata1.columns, pca.explained_variance_ratio_
joblib.dump(pca, unicode('D:/pca/' + str(MType) + '.m', 'utf-8'))
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 100))
joblib.dump(pca, unicode('D:/minmax/' + str(MType) + '.m', 'utf-8'))
data_minmax = minmax_scaler.fit_transform(data_new)
return data_minmax
#分组计算
def cal_zs(input_features):
    """Build a composite index ("zs") for each feature group in turn.

    :param input_features: list of column-name lists; each group is sliced
        out of the module-level `data` frame and reduced via pcachange()
    :return: DataFrame for the *last* group processed (callers always pass
        exactly one group, so this is the only group's index)
    """
    for group in input_features:
        reduced = pcachange(data[group])
        zs_frame = pd.DataFrame(reduced)
    return zs_frame
#整个预处理过程
def preprocess(data_raw):
    """Whole preprocessing pipeline.

    For each business-category feature group (retail "ls", office "xzl",
    healthcare/leisure "yl", dining "cy", services "fw", education "jy",
    household "jj") a PCA-based composite index is computed via cal_zs(),
    then the info columns, the indices, selected raw feature groups and the
    two target columns are concatenated into one frame.

    :param data_raw: raw DataFrame; cols 0-11 are info, last two are targets
    :return: preprocessed DataFrame
    """
    global MType  # incremented before each group so pcachange() dumps to distinct files
    MType = 0
    info = data_raw.iloc[:,0:12]
    #diannei = data_raw.iloc[:,22:89]
    ylsale =data_raw.iloc[:,-2]  # second-to-last column: auxiliary sales target
    sale =data_raw.iloc[:,-1]    # last column: sales target
    # Each *_features value is a list containing ONE list of column names,
    # so cal_zs()'s loop runs exactly once per call.
    ls_features = [['t130403','t130201','t130200','t130204','t130207','t130102','t130103','t130105','t130106','t130104','t130101']]
    xzl_features = [['t200103','t200104']]
    yl_features = [['t180201','t180400','t180402','t180403','t180209','t180205','t180202','t180210','t180203',
    't180103','t180106','t180105','t180104','t180110','t180107','t180102','t180111','t180101','t180100','t120201','t120202','t120101','t120102']]
    cy_features = [['t110101','t110102','t110103','t110200','t110301','t110303','t110302']]
    fw_features = [['t230224','t230212','t230206','t230213','t230230','t230223','t230129','t230112','t230125','t230107','t230126','t230100','t230103','t230108']]
    jy_features = [[ 't160103', 't160104', 't160105']]
    jj_features = [
    ['t800000', 't800001', 't800010', 't800011', 't800012', 't800013', 't800014', 't800020', 't800030', 't800031',
    't800032', 't800035', 't800036', 't800037', 't8_0_19', 't8_0_29', 't8_0_39', 't8_0_49', 't8_10_29', 't8_10_39',
    't8_10_49', 't8_20_39', 't8_20_49', 't8_30_49']]
    MType = MType + 1
    lszs = cal_zs(ls_features)
    MType = MType + 1
    xzlzs = cal_zs(xzl_features)
    MType = MType + 1
    ylzs = cal_zs(yl_features)
    MType = MType + 1
    cyzs = cal_zs(cy_features)
    MType = MType + 1
    fwzs = cal_zs(fw_features)
    MType = MType + 1
    jyzs = cal_zs(jy_features)
    MType = MType + 1
    jjzs = cal_zs(jj_features)
    lszs.columns = ['lszs']
    xzlzs.columns = ['xzlzs']
    ylzs.columns = ['ylzs']
    cyzs.columns = ['cyzs']
    jyzs.columns = ['jyzs']
    jjzs.columns = ['jjzs']
    # NOTE(review): fwzs is never renamed (keeps default column name 0) while
    # every sibling index is — looks like an oversight; confirm downstream use.
    # Raw feature groups re-selected from data_raw for inclusion alongside the indices.
    ls = data_raw[['t130403','t130201','t130200','t130204','t130207','t130102','t130103','t130105','t130106','t130104','t130101']]
    cy = data_raw[['t110101','t110102','t110103','t110200','t110301','t110303','t110302']]
    fw = data_raw[['t230224','t230212','t230206','t230213','t230230','t230223','t230129','t230112','t230125','t230107','t230126','t230100','t230103','t230108']]
    yl = data_raw[['t180201', 't180400', 't180402', 't180403', 't180209', 't180205', 't180202', 't180210', 't180203',
    't180103', 't180106', 't180105', 't180104', 't180110', 't180107', 't180102', 't180111', 't180101',
    't180100', 't120201', 't120202', 't120101', 't120102']]
    jj = data_raw[['t800000', 't800001', 't800010', 't800011', 't800012', 't800013', 't800014', 't800020', 't800030', 't800031',
    't800032','t800035','t800036', 't800037', 't8_0_19', 't8_0_29', 't8_0_39', 't8_0_49', 't8_10_29', 't8_10_39',
    't8_10_49', 't8_20_39', 't8_20_49', 't8_30_49']]
    data_pre = pd.concat([info,lszs,xzlzs,ylzs,jyzs,ls,cy,cyzs,fw,fwzs,jjzs,jj,yl,ylsale,sale],axis = 1)
    return data_pre
# Load the raw feature file and the feature-name/label mapping file.
# na_values='NULL' turns literal "NULL" strings into NaN.
filepath = u'D:/data/f1.csv'
labelpath = u'D:/data/f2.csv'
data = pd.read_csv(filepath, header=0, sep=',',na_values='NULL')
label = pd.read_csv(labelpath, header=0, sep=',')
data2 = preprocess(data)
# Model feature names: skip the 12 info columns and the 2 trailing targets.
x_labels = data2.columns[12:-2]
x_labels_t = np.array(x_labels).T
print x_labels
# 销量分等级
def cat_sale(inputdata):
    """Bucket the sales column (last column) into 10 quantile grades.

    Adds an integer `salecat` column (1 = lowest decile, 10 = highest) to
    `inputdata` in place and returns the same frame.

    :param inputdata: DataFrame whose last column is the sales figure
    :return: the same DataFrame, with `salecat` appended
    """
    sales = inputdata.iloc[:, -1]
    grades = pd.qcut(sales, 10, labels=list(range(1, 11)))
    inputdata['salecat'] = grades
    inputdata['salecat'] = inputdata['salecat'].astype(int)
    return inputdata
# 随机森林算法
def rt_method(x_train, y_train, x_test, y_test):
    """Train a random forest on columns 12+ and report train/test quality.

    Columns 0-11 of x_train/x_test are treated as info columns and carried
    through unchanged; the remaining columns are the model features.

    Side effects: writes per-row predictions to D:/train.csv and D:/test.csv,
    stores the Pearson correlations into the module-level `result` frame at
    row `CityIndex`, and stores feature importances into the module-level
    `feature_imp` frame.

    :return: the fitted RandomForestRegressor
    """
    global CityIndex,CityName
    x_train1 = x_train.iloc[:, 12:]       # model features
    info_train = x_train.iloc[:, 0:12]    # pass-through info columns
    info_train1 = info_train.reset_index(drop=True)
    # oob_score=True gives an out-of-bag generalization estimate for free.
    rf0 = RandomForestRegressor(n_estimators=100, max_features='sqrt',oob_score=True)
    x_test1 = x_test.iloc[:, 12:]
    #rf0.fit(x_test1, y_test)
    rf0.fit(x_train1, y_train)
    y1 = rf0.predict(x_train1)
    y_train_pred = pd.DataFrame(y1, columns=['cat_pred'])
    # reset_index so the concat below aligns rows positionally, not by label.
    y_train1 = y_train.reset_index(drop=True)
    info_test = x_test.iloc[:, 0:12]
    info_test1 = info_test.reset_index(drop=True)
    y2 = rf0.predict(x_test1)
    y_test_pred = pd.DataFrame(y2, columns=['cat_pred'])
    y_test1 = y_test.reset_index(drop=True)
    result_train = pd.concat([info_train1, y_train_pred, y_train1], axis=1)
    result_train1 = result_train.rename(columns={'salecat': 'cat_true'})
    # PCV: predicted category scaled by 10 (presumably a 0-100 score — confirm).
    result_train1['PCV'] = result_train1['cat_pred'] * 10
    result_train1.to_csv(unicode('D:/train.csv','utf-8'), index=False, sep=',')
    result_test = pd.concat([info_test1, y_test_pred, y_test1], axis=1)
    result_test1 = result_test.rename(columns={'salecat': 'cat_true'})
    result_test1['PCV'] = result_test1['cat_pred'] * 10
    result_test1.to_csv(unicode('D:/test.csv','utf-8'), index=False, sep=',')
    # Pearson correlation between predicted and true grades, train and test.
    r1 = result_train1.cat_pred.corr(result_train1.cat_true)
    r2 = result_test1.cat_pred.corr(result_test1.cat_true)
    print r1, r2
    result.loc[CityIndex, ['train_R']] = r1
    result.loc[CityIndex, ['test_R']] = r2
    importances = rf0.feature_importances_
    df_ipt = pd.DataFrame(importances, columns=["feature_importance"])
    feature_imp["feature_importance"] = df_ipt
    return rf0
# ---- main script ----
# Note: `global` at module level is a no-op; kept from the original script.
global CityName,CityIndex
CityIndex = 0
feature_imp = pd.DataFrame(data=[])
feature_imp['feature'] = x_labels_t
result = pd.DataFrame(data=[], index=[], columns=['city', 'train_R', 'test_R', 'num'])
# NOTE(review): training uses the raw `data`, not the preprocessed `data2` —
# data2 is only used above for x_labels. Confirm this is intended.
data3 = cat_sale(data)
X = data3.iloc[:, 0:-2]   # drop the 2 target columns
Y = data3['salecat']
Y = Y.fillna(1)
X1 = X.fillna(0)  # fill missing values with 0
X2 = X1.replace('t', 1)  # replace literal 't' cells with 1
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.3, random_state=42)
rf = rt_method(X_train, Y_train, X_test, Y_test)
joblib.dump(rf, unicode('D:/data/model/0116/ly/全国.m', 'utf-8'))
# The triple-quoted string below is a disabled per-city training loop.
'''for city, ctdata in data3.groupby(['city_name']):
print city, CityIndex
CityName = city
result.loc[CityIndex, ['city']] = city
n = ctdata.iloc[:, 0].size # 行数
if n > 20:
X = ctdata.iloc[:, 0:-2]
Y = ctdata['salecat']
Y = Y.fillna(1)
X1 = X.fillna(0) # 用0填充空值
X2 = X1.replace('t', 1) # 用1填充t值
X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.3, random_state=42)
try:
rf = rt_method(X_train, Y_train, X_test, Y_test)
joblib.dump(rf, unicode('D:/data/model/0115/ly/' + str(CityName) + '.m', 'utf-8'))
except:
print ('wrong')
else:
print n
result.loc[CityIndex, ['num']] = n
CityIndex = CityIndex + 1'''
# Attach human-readable labels to the importances and write summary CSVs.
feature_imp1 = pd.merge(label, feature_imp, on='feature',how = 'right')
result.to_csv(u'D:/R.csv', index=False, sep=',',encoding = 'gbk')
feature_imp1.to_csv(u'D:/变量重要性.csv', index=False, sep=',',encoding = 'gbk')
上一篇: php rmdir删除非空目录的方法