随机森林的应用简例
程序员文章站
2022-05-06 19:14:35
用jupyter lab 做的 太多了懒得排版%matplotlib inlineimport numpy as npimport pandas as pdfrom sklearn.model_selection import GridSearchCVfrom sklearn.ensemble import RandomForestClassifierdf_t = pd.read_excel(r'D:\EdgeDownloadPlace\复赛数据集\train.xlsx',header=Non...
以下代码在 Jupyter Lab 中完成(代码与各单元格的输出交替出现),内容较多,未作排版。
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# Load the labelled training sheet. It has no header row, so pandas assigns
# integer column labels that are replaced with real names below.
df_t = pd.read_excel(r'D:\EdgeDownloadPlace\复赛数据集\train.xlsx', header=None)

import re

# features.txt presumably holds one "<index> <name>" entry per line (inferred
# from the pattern below -- confirm against the file). The explicit encoding
# keeps the read independent of the machine's locale code page.
with open(r'D:\EdgeDownloadPlace\复赛数据集\features.txt', encoding='utf-8') as f:
    # "$" under re.MULTILINE matches before every newline AND at the end of
    # the text, so the last entry is kept even when the file has no trailing
    # newline (the previous pattern required a literal "\n").
    features = re.findall(r'[0-9] (.*)$', f.read(), flags=re.MULTILINE)

# Column layout applied to train.xlsx: 'uid', the named features, 'target'.
features.insert(0, 'uid')
features.append('target')
df_t.columns = features

# 'uid' is removed so that only the features and the trailing 'target'
# column remain in the array used for training.
df_t = df_t.drop(columns='uid')
arr_t = df_t.values
import time

# Coarse sweep of the forest size (1, 41, 81, 121, 161 trees) with 4-fold
# cross-validation; the wall-clock duration is kept in `time_span` (seconds).
start_time = time.time()
param_grid = {'n_estimators': np.arange(1, 201, 40)}
rfc = RandomForestClassifier(criterion='entropy', random_state=435681971)
gs = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=4)
# Every column except the last is a feature; the last column is the label.
gs.fit(arr_t[:, :-1], arr_t[:, -1])
# Stored as [best mean CV score, best parameter dict].
peak_n_lst = [gs.best_score_, gs.best_params_]
end_time = time.time()
time_span = end_time - start_time
peak_n_lst
[0.9373961218836566, {'n_estimators': 161}]
time_span
107.33542943000793
# Fine sweep of the forest size: every integer in a 40-wide window centred on
# the coarse-search winner.
peak_n = peak_n_lst[1]['n_estimators']
start_time = time.time()
# n_estimators must be at least 1. If the coarse winner is small (the coarse
# grid starts at 1), `peak_n - 20` would be zero or negative and the
# corresponding fits would fail, so clamp the lower bound.
n_lo = max(1, peak_n - 20)
param_grid = {'n_estimators': np.arange(n_lo, peak_n + 20)}
rfc = RandomForestClassifier(random_state=435681971,
                             criterion='entropy')
gs = GridSearchCV(rfc, param_grid, cv=4)
# Every column except the last is a feature; the last column is the label.
gs.fit(arr_t[:, :-1], arr_t[:, -1])
# Stored as [best mean CV score, best parameter dict].
peak_n_lst = [gs.best_score_, gs.best_params_]
end_time = time.time()
time_span = end_time - start_time  # wall-clock seconds
peak_n = peak_n_lst[1]['n_estimators']
peak_n
170
# Coarse sweep of the tree depth, in steps of 30, from 1 up to half the number
# of features, using the forest size chosen earlier.
start_time = time.time()
# The upper bound used to be the literal 561 // 2; 561 is the number of
# feature columns in this dataset. Deriving it from the array (all columns
# minus the trailing label) keeps the grid correct if the feature set changes.
n_features = arr_t.shape[1] - 1
param_grid = {'max_depth': np.arange(1, n_features // 2, 30)}
rfc = RandomForestClassifier(random_state=435681971,
                             n_estimators=peak_n,
                             criterion='entropy')
gs = GridSearchCV(rfc, param_grid, cv=4)
# Every column except the last is a feature; the last column is the label.
gs.fit(arr_t[:, :-1], arr_t[:, -1])
# Stored as [best mean CV score, best parameter dict].
peak_depth_lst = [gs.best_score_, gs.best_params_]
end_time = time.time()
time_span = end_time - start_time  # wall-clock seconds
peak_depth_lst
[0.9386426592797784, {'max_depth': 31}]
time_span
407.307009935379
# The search result is a two-item list [best score, best params]; take the
# winning depth from the params dict (the last item).
peak_depth = peak_depth_lst[-1]['max_depth']
peak_depth
31
# Fine sweep of the tree depth: every integer from 20 below to 29 above the
# coarse-search winner.
start_time = time.time()
# max_depth must be at least 1. The coarse grid starts at depth 1, so
# `peak_depth - 20` can be zero or negative; clamp the lower bound so that no
# invalid depth is handed to the forest.
depth_lo = max(1, peak_depth - 20)
param_grid = {'max_depth': np.arange(depth_lo, peak_depth + 30)}
rfc = RandomForestClassifier(random_state=435681971,
                             n_estimators=peak_n,
                             criterion='entropy')
gs = GridSearchCV(rfc, param_grid, cv=4)
# Every column except the last is a feature; the last column is the label.
gs.fit(arr_t[:, :-1], arr_t[:, -1])
# Stored as [best mean CV score, best parameter dict].
peak_depth_lst = [gs.best_score_, gs.best_params_]
end_time = time.time()
time_span = end_time - start_time  # wall-clock seconds
peak_depth_lst
[0.9393351800554016, {'max_depth': 14}]
time_span/60
31.484332279364267
# Unpack the [best score, best params] pair and keep the refined depth.
_depth_score, _depth_params = peak_depth_lst
peak_depth = _depth_params['max_depth']
peak_depth
14
# Coarse sweep of min_samples_split (2, 32, 62, 92, 122) with the forest size
# and depth chosen earlier.
start_time = time.time()
param_grid = {'min_samples_split': np.arange(2, 125, 30)}
rfc = RandomForestClassifier(
    criterion='entropy',
    max_depth=peak_depth,
    n_estimators=peak_n,
    random_state=435681971,
)
gs = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=4)
# Every column except the last is a feature; the last column is the label.
gs.fit(arr_t[:, :-1], arr_t[:, -1])
# NOTE: the variable name is reused from the depth search; here it holds the
# min_samples_split result as [best mean CV score, best parameter dict].
peak_depth_lst = [gs.best_score_, gs.best_params_]
end_time = time.time()
time_span = end_time - start_time  # wall-clock seconds
time_span / 60
3.157407291730245
# NOTE: `peak_depth_lst` was not renamed in the cell above; at this point it
# holds the min_samples_split search result, not a depth result.
peak_minss = peak_depth_lst[-1]['min_samples_split']
peak_minss
2
# Fine sweep of min_samples_split over every integer from 2 to 29, with the
# forest size and depth chosen earlier.
start_time = time.time()
param_grid = {'min_samples_split': np.arange(2, 30)}
rfc = RandomForestClassifier(
    criterion='entropy',
    max_depth=peak_depth,
    n_estimators=peak_n,
    random_state=435681971,
)
gs = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=4)
# Every column except the last is a feature; the last column is the label.
gs.fit(arr_t[:, :-1], arr_t[:, -1])
# The depth-search variable name is reused; it now holds the
# min_samples_split result as [best mean CV score, best parameter dict].
peak_depth_lst = [gs.best_score_, gs.best_params_]
end_time = time.time()
time_span = end_time - start_time  # wall-clock seconds
time_span
1053.7646670341492
peak_depth_lst
[0.9393351800554016, {'min_samples_split': 2}]
# Unpack the [best score, best params] pair (stored under the reused name
# `peak_depth_lst`) and keep the refined min_samples_split.
_minss_score, _minss_params = peak_depth_lst
peak_minss = _minss_params['min_samples_split']
peak_minss
2
import matplotlib.pyplot as plt
# Final hyper-parameters written as literals; presumably so the cells below
# can run without repeating the grid searches.
# NOTE(review): the min_samples_split search above reported 2, but 3 is used
# here -- confirm this is intentional.
peak_n, peak_depth, peak_minss = 170, 14, 3
#rfc = RandomForestClassifier(random_state = 435681971
# ,n_estimators = peak_n
# ,max_depth = peak_depth
# ,min_samples_split = peak_minss
# ,oob_score = True)
#rfc.fit(arr_t[:,:-1],arr_t[:,-1])
#rfc.oob_score_
#plt.figure(figsize = [20,5])
#score_lst=[]
#for i in range(30):
# rfc = RandomForestClassifier(#random_state = 435681971
# n_estimators = peak_n
# ,max_depth = peak_depth
# ,min_samples_split = peak_minss
# ,oob_score = True)
# rfc.fit(arr_t[:,:-1],arr_t[:,-1])
# score_lst.append(rfc.oob_score_)
#plt.plot(range(1,31),score_lst)
# Refit the tuned forest 30 times WITHOUT a fixed seed and record each
# out-of-bag score, to see how much the score moves from run to run.
forest_kwargs = dict(
    # random_state is deliberately left unset (it was 435681971 in the
    # grid-search cells).
    n_estimators=peak_n,
    max_depth=peak_depth,
    min_samples_split=peak_minss,
    oob_score=True,
    criterion='entropy',
)
score_lst = []
for i in range(30):
    rfc = RandomForestClassifier(**forest_kwargs)
    # Every column except the last is a feature; the last column is the label.
    rfc.fit(arr_t[:, :-1], arr_t[:, -1])
    score_lst.append(rfc.oob_score_)

# x axis: run number 1..30, y axis: OOB score of that run.
plt.plot(range(1, 31), score_lst, color='red')
plt.show()
# Redraw the same OOB-score curve on a wider 15 x 6 inch figure.
plt.figure(figsize=(15, 6))
run_numbers = range(1, 31)
plt.plot(run_numbers, score_lst, color='red')
plt.show()
# Forests built without a fixed seed differ from run to run, so retrain until
# one clears the out-of-bag accuracy target.
#
# The previous `while True` loop had no upper bound and would never finish if
# the target was not reached. The number of attempts is now capped, and the
# best forest seen is used as a fallback.
# NOTE(review): keeping the best of many unseeded runs makes the reported OOB
# score optimistic; treat it as a selection criterion, not an unbiased
# estimate.
OOB_TARGET = 0.978
MAX_ATTEMPTS = 100

best_rfc = None
for attempt in range(1, MAX_ATTEMPTS + 1):
    rfc = RandomForestClassifier(
        # random_state is deliberately left unset so each attempt differs.
        n_estimators=peak_n,
        max_depth=peak_depth,
        min_samples_split=peak_minss,
        oob_score=True,
        criterion='entropy',
    )
    # Every column except the last is a feature; the last column is the label.
    rfc.fit(arr_t[:, :-1], arr_t[:, -1])
    if best_rfc is None or rfc.oob_score_ > best_rfc.oob_score_:
        best_rfc = rfc
    if rfc.oob_score_ > OOB_TARGET:
        break
else:
    # Only reached when no attempt beat the target.
    print('OOB target {} not reached in {} attempts; using best forest '
          '(oob_score_={:.4f})'.format(OOB_TARGET, MAX_ATTEMPTS,
                                       best_rfc.oob_score_))

# `rfc` is the model the later prediction cell calls `predict` on.
rfc = best_rfc
# Load the test sheet (no header row). It gets the same column names as the
# training data minus the trailing 'target' entry, after which the 'uid'
# column is dropped.
df_a = pd.read_excel(r'D:\EdgeDownloadPlace\复赛数据集\test.xlsx', header=None)
test_columns = features[:-1]
df_a.columns = test_columns
df_a = df_a.drop(columns=['uid'])
df_a
tBodyAcc-mean()-X tBodyAcc-mean()-Y tBodyAcc-mean()-Z tBodyAcc-std()-X tBodyAcc-std()-Y tBodyAcc-std()-Z tBodyAcc-mad()-X tBodyAcc-mad()-Y tBodyAcc-mad()-Z tBodyAcc-max()-X ... fBodyBodyGyroJerkMag-meanFreq() fBodyBodyGyroJerkMag-skewness() fBodyBodyGyroJerkMag-kurtosis() angle(tBodyAccMean,gravity) angle(tBodyAccJerkMean),gravityMean) angle(tBodyGyroMean,gravityMean) angle(tBodyGyroJerkMean,gravityMean) angle(X,gravityMean) angle(Y,gravityMean) angle(Z,gravityMean)
0 0.278 -0.01640 -0.1240 -0.998 -0.9750 -0.960 -0.999 -0.9750 -0.958 -0.9430 ... 0.1580 -0.5950 -0.861 0.0535 -0.00743 -0.733 0.7040 -0.845 0.180 -0.0543
1 0.281 -0.00996 -0.1060 -0.995 -0.9730 -0.986 -0.995 -0.9740 -0.986 -0.9400 ... 0.2670 0.3400 0.140 -0.0206 -0.12800 -0.483 -0.0707 -0.848 0.190 -0.0344
2 0.277 -0.01470 -0.1070 -0.999 -0.9910 -0.993 -0.999 -0.9910 -0.992 -0.9430 ... 0.7400 -0.5640 -0.766 0.1060 -0.09030 -0.132 0.4990 -0.850 0.189 -0.0351
3 0.279 -0.02300 -0.1220 -0.997 -0.9750 -0.983 -0.997 -0.9730 -0.984 -0.9420 ... 0.6620 -0.7820 -0.954 -0.1220 -0.02910 -0.013 -0.0569 -0.761 0.263 0.0242
4 0.280 -0.01390 -0.1060 -0.998 -0.9880 -0.990 -0.998 -0.9880 -0.992 -0.9420 ... 0.4290 -0.3290 -0.597 -0.0283 0.09240 -0.822 0.3680 -0.759 0.264 0.0297
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3074 0.231 -0.04230 -0.0899 -0.309 -0.0791 -0.152 -0.391 -0.0870 -0.257 0.0562 ... -0.0310 -0.1390 -0.589 0.2730 0.85600 -0.962 0.9530 -0.657 0.276 0.1770
3075 0.357 -0.04460 -0.1300 -0.314 -0.0556 -0.173 -0.386 -0.0575 -0.217 0.0262 ... 0.0168 -0.1630 -0.593 -0.7110 -0.06120 -0.706 0.0646 -0.660 0.274 0.1760
3076 0.284 -0.00796 -0.1190 -0.309 -0.0804 -0.211 -0.369 -0.0971 -0.301 -0.1170 ... -0.1100 0.0245 -0.393 -0.0761 -0.23900 0.960 0.0866 -0.657 0.272 0.1830
3077 0.207 0.02460 -0.1040 -0.365 -0.1690 -0.216 -0.449 -0.1860 -0.326 -0.1760 ... -0.2140 -0.3520 -0.734 0.5350 -0.25700 0.927 -0.0843 -0.657 0.267 0.1880
3078 0.331 -0.06400 -0.1170 -0.068 0.1560 -0.317 -0.149 0.0701 -0.291 0.4120 ... -0.0214 -0.0863 -0.468 -0.3510 -0.33600 0.967 -0.7150 -0.810 0.185 0.1210
3079 rows × 561 columns
# Plain NumPy array of the test features, in the same column order as the
# training features.
arr_a = df_a.to_numpy()
arr_a
array([[ 0.278 , -0.0164 , -0.124 , ..., -0.845 , 0.18 , -0.0543 ],
[ 0.281 , -0.00996, -0.106 , ..., -0.848 , 0.19 , -0.0344 ],
[ 0.277 , -0.0147 , -0.107 , ..., -0.85 , 0.189 , -0.0351 ],
...,
[ 0.284 , -0.00796, -0.119 , ..., -0.657 , 0.272 , 0.183 ],
[ 0.207 , 0.0246 , -0.104 , ..., -0.657 , 0.267 , 0.188 ],
[ 0.331 , -0.064 , -0.117 , ..., -0.81 , 0.185 , 0.121 ]])
# Predict a label for every test row, cast the labels to 8-bit integers and
# convert them to a plain Python list.
predicted = rfc.predict(arr_a)
answer = predicted.astype(np.int8).tolist()
len(answer)
3079
# Wrap the predictions in a single-column DataFrame (the column is labelled 0
# by default).
answer_df = pd.DataFrame(data=answer)
answer_df
0
0 5
1 5
2 5
3 5
4 5
... ...
3074 2
3075 2
3076 2
3077 2
3078 3
3079 rows × 1 columns
# Write the predictions to an Excel workbook; with the default options the
# row index and the column label are written as well.
out_path = r'D:\EdgeDownloadPlace\复赛数据集\ANS\20201104try.xlsx'
answer_df.to_excel(out_path)
print('ok')
ok
本文地址:https://blog.csdn.net/wldcmzy/article/details/109637639
上一篇: 曾忠勇报国的仆固怀恩,为何后来会起兵*?