[Course] Advanced Computer Programming, Homework, week 15, scikit-learn
scikit-learn example
import numpy as np
from sklearn import datasets
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# make_classification returns a tuple (X, y): a (2000, 10) feature matrix and 0/1 labels
data = datasets.make_classification(n_samples=2000, n_features=10)
data[0][:5]  # first five feature rows
array([[ 1.94224588, -0.90004129, -1.29373233, -2.68480168, 0.33210151,
0.12402016, -0.85576276, 1.17830754, 0.49223751, 1.96238429],
[ 0.12985395, 0.54406644, -0.35623636, 0.42980539, -0.36361682,
0.43473325, -0.66051895, -0.45035764, -1.12449465, -0.82254233],
[-0.38586157, 0.82526339, -1.11620907, -0.78525796, -0.40823213,
0.13700943, 0.24143533, -1.02235791, 1.03706833, -1.50767359],
[-0.48923486, 0.13611755, 0.70740195, -1.74092075, -1.15887133,
0.96963557, -0.94061012, -2.04747774, -1.61738509, -0.41771673],
[-0.54016059, 0.13232637, 0.62608625, -0.81078144, -0.31259478,
-0.10926979, 0.78459312, -1.09810794, 1.88645398, -0.11686013]])
data[1][:5]  # first five labels
array([0, 0, 0, 0, 1])
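Note that make_classification draws a fresh random dataset on every call, so the exact fold scores below will vary from run to run. A minimal sketch of a reproducible setup (the value random_state=0 is an arbitrary illustrative choice, not part of the original assignment):

# fixing random_state makes the generated dataset, and hence the CV scores, repeatable
data = datasets.make_classification(n_samples=2000, n_features=10, random_state=0)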
clfs = [GaussianNB(),
        SVC(C=0.1, kernel='rbf', gamma=0.1),
        RandomForestClassifier(n_estimators=100)]
scoring = ['f1_micro', 'f1_macro']
for clf in clfs:
    # 10-fold cross-validation, scoring every fold with micro- and macro-averaged F1
    scores = cross_validate(clf, data[0], data[1], scoring=scoring, cv=10)
    print('--------------------')
    print(str(clf))
    print()
    print('micro: ')
    print(scores['test_f1_micro'])
    print('macro: ')
    print(scores['test_f1_macro'])
    print('ave: ', np.mean(scores['test_f1_micro']), np.mean(scores['test_f1_macro']))
--------------------
GaussianNB(priors=None)
micro:
[0.90547264 0.91 0.905 0.96 0.925 0.93
0.94 0.905 0.945 0.94974874]
macro:
[0.90546328 0.90985577 0.90499762 0.95998399 0.92498312 0.92997199
0.93994595 0.90497862 0.94498762 0.949717 ]
ave: 0.9275221380534514 0.9274884968705537
--------------------
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
micro:
[0.91044776 0.945 0.93 0.985 0.935 0.965
0.96 0.945 0.965 0.9798995 ]
macro:
[0.91026786 0.94498762 0.92993694 0.98499962 0.93498537 0.96497811
0.959996 0.9449656 0.96499912 0.97989899]
ave: 0.9520347258681466 0.9520015248604876
--------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
micro:
[0.92039801 0.945 0.935 0.96 0.935 0.965
0.945 0.95 0.965 0.97487437]
macro:
[0.9202381 0.94499862 0.93492028 0.95998399 0.93499837 0.96499212
0.94499862 0.94995496 0.96499912 0.97487437]
ave: 0.9495272381809545 0.9494958570594563
The classifiers are evaluated with the micro-F1 and macro-F1 metrics under 10-fold cross-validation. The results above show that, on this randomly constructed binary classification problem, the SVM performs best on the test folds.
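To make the two averages concrete: micro-F1 pools the true/false positives of all classes before computing F1 (for single-label classification it equals accuracy), while macro-F1 averages the per-class F1 scores, so a rare class counts as much as a common one. A minimal sketch on a made-up imbalanced labeling (the arrays below are illustrative, not taken from the experiment above):

from sklearn.metrics import f1_score

# toy labels: class 1 is the minority class (2 of 10 samples)
y_true = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
y_pred = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1]

# micro-F1 pools all decisions, so here it equals accuracy: 8/10 = 0.8
print(f1_score(y_true, y_pred, average='micro'))
# macro-F1 averages per-class F1 (0.875 for class 0, 0.5 for class 1): 0.6875
print(f1_score(y_true, y_pred, average='macro'))

On the near-balanced classes that make_classification generates by default, the two averages barely differ, which is why the micro and macro scores above are almost identical.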