基于随机森林模型的红酒品质分析
程序员文章站
2022-07-14 15:18:23
...
看了南京大学的《用Python玩转数据》视频,Python非常强大。代码做了些注释。
# url: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
# 导入模块
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
# Load the red-wine quality data set; the CSV uses ';' as its field separator.
try:
    wine = pd.read_csv('winequality-red.csv', sep=';')
except FileNotFoundError:
    # Nothing below can run without the data, so report the problem and stop
    # here instead of falling through to a NameError on `wine`.
    print("Cannot find the file!")
    raise
# Column overview. DataFrame.info() prints its report itself and returns None,
# so it is called directly rather than wrapped in print().
wine.info()
print(wine.describe())  # basic summary statistics of every column
wine = wine.drop_duplicates()  # remove duplicated records
# Pie chart showing the share of samples at each quality score.
quality_counts = wine['quality'].value_counts()
quality_counts.plot(kind='pie', autopct='%.2f')
plt.show()

# Pearson correlation coefficient between quality and every column.
print(wine.corr()['quality'])

# Side-by-side bar charts of volatile acidity (left) and alcohol (right)
# for each quality score.
for position, feature in ((121, 'volatile acidity'), (122, 'alcohol')):
    plt.subplot(position)
    sns.barplot(x='quality', y=feature, data=wine)
plt.show()
from sklearn.preprocessing import LabelEncoder

# Bucket the numeric quality score into three named grades. The bin edges
# form left-open / right-closed intervals: (2, 4], (4, 6], (6, 8].
group_names = ['low', 'medium', 'high']
bins = (2, 4, 6, 8)
wine['quality_lb'] = pd.cut(wine.quality, bins, labels=group_names)

# Turn the grade names into the integer codes 0, 1, 2 stored in 'label'.
# LabelEncoder numbers its classes in sorted order, so the codes follow the
# alphabetical order of the grade names; inspect lb_quality.classes_ to see
# which code belongs to which grade.
lb_quality = LabelEncoder()
wine['label'] = lb_quality.fit_transform(wine['quality_lb'])

# Distribution of the samples over the new classes.
print(wine['label'].value_counts())

# Keep a full copy (with 'quality' and 'quality_lb') before removing the two
# columns that the 'label' column was derived from.
wine_copy = wine.copy()
wine = wine.drop(columns=['quality', 'quality_lb'])
# Separate the feature columns from the target: X takes every column except
# the last one, y is the 'label' column.
X = wine.iloc[:, :-1]
y = wine.label
from sklearn.model_selection import train_test_split
# Randomly hold out 20% of the samples as the test set (test_size is the
# proportion assigned to the test split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Standardise the features.
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only and apply that same transform to
# the test split. Calling scale() on each split separately would standardise
# the test set with its own mean/std, so the test features would not be in
# the space the model was trained on (and test statistics would leak into
# preprocessing).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.metrics import confusion_matrix

# Baseline classifier: a random forest with 200 trees (n_estimators is the
# number of trees in the forest), trained on the training split.
rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

# Predict the held-out samples and compare the predictions with the true
# labels through a confusion matrix.
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, y_pred))
# Candidate hyper-parameters: forest size and split-quality criterion.
param_rfc = {
    "n_estimators": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200],
    "criterion": ["gini", "entropy"],
}
# Exhaustive grid search over every combination, scored with 5-fold
# cross-validation on the training split.
# NOTE: the former `iid=False` argument is intentionally not passed. `iid` was
# deprecated in scikit-learn 0.22 and removed in 0.24, so passing it raises
# TypeError on current releases; from 0.22 on, omitting it behaves like
# iid=False.
grid_rfc = GridSearchCV(rfc, param_rfc, cv=5)
grid_rfc.fit(X_train, y_train)
# best_param_rfc is the parameter combination that achieved the best
# cross-validation result.
best_param_rfc = grid_rfc.best_params_
print(best_param_rfc)
# Retrain with the selected parameters (fixed seed) and predict again.
rfc = RandomForestClassifier(
    n_estimators=best_param_rfc['n_estimators'],
    criterion=best_param_rfc['criterion'],
    random_state=0,
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, y_pred))