基于随机森林模型的红酒品质分析

程序员文章站 2022-07-14 15:18:23

...

看了南京大学的《用Python玩转数据》视频课程，Python非常强大。下面的代码加了一些注释。

# url: https://archive.ics.uci.edu/ml/datasets/Wine+Quality
# 导入模块
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore') 
 
# Load the red-wine data set; the file uses ';' as its field separator.
try:
    wine = pd.read_csv('winequality-red.csv', sep=';')
except FileNotFoundError as err:
    # Everything below needs `wine`, so stop here instead of printing a
    # message and then failing later with a NameError. Other read/parse
    # errors are no longer mislabelled as "file not found"; they propagate
    # with their own traceback.
    print("Cannot find the file!")
    raise SystemExit(1) from err
 
 
# Overview of the raw data: column dtypes / non-null counts, then summary
# statistics. DataFrame.info() prints its report itself and returns None, so
# it is called directly; wrapping it in print() emitted a stray "None".
wine.info()
print(wine.describe())
wine = wine.drop_duplicates()  # drop repeated records

# Pie chart of how many samples fall into each quality score
wine['quality'].value_counts().plot(kind='pie', autopct='%.2f')
plt.show()

# Pearson correlation between quality and every other attribute
print(wine.corr().quality)

# Mean volatile acidity (left) and mean alcohol (right) per quality score
plt.subplot(121)
sns.barplot(x='quality', y='volatile acidity', data=wine)
plt.subplot(122)
sns.barplot(x='quality', y='alcohol', data=wine)
plt.show()
 
from sklearn.preprocessing import LabelEncoder

# Bucket the numeric quality score into three named grades. pd.cut builds
# left-open, right-closed intervals from the edges: (2, 4], (4, 6], (6, 8].
bins = (2, 4, 6, 8)
group_names = ['low', 'medium', 'high']
wine['quality_lb'] = pd.cut(wine['quality'], bins=bins, labels=group_names)

# Turn the grade names into integer class ids (0, 1, 2) stored in `label`.
lb_quality = LabelEncoder()
wine['label'] = lb_quality.fit_transform(wine['quality_lb'])

# Show how many samples ended up in each encoded class.
print(wine['label'].value_counts())

# Snapshot that still carries `quality` and `quality_lb`.
wine_copy = wine.copy()

# Remove the raw score and the grade name so only the features and the
# encoded target remain, then split them into X (features) and y (target).
wine.drop(columns=['quality', 'quality_lb'], inplace=True)
X = wine.drop(columns='label')  # `label` is the last column, so this is every column before it
y = wine['label']
 
from sklearn.model_selection import train_test_split
# `scale` is no longer used here but stays imported in case later code relies on it.
from sklearn.preprocessing import StandardScaler, scale  # noqa: F401

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Standardise the features. The scaler is fitted on the training set only and
# the same mean/std are then applied to the test set. Scaling each split
# independently (as scale(X_train) / scale(X_test) did) puts the two sets on
# different scales and uses test-set statistics during preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
 
from sklearn.metrics import confusion_matrix

# Baseline model: a random forest with 200 trees (n_estimators is the number
# of trees in the forest). fit() returns the estimator itself, so building
# and training are chained.
rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

# Predict the held-out samples and compare them with the true classes.
y_pred = rfc.predict(X_test)
baseline_cm = confusion_matrix(y_test, y_pred)
print(baseline_cm)
 
# Hyper-parameter grid to search over
param_rfc = {
    "n_estimators": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200],
    "criterion": ["gini", "entropy"],
}

# Exhaustive grid search with 5-fold cross-validation. The former `iid=False`
# argument was removed from GridSearchCV (scikit-learn 0.24) and now raises a
# TypeError; the remaining behaviour (plain mean of the fold scores) is what
# iid=False used to select, so dropping it does not change the result.
grid_rfc = GridSearchCV(rfc, param_rfc, cv=5)
grid_rfc.fit(X_train, y_train)

# Parameter combination that achieved the best cross-validated score
best_param_rfc = grid_rfc.best_params_
print(best_param_rfc)

# Retrain with the best parameters (fixed seed) and evaluate on the test set.
rfc = RandomForestClassifier(
    n_estimators=best_param_rfc['n_estimators'],
    criterion=best_param_rfc['criterion'],
    random_state=0,
)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_test, y_pred))