# K-Nearest Neighbors (KNN) -- iris dataset walkthrough
# Source: 程序员文章站, published 2024-01-25 08:02:46
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
%matplotlib inline
# Load the iris dataset
iris = load_iris()
# Feature values: (150, 4) array of measurements
data = iris.data
# Target values: class labels 0/1/2
target = iris.target
# Build the k-nearest-neighbors classifier object (default n_neighbors=5)
knn = KNeighborsClassifier() # knn: sklearn.neighbors.classification.KNeighborsClassifier
# Naive split: first 120 samples train, last 30 test.
# NOTE: load_iris returns samples grouped by class, so this split is
# unbalanced -- the shuffled split further below addresses that.
X_train = data[:120]
X_test = data[120:]
y_train = target[:120]
y_test = target[120:]  # bug fix: was misspelled `y_text`
knn.fit(X_train, y_train)
y_ = knn.predict(X_test)
# Accuracy = number of correctly predicted samples / total predicted samples
acc = (y_test == y_).sum()/y_.size
# Shuffle the sample set so the train/test split is not class-ordered
# (avoids an unbalanced split).
iris_index = np.random.permutation(150)
target, data = target[iris_index], data[iris_index]
# 120-sample training set, 30-sample test set
X_train, X_test = data[:120], data[120:]
y_train, y_test = target[:120], target[120:]
# Retrain the model; fitting again overwrites the previous training result
knn.fit(X_train, y_train)
# 一个模型的好坏.不能只看某一次的评分, 训练多次, 用平均值来评价
def split_train_test(X, y, train_size, random_seed):
    """Shuffle the sample set and split it into train/test subsets.

    Parameters
    ----------
    X : ndarray
        Sample feature values (first axis indexes samples).
    y : ndarray
        Sample target values, same length as X.
    train_size : float
        Fraction of the samples placed in the training set (e.g. 0.7).
    random_seed : int
        Seed for the shuffle, so model scores can be compared across
        different train/test ratios on reproducible splits.

    Returns
    -------
    X_train, X_test, y_train, y_test : ndarray
    """
    # Use a dedicated RandomState instead of np.random.seed so the global
    # RNG state is not clobbered (same permutation for a given seed).
    rng = np.random.RandomState(random_seed)
    shuffle_index = rng.permutation(y.size)
    # Local names chosen to avoid shadowing the module-level data/target.
    shuffled_y = y[shuffle_index]
    shuffled_X = X[shuffle_index]
    n_split = int(y.size * train_size)
    X_train = shuffled_X[:n_split]
    X_test = shuffled_X[n_split:]
    y_train = shuffled_y[:n_split]
    y_test = shuffled_y[n_split:]
    return X_train, X_test, y_train, y_test
# 7/3分
# 70/30 split: average the test accuracy over 10 differently-seeded shuffles
acc_list = []
for i in range(10):
    X_train, X_test, y_train, y_test = split_train_test(
        data, target, 0.7, random_seed=i)
    knn.fit(X_train, y_train)
    acc = knn.score(X_test, y_test)
    acc_list.append(acc)
np.array(acc_list).mean()
# # 8/2分
# acc_list = []
# for i in range(10):
# X_train, X_test, y_train, y_test = split_train_test(data,target, 0.8,random_seed=i)
# knn.fit(X_train, y_train)
# acc = knn.score(X_test, y_test)
# acc_list.append(acc)
# np.array(acc_list).mean()
# 封装 --> 模型评价
def acc_model(knn, data, target, train_size, times):
    """Evaluate a model by averaging its test accuracy over several splits.

    Parameters
    ----------
    knn : estimator with fit/score methods (e.g. KNeighborsClassifier)
    data : ndarray -- sample feature values
    target : ndarray -- sample target values
    train_size : float -- fraction of samples used for training
    times : int -- number of differently-seeded splits to average over

    Returns
    -------
    numpy float -- mean test-set accuracy over `times` runs
    """
    acc_list = []
    for i in range(times):
        # Bug fix: pass the caller's train_size instead of hard-coded 0.7
        X_train, X_test, y_train, y_test = split_train_test(
            data, target, train_size, random_seed=i)
        knn.fit(X_train, y_train)
        acc = knn.score(X_test, y_test)
        acc_list.append(acc)
    return np.array(acc_list).mean()
# Search for a suitable K value: odd values 1, 3, ..., 19
k_list = np.arange(1,21,step=2)
# Mean accuracy of each candidate model (one entry per k)
mean_acc_list = []
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    # acc_model already returns the mean accuracy -- the extra .mean()
    # on the scalar result was redundant and has been removed.
    m_acc = acc_model(knn, data, target, 0.8, 100)
    mean_acc_list.append(m_acc)
# Finding a suitable K value -- learning curve
# Learning curve: mean test-set accuracy for each candidate k
sns.set()
label_fontsize = 16
plt.plot(k_list, mean_acc_list, label="X_test_acc")
plt.xlabel("k-list", fontsize=label_fontsize)
plt.ylabel("acc", fontsize=label_fontsize)
plt.xticks(k_list)
plt.legend()
# 一个好的模型,应该是测试集评分高,并且训练集评分相近
# 所以对评分函数优化如下:添加训练集评分的保存
# 用于评价模型的函数
def acc_model(knn, X, y, train_size, times):
    """Score a model on `times` differently-seeded shuffled splits.

    Tracks the training-set score alongside the test-set score: a good
    model should score high on the test set AND close to its training
    score (large gaps suggest overfitting).

    Returns
    -------
    (train_accs, test_accs) : tuple of ndarray, each of length `times`
    """
    train_scores, test_scores = [], []
    for seed in range(times):
        X_train, X_test, y_train, y_test = split_train_test(
            X, y, train_size, random_seed=seed)
        knn.fit(X_train, y_train)
        train_scores.append(knn.score(X_train, y_train))
        test_scores.append(knn.score(X_test, y_test))
    return np.array(train_scores), np.array(test_scores)
# 同时保存测试集和训练集的平均评分
# For each candidate k, record the mean accuracy on both the training
# and the test set (averaged over 10 splits).
mean_acc_train_list, mean_acc_test_list = [], []
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    train_acc, test_acc = acc_model(knn, data, target, 0.8, 10)
    mean_acc_train_list.append(train_acc.mean())
    mean_acc_test_list.append(test_acc.mean())
# Plot both curves: prefer a k where both scores are high AND close together
for scores, curve_label in ((mean_acc_train_list, "X_train_acc"),
                            (mean_acc_test_list, "X_test_acc")):
    plt.plot(k_list, scores, label=curve_label)
plt.xlabel("k-list", fontsize=16)
plt.ylabel("acc", fontsize=16)
plt.xticks(k_list)
plt.legend()
plt.show()
# (Article footer) Next: php断点续传之如何分割合并文件