
K-Nearest Neighbors -- An Iris Case Study

import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
%matplotlib inline

# Load the iris dataset
iris = load_iris()

# Feature matrix
data = iris.data
# Target labels
target = iris.target
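
A quick sanity check before splitting: load_iris returns 150 samples with 4 features each, and the targets come back sorted by class (50 of each, back to back).

print(data.shape)   # (150, 4)
print(target)       # [0 0 ... 0 1 1 ... 1 2 2 ... 2]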

# Construct a k-nearest-neighbors classifier (n_neighbors defaults to 5)
knn = KNeighborsClassifier()

X_train = data[:120]
X_test = data[120:]

y_train = target[:120]
y_test = target[120:]

knn.fit(X_train, y_train)

y_ = knn.predict(X_test)

# Accuracy = correctly predicted samples / total predicted samples
acc = (y_test == y_).sum() / y_.size
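
Inspecting the class balance of this sequential split exposes the problem: every hold-out sample is class 2, while training saw only 20 class-2 examples.

print(np.bincount(y_train))  # [50 50 20]
print(np.bincount(y_test))   # [ 0  0 30]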

# The sequential split is unbalanced because the samples are stored sorted by class, so shuffle the whole dataset before splitting
iris_index = np.random.permutation(150)
target = target[iris_index]
data = data[iris_index]

X_train = data[:120]
X_test = data[120:]

y_train = target[:120]
y_test = target[120:]

# Refit the model; calling fit again discards the previous training
knn.fit(X_train, y_train)
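
Scoring on the shuffled hold-out set now gives a much more trustworthy estimate, since both splits contain a mix of all three classes.

knn.score(X_test, y_test)  # hold-out accuracy on the shuffled split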

# A single score is not a reliable measure of model quality; train on several random splits and average the scores
def split_train_test(X, y, train_size, random_seed):
    '''
    X: sample features
    y: sample targets
    train_size: fraction of the samples used for training
    random_seed: seed for the shuffle, so that different train/test
        ratios can be compared on identical permutations
    '''
    np.random.seed(random_seed)
    iris_index = np.random.permutation(y.size)
    
    target = y[iris_index]
    data = X[iris_index]
    
    n_split = int(y.size*train_size)
    
    X_train = data[:n_split]
    X_test = data[n_split:]
    
    y_train = target[:n_split]
    y_test = target[n_split:]
    return X_train, X_test, y_train, y_test
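
As an aside, scikit-learn ships an equivalent utility; a minimal sketch using train_test_split, whose random_state plays the role of random_seed above:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    data, target, train_size=0.7, random_state=0)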

# 70/30 split, averaged over 10 random shuffles
acc_list = []
for i in range(10):
    X_train, X_test, y_train, y_test = split_train_test(data, target, 0.7, random_seed=i)
    knn.fit(X_train, y_train)
    acc = knn.score(X_test, y_test)
    acc_list.append(acc)
np.array(acc_list).mean()
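
Averaging over repeated random splits is essentially hand-rolled cross-validation; scikit-learn's cross_val_score does the same job in one call (a sketch using 10-fold CV instead of repeated 70/30 splits):

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(knn, data, target, cv=10)
print(cv_scores.mean())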

# # 80/20 split (same loop with train_size=0.8)
# acc_list = []
# for i in range(10):
#     X_train, X_test, y_train, y_test = split_train_test(data,target, 0.8,random_seed=i)
#     knn.fit(X_train, y_train)
#     acc = knn.score(X_test, y_test)
#     acc_list.append(acc)
# np.array(acc_list).mean()

# Wrap the evaluation loop into a reusable helper
def acc_model(knn, data, target, train_size, times):
    acc_list = []
    for i in range(times):
        X_train, X_test, y_train, y_test = split_train_test(data, target, train_size, random_seed=i)
        knn.fit(X_train, y_train)
        acc = knn.score(X_test, y_test)
        acc_list.append(acc)
    return np.array(acc_list).mean()

# Search for a suitable k (odd values from 1 to 19)
k_list = np.arange(1, 21, step=2)

# Mean accuracy of each candidate k, averaged over 100 random 80/20 splits
mean_acc_list = []
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    m_acc = acc_model(knn, data, target, 0.8, 100)
    mean_acc_list.append(m_acc)
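
To read the winner off programmatically rather than only by eye (near-ties are easier to judge on the curve plotted below):

best_k = k_list[np.argmax(mean_acc_list)]
print(best_k, max(mean_acc_list))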
# Finding a suitable k: the learning curve
sns.set()
plt.plot(k_list, mean_acc_list, label="X_test_acc")
plt.xlabel("k-list", fontsize=16)
plt.ylabel("acc", fontsize=16)
plt.xticks(k_list)
plt.legend()

[Figure: test-set mean accuracy vs. k (learning curve)]


# A good model should score high on the test set AND have a training
# score close to it, so extend the evaluation helper to also record
# the training-set accuracy
def acc_model(knn, X, y, train_size, times):
    train_acc_list = []
    test_acc_list = []
    for i in range(times):
        X_train, X_test, y_train, y_test = split_train_test(X, y, train_size, random_seed=i)
        knn.fit(X_train, y_train)
        train_acc = knn.score(X_train, y_train)
        test_acc = knn.score(X_test, y_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
    return np.array(train_acc_list), np.array(test_acc_list) 

# Record the mean train and test scores for each candidate k
mean_acc_train_list = []
mean_acc_test_list = []
for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    train_acc, test_acc = acc_model(knn, data, target, 0.8, 10)
    mean_acc_train_list.append(train_acc.mean())
    mean_acc_test_list.append(test_acc.mean())
    
# Prefer a k where both scores are high and close to each other
plt.plot(k_list, mean_acc_train_list, label="X_train_acc")
plt.plot(k_list, mean_acc_test_list, label="X_test_acc")
plt.xlabel("k-list", fontsize=16)
plt.ylabel("acc", fontsize=16)
plt.xticks(k_list)
plt.legend()
plt.show()

[Figure: train-set vs. test-set mean accuracy over k]
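
One simple heuristic for "both high and close" (my own scoring rule, not from the original): penalize each k's test score by the train/test gap and take the best.

train_arr = np.array(mean_acc_train_list)
test_arr = np.array(mean_acc_test_list)
balance = test_arr - np.abs(train_arr - test_arr)  # reward accuracy, punish the gap
print(k_list[np.argmax(balance)])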