Machine Learning in Action, Chapter 2: k-Nearest Neighbors (kNN) – Detailed Code Walkthrough (Python 3.7)
For the official code and data package (Python 2.6), see: https://www.manning.com/books/machine-learning-in-action
For my complete code and data package (Python 3.7), see: https://download.csdn.net/download/m0_37738114/12894377
While studying I drew on many earlier write-ups that I forgot to keep track of; my thanks to all of those authors!
1. First, the chapter introduces a small kNN example containing the necessary workflow functions:
# Import the scientific computing package numpy
# Import the operator module; kNN needs it when sorting the votes
from numpy import *
import operator

# Create the data set and its labels
def createDataSet():
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    # Points (1.0, 1.1) and (1.0, 1.0) belong to class A; (0, 0) and (0, 0.1) to class B
    labels = ['A', 'A', 'B', 'B']
    return group, labels

'''
Compute the distance from the current point to every known point
Sort by distance
Count how often each class appears among the k nearest points
Return the most frequent class as the class of the current point
'''
# inX: the input vector to classify; dataSet: the training sample set
def classify0(inX, dataSet, labels, k):
    # shape[0] returns the number of rows in the training set
    dataSetSize = dataSet.shape[0]
    # --------------- distance calculation: start ----------------
    # tile(a, (2, 3)) repeats array a into a 2-row, 3-column grid, building a new array
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square every element of the array
    sqDiffMat = diffMat ** 2
    # Sum across each row
    sqDistances = sqDiffMat.sum(axis=1)
    # Take the square root of every element
    distances = sqDistances ** 0.5
    # --------------- distance calculation: end ----------------
    # argsort sorts the distances in ascending order and returns the sorted indices
    sortedDistIndicies = distances.argsort()
    # Empty dictionary for the vote counts
    classCount = {}
    for i in range(k):
        # Label of the i-th nearest neighbour
        voteIlabel = labels[sortedDistIndicies[i]]
        # dict.get(voteIlabel, 0) returns 0 if the key is not present yet
        # Accumulate the votes of the k nearest neighbours
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # classCount.items() yields (key, value) pairs; a dictionary iterates over its keys only,
    # so to sort by value we need this iterable
    # key=operator.itemgetter(1) sorts by the second element, i.e. the vote count
    # reverse=True sorts in descending order
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # Return the most frequent class among the k nearest neighbours as the prediction
    return sortedClassCount[0][0]

group = createDataSet()[0]
labels = createDataSet()[1]
print(classify0([0, 0], group, labels, 3))
Running result: the script prints B.
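To see why the demo prints B, here is a short worked check of the distance step (a minimal sketch that assumes the demo above has already been run, so createDataSet, classify0 and tile are in scope):

group, labels = createDataSet()
# Euclidean distances from [0, 0] to the four training points:
#   to (1.0, 1.1): sqrt(1.0^2 + 1.1^2) ≈ 1.487  -> label A
#   to (1.0, 1.0): sqrt(2.0)           ≈ 1.414  -> label A
#   to (0, 0):     0.0                           -> label B
#   to (0, 0.1):   0.1                           -> label B
diff = tile([0, 0], (group.shape[0], 1)) - group
print((diff ** 2).sum(axis=1) ** 0.5)        # [1.48660687 1.41421356 0.         0.1       ]
# The 3 nearest neighbours are B, B, A, so the majority vote is B
print(classify0([0, 0], group, labels, 3))   # B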
2. Next, improving the matching results of a dating site
'''
Improving the matching results of a dating site
'''
from numpy import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import operator

# Normalize the feature values
def autoNorm(dataSet):
    # min(0) returns the minimum of each column; min(1) would return the minimum of each row
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]  # number of rows in the data set
    # tile(minVals, (m, 1)) builds an m-row matrix whose every row holds the column minima
    # Normalization: (current value - min) / (max - min)
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
# Prepare the data: datingTestSet2.txt (1000 lines), three features per line:
# frequent-flier miles earned per year, percentage of time spent playing video games,
# liters of ice cream consumed per week
def file2matrix(filename):
    fr = open(filename)
    arrayOflines = fr.readlines()
    # Number of lines in the file
    numberOfLines = len(arrayOflines)
    # Create the matrix to return
    returnMat = zeros((numberOfLines, 3))
    classLabelVector = []
    index = 0
    # Parse the file into the feature matrix and label list
    for line in arrayOflines:
        # Strip the trailing newline characters
        line = line.strip()
        listFromLine = line.split('\t')
        # Fill the feature matrix
        returnMat[index, :] = listFromLine[0:3]
        # Build the label list
        classLabelVector.append(int(listFromLine[-1]))
        index = index + 1
    return returnMat, classLabelVector
# inX: the input vector to classify; dataSet: the training sample set
def classify0(inX, dataSet, labels, k):
    # shape[0] returns the number of rows in the training set
    dataSetSize = dataSet.shape[0]
    # --------------- distance calculation: start ----------------
    # tile(a, (2, 3)) repeats array a into a 2-row, 3-column grid, building a new array
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square every element of the array
    sqDiffMat = diffMat ** 2
    # Sum across each row
    sqDistances = sqDiffMat.sum(axis=1)
    # Take the square root of every element
    distances = sqDistances ** 0.5
    # --------------- distance calculation: end ----------------
    # argsort sorts the distances in ascending order and returns the sorted indices
    sortedDistIndicies = distances.argsort()
    # Empty dictionary for the vote counts
    classCount = {}
    for i in range(k):
        # Label of the i-th nearest neighbour
        voteIlabel = labels[sortedDistIndicies[i]]
        # dict.get(voteIlabel, 0) returns 0 if the key is not present yet
        # Accumulate the votes of the k nearest neighbours
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # classCount.items() yields (key, value) pairs; a dictionary iterates over its keys only,
    # so to sort by value we need this iterable
    # key=operator.itemgetter(1) sorts by the second element, i.e. the vote count
    # reverse=True sorts in descending order
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # Return the most frequent class among the k nearest neighbours as the prediction
    return sortedClassCount[0][0]
# Test the data: measure the classifier's accuracy
def datingClassTest():
    # Fraction of the sample set held out as the test set
    hoRatio = 0.10
    # Prepare the data
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    # Normalize the data
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    # Size of the test set
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for j in range(numTestVecs):
        classifierResult = classify0(normMat[j, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        if (classifierResult != datingLabels[j]):
            errorCount = errorCount + 1.0
    print("accuracy:", 1 - errorCount / float(numTestVecs))

# Test the classifier's accuracy
# datingClassTest()
def classifyPerson():
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games? "))
    ffMiles = float(input("frequent flier miles earned per year? "))
    iceCream = float(input("liters of ice cream consumed per year? "))
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Normalize the input vector with the stored ranges before classifying it
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print(resultList[classifierResult - 1])

# Use the classifier
classifyPerson()
# datingDataMat = file2matrix("datingTestSet2.txt")[0]
# datingLabels = file2matrix("datingTestSet2.txt")[1]
# print(datingDataMat)
# print()
# print(datingLabels)
# Plot 1: draw all three scatter plots
# fig = plt.figure()
# Create a subplot: 221 means 2 rows, 2 columns, and the trailing 1 selects subplot 1
# ax1 = fig.add_subplot(221)
# ax1.scatter(datingDataMat[:, 0], datingDataMat[:, 1], 15.0 * array(datingLabels), 15.0 * array(datingLabels))
# ax2 = fig.add_subplot(222)
# ax2.scatter(datingDataMat[:, 1], datingDataMat[:, 2], 15.0 * array(datingLabels), 15.0 * array(datingLabels))
# ax3 = fig.add_subplot(223)
# ax3.scatter(datingDataMat[:, 0], datingDataMat[:, 2], 15.0 * array(datingLabels), 15.0 * array(datingLabels))
# plt.savefig("fly_and_play_and_ice.jpg")
# plt.show()
# datingDataMat = file2matrix("datingTestSet2.txt")[0]
# datingLabels = file2matrix("datingTestSet2.txt")[1]
# Plot 2: draw the single plot with the clearest class separation
# fig = plt.figure()
# ax = fig.add_subplot(111)
# datingLabels = array(datingLabels)
# idx_1 = where(datingLabels == 1)
# p1 = ax.scatter(datingDataMat[idx_1, 0], datingDataMat[idx_1, 1], s=20, marker='o', c='r', label='Do Not Like')
# idx_2 = where(datingLabels == 2)
# p2 = ax.scatter(datingDataMat[idx_2, 0], datingDataMat[idx_2, 1], s=10, marker='o', c='b', label='Liked in Small Doses')
# idx_3 = where(datingLabels == 3)
# p3 = ax.scatter(datingDataMat[idx_3, 0], datingDataMat[idx_3, 1], s=30, marker='o', c='g', label='Liked in Large Doses')
# plt.legend(loc='upper left', fontsize=10)
# plt.xlabel('Frequent-flier miles per year')
# plt.ylabel('The percentage of time spent on playing video games(%)')
# plt.savefig("fly_and_play.jpg")
# plt.show()
First, enter the three values manually when prompted:
Running result:
If you uncomment the plotting code, the result is:
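As a quick sanity check of autoNorm, the min-max formula newValue = (oldValue - min) / (max - min) maps every column into [0, 1]. Below is a minimal sketch on a made-up 3x2 matrix, assuming the script above (and therefore autoNorm and numpy's star import) has been run; the two tile calls could equally be replaced by plain NumPy broadcasting, (dataSet - minVals) / ranges:

# A tiny example: each column is normalized independently
demo = array([[10.0, 200.0],
              [20.0, 400.0],
              [30.0, 600.0]])
normDemo, ranges, minVals = autoNorm(demo)
print(minVals)   # [ 10. 200.]  column minima
print(ranges)    # [ 20. 400.]  column ranges (max - min)
print(normDemo)  # [[0.  0. ]
                 #  [0.5 0.5]
                 #  [1.  1. ]]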
3. Finally, building a handwriting recognition system
'''
Building a handwriting recognition system
'''
from numpy import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import operator
from os import listdir
from kNN_demo import classify0

'''
This algorithm performs roughly 2,000 distance calculations for every test vector,
which takes a long time, and it needs about 2 MB of storage. To reduce the time and
space cost, a kd-tree can be used later as an optimization.
'''
# Convert a digit image file into a 1x1024 vector
def img2vector(filename):
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    # The single row of returnVect encodes one digit image
    return returnVect

# img2vector("handwriting_image_data/trainingDigits/0_0.txt")
def handwritingClassTest():
    hwLabels = []
    trainingFileList = listdir('handwriting_image_data/trainingDigits')
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]
        # The digit is encoded in the file name, e.g. '9_45.txt' -> '9' (kept as a string)
        classNumStr = fileStr.split('_')[0]
        # Build the label list
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('handwriting_image_data/trainingDigits/%s' % fileNameStr)
    testFileList = listdir('handwriting_image_data/testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]
        classNumStr = fileStr.split('_')[0]
        vectorUnderTest = img2vector('handwriting_image_data/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        if (classifierResult != classNumStr):
            errorCount = errorCount + 1.0
    print("accuracy:", 1 - (errorCount / float(mTest)))

# Run the test
handwritingClassTest()
Running result:
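The docstring above mentions that a kd-tree could cut the time and space cost. As one possible direction (my own sketch, not the book's code), here is an alternative using scikit-learn's KNeighborsClassifier with algorithm='kd_tree'; it assumes scikit-learn is installed, reuses img2vector and the directory layout from the script above, and note that with 1024-dimensional binary vectors a kd-tree rarely beats brute-force search:

from os import listdir
from numpy import zeros
from sklearn.neighbors import KNeighborsClassifier  # assumes scikit-learn is installed

def loadDigits(dirName):
    # Load every 32x32 text digit under dirName into an (n, 1024) matrix plus integer labels
    fileList = listdir(dirName)
    mat = zeros((len(fileList), 1024))
    labels = []
    for i, name in enumerate(fileList):
        labels.append(int(name.split('_')[0]))            # '9_45.txt' -> 9
        mat[i, :] = img2vector('%s/%s' % (dirName, name))  # reuse img2vector from above
    return mat, labels

trainMat, trainLabels = loadDigits('handwriting_image_data/trainingDigits')
testMat, testLabels = loadDigits('handwriting_image_data/testDigits')

clf = KNeighborsClassifier(n_neighbors=3, algorithm='kd_tree')
clf.fit(trainMat, trainLabels)
print("accuracy:", clf.score(testMat, testLabels))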