欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

KNN分类算法

程序员文章站 2024-03-19 16:16:04
...

1:使用机器学习实战中的knn算法对 普通样本进行了分类

2:分别使用该算法与 scikit-learn 中的 knn 进行了手写数字识别的对比,发现 scikit-learn 版本分类很慢(原因见下文:每个测试样本都重新训练了一次模型)

 

实现的功能:

    1、对样本特征为[x1,x2,.....,xn-1,y],其中共有n-1个特征,1个y标签 进行knn样本分类

 部分数据截图:

KNN分类算法

knn.py

import numpy as np
import operator

def classify0(inX, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbors.

    Args:
        inX: the test sample, a flat [x1, x2, ...] feature vector.
        dataset: 2-D ndarray of training samples, one row per sample.
        labels: training labels aligned with the rows of dataset.
        k: number of nearest neighbors that get a vote.

    Returns:
        The label that received the most votes (first-seen wins ties).
    """
    # Euclidean distance from inX to every training row; broadcasting
    # produces the same values as the original np.tile-based subtraction.
    deltas = dataset - np.tile(inX, (dataset.shape[0], 1))
    distances = (deltas ** 2).sum(axis=1) ** 0.5
    # Tally votes for the k closest rows.
    votes = {}
    for neighbor in distances.argsort()[:k]:
        label = labels[neighbor]
        votes[label] = votes.get(label, 0) + 1
    # max() and the original descending sort pick the same winner,
    # including on ties (both keep first insertion order).
    return max(votes.items(), key=operator.itemgetter(1))[0]

def file2matrix(filename):
    """Parse a tab-separated sample file into a feature matrix and labels.

    Each line holds n-1 numeric features followed by one integer label:
    x1<TAB>x2<TAB>...<TAB>xn-1<TAB>y

    Args:
        filename: path to the tab-delimited training data file.

    Returns:
        (returnMat, classLabelVector): an (m, n-1) float ndarray of
        features and a list of m integer labels, row-aligned.
    """
    # Context manager guarantees the handle is closed; the original
    # opened the file and never closed it. The stray debug print of the
    # feature count is also dropped.
    with open(filename) as fr:
        arrayOlines = fr.readlines()
    # Infer the feature count from the first line (last column is the label).
    numberOffeatures = len(arrayOlines[0].split('\t')) - 1
    returnMat = np.zeros((len(arrayOlines), numberOffeatures))
    classLabelVector = []
    for index, line in enumerate(arrayOlines):
        # Strip the trailing newline, then split columns on tabs.
        listFromLine = line.strip().split('\t')
        # numpy coerces the numeric strings to float on assignment.
        returnMat[index, :] = listFromLine[0:numberOffeatures]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
    
def autoNorm(dataSet):
    """Min-max scale every column of dataSet into [0, 1].

    Args:
        dataSet: 2-D ndarray of samples, one row per sample.

    Returns:
        (normDataSet, ranges, minVals): the scaled copy, the per-column
        span max-min, and the per-column minimum.
    """
    # Column-wise extremes (axis 0 walks down the rows).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting subtracts/divides row-by-row — numerically identical
    # to the original np.tile copies, just without materializing them.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
   
def datingClassTest():
    """Holdout test of classify0 on the dating data set.

    Uses the first 10% of 'datingTestSet2.txt' as test samples and the
    remaining 90% as the training set, then prints the error rate.
    """
    holdoutRatio = 0.10
    features, labels = file2matrix('datingTestSet2.txt')
    normed, _, _ = autoNorm(features)
    total = normed.shape[0]
    numTest = int(total * holdoutRatio)
    # Count misclassified holdout samples (float to match original output).
    errorCount = sum(
        (1.0 for i in range(numTest)
         if classify0(normed[i, :], normed[numTest:total, :],
                      labels[numTest:total], 3) != labels[i]),
        0.0,
    )
    print("the total error rate is: %f" % (errorCount / float(numTest)))
    print('errorCount:', errorCount)

if __name__ == "__main__":
    # Run the dating-data holdout benchmark when executed as a script.
    datingClassTest()
    

输出:

the total error rate is: 0.050000
errorCount: 5.0

2:使用 scikit-learn 中的 knn 对0-9数字进行识别,速度非常慢——主要原因是 classf 对每个测试样本都重新 fit 了一次模型(还额外计算了一次训练集得分);只要把分类算法换成 classify0(不用 scikit-learn,或者把 fit 移到循环外)就快很多

import numpy as np
import operator
from os import listdir


from sklearn import neighbors  
import sklearn  

def classify0(inX, dataset, labels, k):
    """k-nearest-neighbors vote: return the majority label of the k
    training rows closest (Euclidean) to inX.

    Args:
        inX: test sample, a flat [x1, x2, ...] vector.
        dataset: 2-D ndarray of training rows.
        labels: labels aligned with dataset rows.
        k: how many neighbors vote.

    Returns:
        The winning label (first-seen label wins a tie).
    """
    rows = dataset.shape[0]
    # Same Euclidean distances as the original tile/square/sum/sqrt chain.
    gaps = np.tile(inX, (rows, 1)) - dataset
    dists = np.sqrt(np.sum(gaps * gaps, axis=1))
    nearest = dists.argsort()[:k]
    # classCount.get(label, 0) defaults a first-time label to zero votes.
    tally = {}
    for idx in nearest:
        tally[labels[idx]] = tally.get(labels[idx], 0) + 1
    winner, _ = max(tally.items(), key=operator.itemgetter(1))
    return winner

def file2matrix(filename):
    """Read a tab-separated sample file into (feature matrix, label list).

    Each line holds n-1 numeric features followed by one integer label:
    x1<TAB>x2<TAB>...<TAB>xn-1<TAB>y

    Args:
        filename: path to the tab-delimited training data file.

    Returns:
        (returnMat, classLabelVector): an (m, n-1) float ndarray of
        features and a list of m integer labels, row-aligned.
    """
    # `with` closes the handle on every exit path; the original leaked it.
    with open(filename) as fr:
        arrayOlines = fr.readlines()
    # Last column of each line is the label, the rest are features.
    numberOffeatures = len(arrayOlines[0].split('\t')) - 1
    returnMat = np.zeros((len(arrayOlines), numberOffeatures))
    classLabelVector = []
    for index, line in enumerate(arrayOlines):
        listFromLine = line.strip().split('\t')
        # numpy converts the numeric strings to float on assignment.
        returnMat[index, :] = listFromLine[0:numberOffeatures]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
    
def autoNorm(dataSet):
    """Scale each column of dataSet linearly onto [0, 1].

    Args:
        dataSet: 2-D ndarray, one sample per row.

    Returns:
        (normDataSet, ranges, minVals): scaled data, per-column spans,
        and per-column minimums.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column shift and scale to every row;
    # results match the original np.tile formulation exactly.
    shifted = dataSet - minVals
    normDataSet = shifted / ranges
    return normDataSet, ranges, minVals
   
def datingClassTest():
    """Evaluate classify0 on the dating data with a 10% holdout.

    The first 10% of rows in 'datingTestSet2.txt' are classified against
    the remaining 90%; prints the resulting error rate and error count.
    """
    holdout = 0.10
    feats, labels = file2matrix('datingTestSet2.txt')
    normed, _, _ = autoNorm(feats)
    rowCount = normed.shape[0]
    testCount = int(rowCount * holdout)
    errorCount = 0.0
    for i in range(testCount):
        predicted = classify0(normed[i, :],
                              normed[testCount:rowCount, :],
                              labels[testCount:rowCount], 3)
        if predicted != labels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(testCount)))
    print('errorCount:', errorCount)

def img2vector(filename):
    """Flatten a 32x32 text 'image' of 0/1 characters into a 1x1024 vector.

    The file is expected to contain 32 lines of 32 digit characters each
    (one pixel per character), as in the MLiA digits data set.

    Args:
        filename: path to one digit-image text file.

    Returns:
        A (1, 1024) float ndarray; element [0, 32*i+j] is pixel (i, j).
    """
    returnVect = np.zeros((1, 1024))
    # `with` closes the file handle; the original left it open.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                # Row-major flattening: row i occupies columns 32*i..32*i+31.
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

def handwritingClassTest():
    """Train on 'trainingDigits' and score every file in 'testDigits1'.

    File names encode the true digit as '<digit>_<index>.txt'. Each test
    image is classified with classf (scikit-learn kNN) and the per-sample
    result plus the total error rate are printed.
    """
    # Build the training matrix: one flattened 32x32 image per row, with
    # the label parsed out of the file name (e.g. '0_3.txt' -> 0).
    trainFiles = listdir('trainingDigits')
    trainMat = np.zeros((len(trainFiles), 1024))
    hwLabels = []
    for row, fname in enumerate(trainFiles):
        hwLabels.append(int(fname.split('.')[0].split('_')[0]))
        trainMat[row, :] = img2vector('trainingDigits/%s' % fname)
    # Classify every test image and tally mistakes.
    testFiles = listdir('testDigits1')
    errorCount = 0.0
    mTest = len(testFiles)
    for fname in testFiles:
        trueDigit = int(fname.split('.')[0].split('_')[0])
        sample = img2vector('testDigits1/%s' % fname)
        guess = classf(sample, trainMat, hwLabels)
        print("the classifier came back with: %d, the real answer is: %d, The predict result is: %s" % (guess, trueDigit, guess == trueDigit))
        if guess != trueDigit:
            errorCount += 1.0
    print("\nthe total number of errors is: %d / %d" % (errorCount, mTest))
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

def classf(vectorUnderTest, trainingMat, hwLabels):
    """Predict a label for one sample with scikit-learn's kNN classifier.

    Args:
        vectorUnderTest: a (1, n) sample to classify.
        trainingMat: (m, n) training feature matrix.
        hwLabels: list of m training labels.

    Returns:
        knn.predict's result: a length-1 array holding the predicted label.

    NOTE(review): this re-fits a fresh classifier for EVERY test sample,
    which is the main reason the scikit-learn path is slow; fitting once
    outside the test loop would fix it, but that changes the call pattern,
    so only the dead work is removed here. The original also evaluated
    knn.score over the whole training set on each call and discarded the
    result — that wasted pass is dropped.
    """
    knn = neighbors.KNeighborsClassifier()
    knn.fit(trainingMat, hwLabels)
    return knn.predict(vectorUnderTest)

if __name__ == "__main__":
    # Run the handwritten-digit benchmark when executed as a script.
    # (datingClassTest() can be called here instead to test the dating data.)
    handwritingClassTest()

只训练了三个样本的输出结果:

the classifier came back with: 0, the real answer is: 0, The predict result is: [ True]
the classifier came back with: 1, the real answer is: 1, The predict result is: [ True]
the classifier came back with: 2, the real answer is: 2, The predict result is: [ True]

the total number of errors is: 0 / 3

the total error rate is: 0.000000