欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

KNN分类算法

程序员文章站 2024-03-19 16:16:04
...

1:使用机器学习实战中的knn算法对 普通样本进行了分类

2:分别使用该算法与 scikit-learn 中的 knn 进行了手写数字识别的对比,发现 scikit-learn 版本分类很慢(原因见下文:每个测试样本都重新训练了一次模型)

 

实现的功能:

    1、对样本特征为[x1,x2,.....,xn-1,y],其中共有n-1个特征,1个y标签 进行knn样本分类

 部分数据截图:

KNN分类算法

knn.py

import numpy as np
import operator

def classify0(inX, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbors.

    Args:
        inX: the test sample, a flat [x1, x2, ...] feature vector.
        dataset: 2-D ndarray of training samples, one row per sample.
        labels: training labels aligned with the rows of dataset.
        k: number of nearest neighbors that get a vote.

    Returns:
        The label that received the most votes (first-seen wins ties).
    """
    # Euclidean distance from inX to every training row; broadcasting
    # produces the same values as the original np.tile-based subtraction.
    deltas = dataset - np.tile(inX, (dataset.shape[0], 1))
    distances = (deltas ** 2).sum(axis=1) ** 0.5
    # Tally votes for the k closest rows.
    votes = {}
    for neighbor in distances.argsort()[:k]:
        label = labels[neighbor]
        votes[label] = votes.get(label, 0) + 1
    # max() and the original descending sort pick the same winner,
    # including on ties (both keep first insertion order).
    return max(votes.items(), key=operator.itemgetter(1))[0]

def file2matrix(filename):
    """Parse a tab-separated sample file into a feature matrix and labels.

    Each line holds n-1 numeric features followed by one integer label:
    x1<TAB>x2<TAB>...<TAB>xn-1<TAB>y

    Args:
        filename: path to the tab-delimited training data file.

    Returns:
        (returnMat, classLabelVector): an (m, n-1) float ndarray of
        features and a list of m integer labels, row-aligned.
    """
    # Context manager guarantees the handle is closed; the original
    # opened the file and never closed it. The stray debug print of the
    # feature count is also dropped.
    with open(filename) as fr:
        arrayOlines = fr.readlines()
    # Infer the feature count from the first line (last column is the label).
    numberOffeatures = len(arrayOlines[0].split('\t')) - 1
    returnMat = np.zeros((len(arrayOlines), numberOffeatures))
    classLabelVector = []
    for index, line in enumerate(arrayOlines):
        # Strip the trailing newline, then split columns on tabs.
        listFromLine = line.strip().split('\t')
        # numpy coerces the numeric strings to float on assignment.
        returnMat[index, :] = listFromLine[0:numberOffeatures]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
    
def autoNorm(dataSet):
    """Min-max scale every column of dataSet into [0, 1].

    Args:
        dataSet: 2-D ndarray of samples, one row per sample.

    Returns:
        (normDataSet, ranges, minVals): the scaled copy, the per-column
        span max-min, and the per-column minimum.
    """
    # Column-wise extremes (axis 0 walks down the rows).
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting subtracts/divides row-by-row — numerically identical
    # to the original np.tile copies, just without materializing them.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
   
def datingClassTest():
    """Holdout test of classify0 on the dating data set.

    Uses the first 10% of 'datingTestSet2.txt' as test samples and the
    remaining 90% as the training set, then prints the error rate.
    """
    holdoutRatio = 0.10
    features, labels = file2matrix('datingTestSet2.txt')
    normed, _, _ = autoNorm(features)
    total = normed.shape[0]
    numTest = int(total * holdoutRatio)
    # Count misclassified holdout samples (float to match original output).
    errorCount = sum(
        (1.0 for i in range(numTest)
         if classify0(normed[i, :], normed[numTest:total, :],
                      labels[numTest:total], 3) != labels[i]),
        0.0,
    )
    print("the total error rate is: %f" % (errorCount / float(numTest)))
    print('errorCount:', errorCount)

if __name__ == "__main__":
    # Run the dating-data holdout benchmark when executed as a script.
    datingClassTest()
    

输出:

the total error rate is: 0.050000
errorCount: 5.0

2:使用 scikit-learn 中的 knn 对0-9数字进行识别,速度非常慢——主要原因是 classf 对每个测试样本都重新 fit 了一次模型(还额外计算了一次训练集得分);只要把分类算法换成 classify0(不用 scikit-learn,或者把 fit 移到循环外)就快很多

import numpy as np
import operator
from os import listdir


from sklearn import neighbors  
import sklearn  

def classify0(inX, dataset, labels, k):
    """k-nearest-neighbors vote: return the majority label of the k
    training rows closest (Euclidean) to inX.

    Args:
        inX: test sample, a flat [x1, x2, ...] vector.
        dataset: 2-D ndarray of training rows.
        labels: labels aligned with dataset rows.
        k: how many neighbors vote.

    Returns:
        The winning label (first-seen label wins a tie).
    """
    rows = dataset.shape[0]
    # Same Euclidean distances as the original tile/square/sum/sqrt chain.
    gaps = np.tile(inX, (rows, 1)) - dataset
    dists = np.sqrt(np.sum(gaps * gaps, axis=1))
    nearest = dists.argsort()[:k]
    # classCount.get(label, 0) defaults a first-time label to zero votes.
    tally = {}
    for idx in nearest:
        tally[labels[idx]] = tally.get(labels[idx], 0) + 1
    winner, _ = max(tally.items(), key=operator.itemgetter(1))
    return winner

def file2matrix(filename):
    """Read a tab-separated sample file into (feature matrix, label list).

    Each line holds n-1 numeric features followed by one integer label:
    x1<TAB>x2<TAB>...<TAB>xn-1<TAB>y

    Args:
        filename: path to the tab-delimited training data file.

    Returns:
        (returnMat, classLabelVector): an (m, n-1) float ndarray of
        features and a list of m integer labels, row-aligned.
    """
    # `with` closes the handle on every exit path; the original leaked it.
    with open(filename) as fr:
        arrayOlines = fr.readlines()
    # Last column of each line is the label, the rest are features.
    numberOffeatures = len(arrayOlines[0].split('\t')) - 1
    returnMat = np.zeros((len(arrayOlines), numberOffeatures))
    classLabelVector = []
    for index, line in enumerate(arrayOlines):
        listFromLine = line.strip().split('\t')
        # numpy converts the numeric strings to float on assignment.
        returnMat[index, :] = listFromLine[0:numberOffeatures]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
    
def autoNorm(dataSet):
    """Scale each column of dataSet linearly onto [0, 1].

    Args:
        dataSet: 2-D ndarray, one sample per row.

    Returns:
        (normDataSet, ranges, minVals): scaled data, per-column spans,
        and per-column minimums.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column shift and scale to every row;
    # results match the original np.tile formulation exactly.
    shifted = dataSet - minVals
    normDataSet = shifted / ranges
    return normDataSet, ranges, minVals
   
def datingClassTest():
    """Evaluate classify0 on the dating data with a 10% holdout.

    The first 10% of rows in 'datingTestSet2.txt' are classified against
    the remaining 90%; prints the resulting error rate and error count.
    """
    holdout = 0.10
    feats, labels = file2matrix('datingTestSet2.txt')
    normed, _, _ = autoNorm(feats)
    rowCount = normed.shape[0]
    testCount = int(rowCount * holdout)
    errorCount = 0.0
    for i in range(testCount):
        predicted = classify0(normed[i, :],
                              normed[testCount:rowCount, :],
                              labels[testCount:rowCount], 3)
        if predicted != labels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(testCount)))
    print('errorCount:', errorCount)

def img2vector(filename):
    """Flatten a 32x32 text 'image' of 0/1 characters into a 1x1024 vector.

    The file is expected to contain 32 lines of 32 digit characters each
    (one pixel per character), as in the MLiA digits data set.

    Args:
        filename: path to one digit-image text file.

    Returns:
        A (1, 1024) float ndarray; element [0, 32*i+j] is pixel (i, j).
    """
    returnVect = np.zeros((1, 1024))
    # `with` closes the file handle; the original left it open.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                # Row-major flattening: row i occupies columns 32*i..32*i+31.
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

def handwritingClassTest():
    """Train on 'trainingDigits' and score every file in 'testDigits1'.

    File names encode the true digit as '<digit>_<index>.txt'. Each test
    image is classified with classf (scikit-learn kNN) and the per-sample
    result plus the total error rate are printed.
    """
    # Build the training matrix: one flattened 32x32 image per row, with
    # the label parsed out of the file name (e.g. '0_3.txt' -> 0).
    trainFiles = listdir('trainingDigits')
    trainMat = np.zeros((len(trainFiles), 1024))
    hwLabels = []
    for row, fname in enumerate(trainFiles):
        hwLabels.append(int(fname.split('.')[0].split('_')[0]))
        trainMat[row, :] = img2vector('trainingDigits/%s' % fname)
    # Classify every test image and tally mistakes.
    testFiles = listdir('testDigits1')
    errorCount = 0.0
    mTest = len(testFiles)
    for fname in testFiles:
        trueDigit = int(fname.split('.')[0].split('_')[0])
        sample = img2vector('testDigits1/%s' % fname)
        guess = classf(sample, trainMat, hwLabels)
        print("the classifier came back with: %d, the real answer is: %d, The predict result is: %s" % (guess, trueDigit, guess == trueDigit))
        if guess != trueDigit:
            errorCount += 1.0
    print("\nthe total number of errors is: %d / %d" % (errorCount, mTest))
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

def classf(vectorUnderTest, trainingMat, hwLabels):
    """Predict a label for one sample with scikit-learn's kNN classifier.

    Args:
        vectorUnderTest: a (1, n) sample to classify.
        trainingMat: (m, n) training feature matrix.
        hwLabels: list of m training labels.

    Returns:
        knn.predict's result: a length-1 array holding the predicted label.

    NOTE(review): this re-fits a fresh classifier for EVERY test sample,
    which is the main reason the scikit-learn path is slow; fitting once
    outside the test loop would fix it, but that changes the call pattern,
    so only the dead work is removed here. The original also evaluated
    knn.score over the whole training set on each call and discarded the
    result — that wasted pass is dropped.
    """
    knn = neighbors.KNeighborsClassifier()
    knn.fit(trainingMat, hwLabels)
    return knn.predict(vectorUnderTest)

if __name__ == "__main__":
    # Run the handwritten-digit benchmark when executed as a script.
    # (datingClassTest() can be called here instead to test the dating data.)
    handwritingClassTest()

只训练了三个样本的输出结果:

the classifier came back with: 0, the real answer is: 0, The predict result is: [ True]
the classifier came back with: 1, the real answer is: 1, The predict result is: [ True]
the classifier came back with: 2, the real answer is: 2, The predict result is: [ True]

the total number of errors is: 0 / 3

the total error rate is: 0.000000