KNN分类算法
程序员文章站
2024-03-19 16:16:04
...
1:使用《机器学习实战》中的kNN算法对普通样本进行了分类
2:分别使用该算法与scikit-learn中的kNN对手写数字进行了分类对比,发现本文中scikit-learn的用法明显更慢
实现的功能:
1、对形如[x1,x2,.....,xn-1,y]的样本(其中有n-1个特征、1个y标签)进行kNN分类
部分数据截图:
knn.py
import numpy as np
import operator
def classify0(inX, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed against every row of
    ``dataset``.

    Args:
        inX: Feature vector of the sample to classify, either 1-D
            ``[x1, x2, ...]`` or a single-row 2-D array.
        dataset: 2-D array of training samples, one sample per row.
        labels: Training labels; ``labels[i]`` belongs to ``dataset[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label that received
        its first vote from the closer neighbour wins.

    Raises:
        ValueError: If ``dataset`` has no rows or ``k`` is smaller than 1.
    """
    dataset = np.asarray(dataset)
    sampleCount = dataset.shape[0]
    if sampleCount == 0:
        raise ValueError("dataset must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Asking for more neighbours than exist used to raise IndexError.
    k = min(k, sampleCount)

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    distance = ((dataset - inX) ** 2).sum(axis=1) ** 0.5
    nearest = distance.argsort()[:k]

    classCount = {}
    for idx in nearest:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # max() keeps the first maximal entry, i.e. the label seen earliest.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and labels.

    Every non-blank line has the form ``x1<TAB>x2<TAB>...<TAB>y``: all
    columns but the last are numeric features and the last column is an
    integer class label. The number of features is taken from the first
    non-blank line.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array with one
        row per sample, and the list of integer labels in the same order.
        An empty file yields ``(np.zeros((0, 0)), [])``.
    """
    # The context manager closes the file; blank lines are skipped so a
    # trailing newline at the end of the file does not break parsing.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    if not rows:
        return np.zeros((0, 0)), []

    numberOffeatures = len(rows[0]) - 1
    returnMat = np.zeros((len(rows), numberOffeatures))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(v) for v in fields[0:numberOffeatures]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max scale every feature column of ``dataSet`` to ``[0, 1]``.

    Args:
        dataSet: 2-D NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        ``(dataSet - minVals) / ranges`` computed column-wise, ``ranges`` is
        the per-column ``max - min`` and ``minVals`` the per-column minimum.
        A column whose values are all equal (range 0) is mapped to 0.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0 so they become 0, not NaN.
    # The returned ranges are left untouched for callers.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Run a hold-out evaluation of ``classify0`` and print the error rate.

    The file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``hoRatio`` fraction of the rows is used as the
    test set and the remaining rows as the training set.

    Args:
        filename: Tab-separated data file to evaluate on.
        hoRatio: Fraction of the rows held out for testing.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training slice is the same for every test sample; build it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With too few rows the test set is empty; report 0 instead of dividing by 0.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print('errorCount:', errorCount)
# Script entry point: run the dating-data hold-out evaluation.
if __name__== "__main__":
    datingClassTest()
输出:
the total error rate is: 0.050000
errorCount: 5.0
2:使用scikit-learn中的kNN对0-9手写数字进行识别,速度很慢;把分类算法换成classify0、不使用scikit-learn就快很多。(慢的原因在于下面的classf:每预测一个测试样本都会重新fit一次,并对整个训练集调用一次score。)
import numpy as np
import operator
from os import listdir
from sklearn import neighbors
import sklearn
def classify0(inX, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed against every row of
    ``dataset``.

    Args:
        inX: Feature vector of the sample to classify, either 1-D
            ``[x1, x2, ...]`` or a single-row 2-D array.
        dataset: 2-D array of training samples, one sample per row.
        labels: Training labels; ``labels[i]`` belongs to ``dataset[i]``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of training samples are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the label that received
        its first vote from the closer neighbour wins.

    Raises:
        ValueError: If ``dataset`` has no rows or ``k`` is smaller than 1.
    """
    dataset = np.asarray(dataset)
    sampleCount = dataset.shape[0]
    if sampleCount == 0:
        raise ValueError("dataset must contain at least one sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Asking for more neighbours than exist used to raise IndexError.
    k = min(k, sampleCount)

    # Broadcasting subtracts inX from every row; no explicit tiling needed.
    distance = ((dataset - inX) ** 2).sum(axis=1) ** 0.5
    nearest = distance.argsort()[:k]

    classCount = {}
    for idx in nearest:
        voteLabel = labels[idx]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    # max() keeps the first maximal entry, i.e. the label seen earliest.
    return max(classCount.items(), key=operator.itemgetter(1))[0]
def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and labels.

    Every non-blank line has the form ``x1<TAB>x2<TAB>...<TAB>y``: all
    columns but the last are numeric features and the last column is an
    integer class label. The number of features is taken from the first
    non-blank line.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array with one
        row per sample, and the list of integer labels in the same order.
        An empty file yields ``(np.zeros((0, 0)), [])``.
    """
    # The context manager closes the file; blank lines are skipped so a
    # trailing newline at the end of the file does not break parsing.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]
    if not rows:
        return np.zeros((0, 0)), []

    numberOffeatures = len(rows[0]) - 1
    returnMat = np.zeros((len(rows), numberOffeatures))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(v) for v in fields[0:numberOffeatures]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Min-max scale every feature column of ``dataSet`` to ``[0, 1]``.

    Args:
        dataSet: 2-D NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        ``(dataSet - minVals) / ranges`` computed column-wise, ``ranges`` is
        the per-column ``max - min`` and ``minVals`` the per-column minimum.
        A column whose values are all equal (range 0) is mapped to 0.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0 so they become 0, not NaN.
    # The returned ranges are left untouched for callers.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Run a hold-out evaluation of ``classify0`` and print the error rate.

    The file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``hoRatio`` fraction of the rows is used as the
    test set and the remaining rows as the training set.

    Args:
        filename: Tab-separated data file to evaluate on.
        hoRatio: Fraction of the rows held out for testing.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training slice is the same for every test sample; build it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    # With too few rows the test set is empty; report 0 instead of dividing by 0.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("the total error rate is: %f" % errorRate)
    print('errorCount:', errorCount)
def img2vector(filename):
    """Read a 32x32 text image of digits into a ``(1, 1024)`` row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row by row, so the character at line ``i``,
    column ``j`` ends up at position ``32 * i + j``.

    Args:
        filename: Path of the text image file.

    Returns:
        A float NumPy array of shape ``(1, 1024)``.
    """
    side = 32
    returnVect = np.zeros((1, side * side))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(side):
            lineStr = fr.readline()
            returnVect[0, side * i:side * (i + 1)] = [
                int(lineStr[j]) for j in range(side)
            ]
    return returnVect
def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits1'):
    """Evaluate scikit-learn's kNN on the text-image digit files.

    Every file in ``trainingDir`` is loaded with ``img2vector`` as a
    training sample and every file in ``testDir`` is classified. The true
    digit is parsed from the file name, whose format is
    ``<digit>_<index>.txt`` (e.g. ``0_3.txt`` is an image of the digit 0).
    One line is printed per test file, followed by the error count and the
    error rate.

    Args:
        trainingDir: Directory containing the training images.
        testDir: Directory containing the test images.
    """
    # Load the training data.
    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = np.zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # drop the extension
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    # Fit the classifier once. Refitting it for every test sample is what
    # made the scikit-learn variant so slow.
    knn = neighbors.KNeighborsClassifier()
    knn.fit(trainingMat, hwLabels)

    # Classify the test data.
    testFileList = listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]  # drop the extension
        classNum = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        # predict() returns a 1-element array; unwrap it to a plain int so
        # the comparison below prints True/False rather than [ True].
        classifierResult = int(knn.predict(vectorUnderTest)[0])
        print("the classifier came back with: %d, the real answer is: %d, The predict result is: %s" % (classifierResult, classNum, classifierResult==classNum))
        if classifierResult != classNum:
            errorCount += 1.0
    print("\nthe total number of errors is: %d / %d" %(errorCount, mTest))
    # An empty test directory would otherwise divide by zero.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)
def classf(vectorUnderTest, trainingMat, hwLabels):
    """Fit a scikit-learn kNN classifier and predict ``vectorUnderTest``.

    A new ``KNeighborsClassifier`` with default settings is fitted on every
    call, so callers classifying many samples should fit one classifier
    themselves and reuse it.

    Args:
        vectorUnderTest: 2-D array of samples to classify, one per row.
        trainingMat: 2-D array of training samples, one per row.
        hwLabels: Training labels, one per row of ``trainingMat``.

    Returns:
        The array returned by ``predict``, one label per input row.
    """
    knn = neighbors.KNeighborsClassifier()
    knn.fit(trainingMat, hwLabels)
    # The former knn.score(trainingMat, hwLabels) call was dropped: its
    # result was never used, yet it classified the whole training set on
    # every call.
    return knn.predict(vectorUnderTest)
# Script entry point: run the handwritten-digit evaluation.
if __name__== "__main__":
    # datingClassTest() can be called here instead to evaluate the dating data set.
    handwritingClassTest()
只训练了三个样本的输出结果:
the classifier came back with: 0, the real answer is: 0, The predict result is: [ True]
the classifier came back with: 1, the real answer is: 1, The predict result is: [ True]
the classifier came back with: 2, the real answer is: 2, The predict result is: [ True]
the total number of errors is: 0 / 3
the total error rate is: 0.000000
推荐阅读
-
KNN分类算法
-
GlassFish替换Tomcat 博客分类: Java GlassfishTomcatEclipseJSP浏览器
-
连锁百货企业数据系统整理解决方案 博客分类: 综合解决方案 连锁百货企业财务管理企业信息化数据系统解决方案
-
Experiments in Streaming Content in Java ME(三)-----Back to RTPSourceStream and StreamingDataSource 博客分类: java
-
jQuery获取Select选择的Text和 Value[转] 博客分类: web开发 jQueryJavaScriptHTML
-
[转贴]做好个人的时间管理 博客分类: 杂七杂八 项目管理生活工作咨询活动
-
Spring Boot 踩坑之路之 Configuration Annotation Proessor not found in classpath 博客分类: springboot
-
spring aop的原理 博客分类: Java AOPSpringBean正则表达式配置管理
-
.NET可视化权限功能界面设计 博客分类: web .net可视化权限控制
-
Springboot JpaRepository findOne() 方法报错 博客分类: springboot