The k-Nearest Neighbors (k-NN) Algorithm
Implementing k-NN in code

The script below implements a bare-bones k-NN classifier with NumPy: it computes the Euclidean distance from the input point to every training sample, sorts the distances, and takes a majority vote among the k nearest labels.
import numpy as np

## Training data and the corresponding class labels
def create_dataset():
    group = np.array([[1.0, 2.0], [1.2, 0.1],
                      [0.1, 1.4], [0.3, 3.5]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
## Classify one point with k-NN
def classify(inX, dataSet, label, k):
    dataSize = dataSet.shape[0]
    ## Tile the input so it matches the shape of dataSet, then subtract
    diff = np.tile(inX, (dataSize, 1)) - dataSet
    sqdiff = diff ** 2
    ## Sum the squared differences along each row, one value per sample
    squareDist = sqdiff.sum(axis=1)
    dist = squareDist ** 0.5
    ## Sort the distances
    ## argsort() sorts the values in ascending order and returns the indices
    sortedDistIndex = np.argsort(dist)
    classCount = {}
    for i in range(k):
        ## The distances are already sorted, so walk sortedDistIndex directly
        voteLabel = label[sortedDistIndex[i]]
        ## Count how many of the k nearest samples fall in each class;
        ## get() returns 0 if the label is not yet in classCount
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1
    ## Pick the class that occurs most often among the k neighbors
    ## (ties are broken by insertion order, i.e. the nearer neighbor wins)
    maxCount = 0
    for key, value in classCount.items():
        if value > maxCount:
            maxCount = value
            classes = key
    return classes
data, labels = create_dataset()
test_point = [1.0, 2.0]  # renamed so it does not shadow the built-in input()
print(classify(test_point, data, labels, 2))
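For a quick cross-check, the same prediction can be reproduced with scikit-learn's KNeighborsClassifier (a minimal sketch, assuming scikit-learn is installed; note that with k=2 the two nearest neighbors here are one 'A' and one 'B', so the result depends on tie-breaking):

from sklearn.neighbors import KNeighborsClassifier

data, labels = create_dataset()
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(data, labels)
print(clf.predict([[1.0, 2.0]]))  # the test point coincides with training sample 'A'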
from math import sqrt

## Build a KD-tree: cycle through the features level by level, splitting
## on the median point of the current feature at each node
def createTree(dataSet, layer=0, feature=2):
    length = len(dataSet)
    dataSetCopy = dataSet[:]
    ## 'feature' is the number of dimensions; pick the split axis by layer
    featureNum = layer % feature
    dataSetCopy.sort(key=lambda x: x[featureNum])
    layer += 1
    if length == 0:
        return None
    elif length == 1:
        return {'Value': dataSet[0], 'Layer': layer, 'feature': featureNum,
                'Left': None, 'Right': None}
    else:
        midNum = length // 2
        dataSetLeft = dataSetCopy[:midNum]
        dataSetRight = dataSetCopy[midNum + 1:]
        return {'Value': dataSetCopy[midNum], 'Layer': layer, 'feature': featureNum,
                'Left': createTree(dataSetLeft, layer),
                'Right': createTree(dataSetRight, layer)}
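## For the sample trainingSet defined below, tracing the median splits above
## gives the following tree (a sketch for illustration):
##
##             (7, 2)               <- split on x (feature 0)
##            /      \
##        (5, 4)    (9, 6)          <- split on y (feature 1)
##        /    \     /
##    (2, 3) (4, 7) (8, 1)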
# Euclidean distance between two points
def calDistance(sourcePoint, targetPoint):
    total = 0.0  # renamed so it does not shadow the built-in sum()
    for i in range(len(targetPoint)):
        total += (sourcePoint[i] - targetPoint[i]) ** 2
    return sqrt(total)
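# Example: calDistance((0, 0), (3, 4)) returns sqrt(9 + 16) = 5.0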
# Descend the tree toward the target (depth-first), recording the path
def dfs(kdTree, target, tracklist=[]):
    tracklistCopy = tracklist[:]
    if not kdTree:
        return None, tracklistCopy
    elif not kdTree['Left']:
        ## Leaf node: the descent ends here
        tracklistCopy.append(kdTree['Value'])
        return kdTree['Value'], tracklistCopy
    else:
        pointValue = kdTree['Value']
        feature = kdTree['feature']
        tracklistCopy.append(pointValue)
        ## Follow the side of the splitting plane that contains the target
        if target[feature] <= pointValue[feature]:
            return dfs(kdTree['Left'], target, tracklistCopy)
        else:
            return dfs(kdTree['Right'], target, tracklistCopy)
# Find the node holding a given value in the KD-tree
def findPoint(Tree, value):
    if Tree is None:  # guard: a missing subtree must not be subscripted
        return None
    if Tree['Value'] == value:
        return Tree
    return findPoint(Tree['Left'], value) or findPoint(Tree['Right'], value)
# KD-tree backtracking search: walk back up the recorded path, checking at
# each step whether the sibling branch could hold a closer point
def kdTreeSearch(tracklist, target, usedPoint=[], minDistance=float('inf'), minDistancePoint=None):
    tracklistCopy = tracklist[:]
    usedPointCopy = usedPoint[:]
    if not minDistancePoint:
        minDistancePoint = tracklistCopy[-1]
    if len(tracklistCopy) == 1:
        ## Only the root is left on the path; the search is complete
        return minDistancePoint
    else:
        ## Note: findPoint uses the module-level kdTree built below
        point = findPoint(kdTree, tracklistCopy[-1])
        if calDistance(point['Value'], target) < minDistance:
            minDistance = calDistance(point['Value'], target)
            minDistancePoint = point['Value']
        fatherPoint = findPoint(kdTree, tracklistCopy[-2])
        fatherPointval = fatherPoint['Value']
        fatherPointfea = fatherPoint['feature']
        if calDistance(fatherPoint['Value'], target) < minDistance:
            minDistance = calDistance(fatherPoint['Value'], target)
            minDistancePoint = fatherPoint['Value']
        ## The sibling branch on the other side of the splitting plane
        if point == fatherPoint['Left']:
            anotherPoint = fatherPoint['Right']
        else:
            anotherPoint = fatherPoint['Left']
        ## Skip the sibling if it is empty, already visited, or provably
        ## farther than the current best (splitting-plane distance test)
        if (anotherPoint is None or anotherPoint['Value'] in usedPointCopy or
                abs(fatherPointval[fatherPointfea] - target[fatherPointfea]) > minDistance):
            usedPointCopy.append(tracklistCopy.pop())
            return kdTreeSearch(tracklistCopy, target, usedPointCopy, minDistance, minDistancePoint)
        else:
            ## Otherwise descend into the sibling branch and keep searching
            usedPointCopy.append(tracklistCopy.pop())
            subvalue, subtrackList = dfs(anotherPoint, target)
            tracklistCopy.extend(subtrackList)
            return kdTreeSearch(tracklistCopy, target, usedPointCopy, minDistance, minDistancePoint)
trainingSet = [(2, 3), (5, 4), (9, 6), (4, 7), (8, 1), (7, 2)]
kdTree = createTree(trainingSet)
target = eval(input('Input target point:'))  # enter a tuple, e.g. (3, 4.5)
value, trackList = dfs(kdTree, target)
nnPoint = kdTreeSearch(trackList, target)
print(nnPoint)
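To sanity-check the KD-tree search, a brute-force linear scan over trainingSet (a minimal sketch reusing calDistance) should return the same nearest neighbor:

def bruteForceNN(points, target):
    ## compare the distance from the target to every point
    return min(points, key=lambda p: calDistance(p, target))

print(bruteForceNN(trainingSet, target))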
k-NN example: predicting rental prices
Source: https://blog.csdn.net/Arwen_H/article/details/81978432
The script loads a listings dataset, keeps eight columns, cleans the price field, splits the rows into a training set and a test set, fits a 10-nearest-neighbor regressor, and evaluates it with RMSE.
import pandas as pd  # pandas for data handling
from sklearn.neighbors import KNeighborsRegressor  # k-nearest-neighbor regression model
from sklearn.metrics import mean_squared_error  # mean squared error metric

dc_listings = pd.read_csv(r'C:\Users\宋益东\Downloads\listings.csv')  # load the data; adjust the path to your own file
features = ['accommodates','bedrooms','bathrooms','beds','price','minimum_nights','maximum_nights','number_of_reviews']  # the dataset is large, so keep only these 8 columns
dc_listings = dc_listings[features]  # select the needed columns, overwriting the original frame
dc_listings.head()  # preview the data (useful in a notebook) to get a first impression
dc_listings['price'] = dc_listings.price.str.replace(r'\$|,', '', regex=True).astype(float)  # strip '$' and ',' and convert price to float
dc_listings = dc_listings.dropna()  # drop rows with missing values
normalized_listings = dc_listings  # note: an alias, not a copy, and no scaling is applied despite the name
norm_train_df = normalized_listings.copy().iloc[0:2792]  # training set: the first 2792 samples
norm_test_df = normalized_listings.copy().iloc[2792:]  # test set: the remaining 879 samples
cols = ['accommodates','bedrooms','bathrooms','beds','minimum_nights','maximum_nights','number_of_reviews']  # feature columns used for training
knn = KNeighborsRegressor(10)  # set n_neighbors to 10, keep the other defaults
knn.fit(norm_train_df[cols], norm_train_df['price'])  # X: feature columns, y: the target price
two_features_predictions = knn.predict(norm_test_df[cols])  # predict on the test set
two_features_mse = mean_squared_error(norm_test_df['price'], two_features_predictions)
two_features_rmse = two_features_mse ** (1/2)
print(two_features_rmse)  # RMSE on the test set; tune the neighbor count based on this result
print(knn.predict([[1,3,3,3,1,30,0]]))  # describe your own listing and predict its price
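Because k-NN is distance-based, columns with large ranges such as maximum_nights can dominate the Euclidean distance. A minimal sketch of standardizing the features first with scikit-learn's StandardScaler (my addition; despite the normalized_listings name, the code above never scales the data):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(norm_train_df[cols])  # fit the scaler on the training set only
X_test = scaler.transform(norm_test_df[cols])        # reuse the same scaling for the test set

knn_scaled = KNeighborsRegressor(10)
knn_scaled.fit(X_train, norm_train_df['price'])
scaled_rmse = mean_squared_error(norm_test_df['price'], knn_scaled.predict(X_test)) ** 0.5
print(scaled_rmse)  # compare against the unscaled RMSE above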