【ML】K近邻算法(2)
程序员文章站
2022-07-14 13:40:32
...
前文链接
距离函数
根据距离计算公式,计算两点之间的距离
import math
from itertools import combinations
def L(x, y, p=2):
    """Minkowski distance of order p between two equal-length points.

    Returns -1 for invalid input (empty point, length mismatch, or p < 1),
    matching the original error convention.
    """
    if not x or not y or len(x) != len(y) or p < 1:
        return -1
    total = sum(math.pow(abs(a - b), p) for a, b in zip(x, y))
    return math.pow(total, 1 / p)


print(L([1, 2], [3, 7]))
KNN
使用KNN算法在iris数据集上进行分类预测
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
# Load the iris data set and wrap it in a labelled DataFrame.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
# Shorter column names; fixes the misspelled 'petal lenght' of the original.
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
# Keep the first 100 samples (classes 0 and 1 only) and just the first two
# features plus the label, so the problem is 2-D and linearly separable.
data = np.array(df.iloc[:100, [0, 1, -1]])
X, y = data[:, :-1], data[:, -1]  # features / labels
# 80/20 train-test split (no fixed random_state, as in the original).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
class KNN:
    """Brute-force k-nearest-neighbour classifier.

    Parameters
    ----------
    X_train, y_train : array-like
        Training samples and their labels.
    k_neighbors : int
        Number of neighbours consulted for the majority vote.
        Clamped to the number of training samples (the original
        crashed when k_neighbors exceeded len(X_train)).
    p : int
        Order of the Minkowski distance (2 = Euclidean).
    """

    def __init__(self, X_train, y_train, k_neighbors=3, p=2):
        # Never ask for more neighbours than there are training samples.
        self.k = min(k_neighbors, len(X_train))
        self.p = p
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the majority label among the k training points closest to X."""
        # Maintain the k best (distance, label) pairs seen so far,
        # seeded with the first k training samples.
        knn_pts = [(np.linalg.norm(X - self.X_train[i], ord=self.p), self.y_train[i])
                   for i in range(self.k)]
        # Scan the remaining samples, replacing the current worst candidate
        # whenever a strictly closer sample is found.
        for i in range(self.k, len(self.X_train)):
            worst = max(range(self.k), key=lambda j: knn_pts[j][0])
            d = np.linalg.norm(X - self.X_train[i], ord=self.p)
            if d < knn_pts[worst][0]:
                knn_pts[worst] = (d, self.y_train[i])
        # Majority vote over the k nearest labels (ties broken arbitrarily).
        votes = Counter(label for _, label in knn_pts)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return classification accuracy on (X_test, y_test)."""
        correct = sum(1 for X, y in zip(X_test, y_test) if self.predict(X) == y)
        return correct / len(X_test)
def demo_1():
    """Train the hand-written KNN, report accuracy, and plot the data."""
    clf = KNN(X_train, y_train, k_neighbors=5)
    ans = clf.score(X_test, y_test)
    print('the correct ratio is {}'.format(ans))
    test_pt = [6.0, 3.0]
    # Scatter the two classes (first 50 rows = class 0, next 50 = class 1).
    for lo, hi, lbl, col in ((0, 50, '0', 'blue'), (50, 100, '1', 'yellow')):
        plt.scatter(df[lo:hi]['sepal length'], df[lo:hi]['sepal width'],
                    label=lbl, color=col, alpha=0.8)
    # Mark the test point with a red cross.
    plt.plot(test_pt[0], test_pt[1], 'rx', label='test point')
    plt.xlabel('sepal length')
    plt.ylabel('sepal width')
    plt.legend()
    plt.show()


demo_1()
数据分类预测结果如下
可以直接调用scikit-learn
包中函数
# scikit-learn's built-in implementation. NOTE: this rebinds the name KNN,
# shadowing the hand-written class defined earlier in the file.
from sklearn.neighbors import KNeighborsClassifier as KNN
clf = KNN()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)
参数说明:
- n_neighbors:近邻点的个数
- p:Minkowski 距离的阶数参数
- algorithm:近邻搜索算法,可选 auto、ball_tree、kd_tree、brute
- weights:确定近邻点的投票权重
KD树
KD树是二叉树到 $k$ 维向量的扩展:在每一层选择一个特征(维度)进行划分。KD树的实现主要包括建树和查询两部分。保持平衡的KD树查询的时间复杂度为 $O(\log N)$;但是当维数 $k$ 接近样本数 $N$ 时,复杂度会趋向于 $O(N)$。
代码
from math import sqrt
from collections import namedtuple
class KDNode(object):
    """A single node of a k-d tree."""

    def __init__(self, sample, split, left, right):
        # Child subtrees (KDNode or None).
        self.left = left
        self.right = right
        # Index of the dimension this node splits on.
        self.split = split
        # The k-dimensional sample point stored at this node.
        self.sample = sample
class KDTree(object):
    """Builds a k-d tree from a list of k-dimensional points.

    Splitting cycles through the dimensions level by level and always
    partitions at the median point along the current dimension.
    """

    def __init__(self, data):
        dims = len(data[0])  # dimensionality k of the points

        def build(axis, points):
            # Empty point set -> empty subtree.
            if not points:
                return None
            # Sort along the current axis and split at the median.
            points.sort(key=lambda pt: pt[axis])
            mid = len(points) // 2
            next_axis = (axis + 1) % dims  # cycle to the next dimension
            return KDNode(points[mid], axis,
                          build(next_axis, points[:mid]),
                          build(next_axis, points[mid + 1:]))

        self.root = build(0, data)  # start splitting on dimension 0
# Result record returned by find(): the nearest sample, its distance,
# and how many tree nodes the search visited.
ans = namedtuple('Res', ['nearest_point', 'nearest_dist', 'nodes_visited'])
# Pre-order (node, left, right) traversal printing each node's sample.
def preorder(root):
    stack = [root]
    while stack:
        node = stack.pop()
        print(node.sample)
        # Push the right child first so the left subtree is printed first,
        # matching the recursive pre-order of the original.
        if node.right:
            stack.append(node.right)
        if node.left:
            stack.append(node.left)
# Query the kd tree for the nearest neighbour of a point.
def find(tree, point):
    """Return the nearest neighbour of `point` in the k-d tree `tree`.

    The result is the module-level namedtuple `ans`:
    (nearest_point, nearest_dist, nodes_visited).
    """
    k=len(point) # dimensionality of the query point
    def travel(kd_node, target, max_dist):
        # Empty subtree: report an infinite distance so any real node wins.
        if not kd_node: return ans([0]*k, float('inf'), 0)
        nodes_vis=1
        d=kd_node.split # dimension this node splits on
        pivot=kd_node.sample
        if target[d]<=pivot[d]: # target lies on the left side of the split
            near_node=kd_node.left
            ne_node=kd_node.right
        else:
            near_node=kd_node.right # target lies on the right side of the split
            ne_node=kd_node.left
        t=travel(near_node, target, max_dist) # descend first into the subtree containing the target
        nearest=t.nearest_point # best candidate found so far
        dist=t.nearest_dist # its distance to the target
        nodes_vis+=t.nodes_visited
        # Shrink the search radius to the best distance found so far.
        if dist<max_dist: max_dist=dist
        cur_dist=abs(pivot[d]-target[d]) # distance from target to this node's splitting hyperplane
        # If the candidate hypersphere does not cross the hyperplane,
        # the far subtree cannot contain anything closer: prune it.
        if max_dist<cur_dist: return ans(nearest, dist, nodes_vis)
        # Check the pivot point itself (Euclidean distance).
        cur_dist=sqrt(sum((p1-p2)**2 for p1, p2 in zip(pivot, target)))
        if cur_dist<dist:
            nearest=pivot
            dist=cur_dist
            max_dist=dist
        # The sphere crosses the hyperplane: search the far subtree too.
        t2=travel(ne_node, target, max_dist)
        nodes_vis+=t2.nodes_visited
        if t2.nearest_dist<dist:
            nearest=t2.nearest_point
            dist=t2.nearest_dist
        return ans(nearest, dist, nodes_vis)
    return travel(tree.root, point, float('inf'))
def demo():
    """Build the textbook 2-D example tree, print it, and query one point."""
    points = [[2, 3], [5, 4], [9, 6], [4, 7], [8, 1], [7, 2]]
    tree = KDTree(points)
    preorder(tree.root)
    # Nearest neighbour of (3, 4.5); also avoids shadowing the module-level
    # `ans` namedtuple the way the original local did.
    result = find(tree, [3, 4.5])
    print(result)


demo()
from random import random
def demo2():
    """Nearest-neighbour query over 1005 random 3-D points."""
    def random_pts(k, n):
        # n random vectors, each with k components drawn from [0, 1).
        return [[random() for _ in range(k)] for _ in range(n)]

    n_samples = 1005
    kd = KDTree(random_pts(3, n_samples))
    print(find(kd, [0.1, 0.5, 0.8]))


demo2()
Ball-Tree
相关论文下载链接:ball tree and kd tree
ball tree 的数据结构是建立以半径为基础的邻域:以样本 $c$ 为中心的球在形式上满足 $\|x - c\| \le r$,其中 $r$ 为半径,通过将较小的球嵌入较大的球进行建树。树中每个样本点恰好属于单个球,这样可以使得查询的计算复杂度保持在 $O(\log N)$;根据球的属性来测量点和球心之间的距离,以判断该点是否属于该球。
上一篇: tomcat bio nio apr