DBSCAN密度聚类
程序员文章站
2022-07-03 11:47:34
...
def distance(data):
    """Return the pairwise Euclidean distance matrix of the rows of ``data``.

    Args:
        data: 2-D array-like (ndarray, matrix or nested list) with one
            sample per row.

    Returns:
        An ``m x m`` ``np.matrix`` whose entry ``[i, j]`` is the Euclidean
        distance between row ``i`` and row ``j``.  The matrix is symmetric
        with a zero diagonal.

    Raises:
        ValueError: If ``data`` is not two-dimensional.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim != 2:
        raise ValueError("data must be a 2-D array with one sample per row")

    m = points.shape[0]
    dis = np.zeros((m, m))
    # One vectorised pass per row: avoids the O(m^2 * n) pure-Python loop
    # without materialising an m x m x n temporary.
    for i in range(m):
        diff = points - points[i]
        dis[i] = np.sqrt((diff * diff).sum(axis=1))

    # A matrix (not a plain ndarray) is returned so that row indexing such as
    # ``dis[i,]`` keeps yielding a 1 x m row.  ``np.asmatrix`` is used because
    # ``np.mat`` no longer exists in NumPy 2.
    return np.asmatrix(dis)
def find_eps(distance_D, eps):
    """Return the column indices in the first row that are within ``eps``.

    Args:
        distance_D: 2-D array-like of shape ``1 x n`` holding the distances
            from one point to every point.  Only row 0 is read.
        eps: Neighbourhood radius; the comparison is inclusive (``<=``).

    Returns:
        A list of plain ``int`` column indices ``j``, in ascending order,
        for which ``distance_D[0, j] <= eps``.

    Raises:
        IndexError: If ``distance_D`` is not at least two-dimensional.
    """
    # Converting to ndarray first makes row 0 a flat 1-D vector even when a
    # np.matrix is passed in (matrix indexing would keep it 2-D).
    row = np.asarray(distance_D)[0, :]
    return np.flatnonzero(row <= eps).tolist()
def dbscan(data, eps, MinPts):
    """Cluster the rows of ``data`` with the DBSCAN density-based algorithm.

    A point's eps-neighbourhood is every point (itself included) whose
    distance to it is ``<= eps``.  A point is a *core* point when that
    neighbourhood holds at least ``MinPts + 1`` entries, i.e. at least
    ``MinPts`` points other than itself.  Clusters grow only through core
    points: a non-core point inside a core point's neighbourhood joins the
    cluster as a *border* point but does not pull its own neighbours in.
    Everything not reached from any core point is *noise*.

    Args:
        data: 2-D array-like with one sample per row.
        eps: Neighbourhood radius.
        MinPts: Minimum number of other points within ``eps`` for a point
            to count as a core point.

    Returns:
        A tuple ``(types, sub_class)`` of two ``1 x m`` float matrices:

        * ``types[0, i]`` is ``1`` for a core point, ``0`` for a border
          point and ``-1`` for noise.
        * ``sub_class[0, i]`` is the cluster number (``1, 2, ...`` in order
          of discovery) or ``-1`` for noise.
    """
    from collections import deque

    m = np.shape(data)[0]

    # Work on a plain 2-D ndarray regardless of what ``distance`` returns.
    dis = np.asarray(distance(data))

    # Neighbourhood of every point, computed once.  The 1 x m slice keeps
    # the two-dimensional row shape that ``find_eps`` indexes as [0, j].
    neighbours = [find_eps(dis[i:i + 1], eps) for i in range(m)]
    is_core = np.array(
        [len(near) >= MinPts + 1 for near in neighbours], dtype=bool
    )

    # 0 means "not assigned to any cluster yet".
    labels = np.zeros(m)
    number = 1

    for seed in range(m):
        # Only an unassigned core point can start a new cluster.
        if labels[seed] != 0 or not is_core[seed]:
            continue

        labels[seed] = number
        frontier = deque([seed])
        while frontier:
            current = frontier.popleft()
            for other in neighbours[current]:
                if labels[other] != 0:
                    # Already claimed (by this or an earlier cluster).
                    continue
                labels[other] = number
                # Border points join the cluster but are not expanded;
                # only core points propagate density-reachability.
                if is_core[other]:
                    frontier.append(other)
        number += 1

    # Whatever no cluster reached is noise.
    labels[labels == 0] = -1
    kinds = np.where(is_core, 1.0, np.where(labels > 0, 0.0, -1.0))

    # Returned as 1 x m matrices so ``result.tolist()[0]`` and
    # ``result[0, i]`` keep working for callers.
    return np.asmatrix(kinds), np.asmatrix(labels)
# Demo: cluster the two-moons toy data set with dbscan() and plot the result.
# NOTE(review): `plt` (and `np`, used by the functions this script calls) are
# not imported in the visible part of the file -- presumably they are imported
# in the elided section; confirm.
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# `make_moons` is exported directly from `sklearn.datasets`.
from sklearn.datasets import make_moons

X, y_true = make_moons(n_samples=1000, noise=0.15)

# Plot the generated samples coloured by their true moon membership.
plt.scatter(X[:, 0], X[:, 1], c=y_true)
plt.show()

data = X
# NOTE(review): eps=0.05 with MinPts=40 looks very strict for 1000 samples;
# it may label most points as noise -- confirm these are the intended values.
eps = 0.05
MinPts = 40
type1, class1 = dbscan(data, eps, MinPts)

# Plot the same samples coloured by the cluster number found by dbscan().
plt.scatter(X[:, 0], X[:, 1], c=class1.tolist()[0])
plt.show()
上一篇: PHP根据二维数组中某个字段排序