k-均值聚类
程序员文章站
2022-07-14 11:41:16
...
1、k-均值聚类
1.1、伪代码
创建k个点作为起始质心(经常是随机选择)
当任意一个点的簇分配结果发生改变时
对数据集中的每个数据点
. 对每个质心
计算质心与数据点之间的距离
将数据点分配到距其最近的簇
对每一个簇,计算簇中所有点的均值并将均值作为质心
当任意一个点的簇分配结果发生改变时
对数据集中的每个数据点
. 对每个质心
计算质心与数据点之间的距离
将数据点分配到距其最近的簇
对每一个簇,计算簇中所有点的均值并将均值作为质心
1.2、核心代码
from numpy import *
#将数据集每一行按照tab符号分割,并转为float类型,添加到dataMat列表里面
def loadDataSet(fileName): #general function to parse tab -delimited floats
dataMat = [] #assume last column is target value
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
fltLine = list(map(float,curLine)) #map all elements to float()
dataMat.append(fltLine)
# dataMat = mat(dataMat)
return dataMat
#计算两个向量的欧氏距离
def distEclud(vecA, vecB):
return sqrt(sum(power(vecA - vecB, 2))) #la.norm(vecA-vecB)
#找到k个随机质心的集合
def randCent(dataSet, k):
#取出dataset的列数
n = shape(dataSet)[1]
#创建一个k行n列的零矩阵
centroids = mat(zeros((k,n)))#create centroid mat
dataSet = mat(dataSet)
for j in range(n):#create random cluster centers, within bounds of each dimension
#找到每一列的最小值
minJ = min(dataSet[:,j])
#计算出每一列最大值-最小值
rangeJ = float(max(dataSet[:,j]) - minJ)
#random.rand(k,1) k行1列的0-1之间的随机数
centroids[:,j] = mat(minJ + rangeJ * random.rand(k,1))
return centroids
'''
k-均值算法
dataSet:数据集
k:簇的个数
distMeas=distEclud:可选参数,计算欧氏距离
createCent=randCent:可选参数,计算k个随机质心的集合
'''
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
#获取数据集的行数
m = shape(dataSet)[0]
#簇分配结果矩阵
#一列用来记录簇索引值,一列用来记录存储误差:当前点到簇质心的距离
clusterAssment = mat(zeros((m,2)))#create mat to assign data points
#to a centroid, also holds SE of each point
centroids = createCent(dataSet, k)
#标志变量,控制迭代的终止
clusterChanged = True
while clusterChanged:
clusterChanged = False
#遍历所有的数据点,找到距离最近的质心
#对于每个数据点遍历所有的质心
for i in range(m):#for each data point assign it to the closest centroid
#定义最小的距离为无穷大
minDist = inf;
#最小距离的索引值为-1
minIndex = -1
#遍历所有的质心
dataSet = mat(dataSet)
for j in range(k):
#计算每个数据点到所有质心的距离
distJI = distMeas(centroids[j,:],dataSet[i,:])
if distJI < minDist:
minDist = distJI;
minIndex = j
#对于当前数据点找到了距离最小的质心之后
#如果簇分配结果矩阵的索引值不等于minIndex
if clusterAssment[i,0] != minIndex:
#改变标志位,簇发生了改变
clusterChanged = True
clusterAssment[i,:] = minIndex,minDist**2
#遍历完所有数据点之后,打印
print (centroids)
#遍历每个簇,重新计算质心,将属于该簇的所有点的均值作为质心
for cent in range(k):#recalculate centroids
#(clusterAssment[:,0]每个数据点对应的最小距离质心的索引值
#nonzero(clusterAssment[:,0].A==cent)[0]找出每个数据点对应的最小距离质心的索引值=cent的行索引
b = clusterAssment[:,0].A==cent
a = nonzero(b)
c = a[0]
d = dataSet[c]
ptsInClust = d #get all the point in this cluster
# mean(ptsInClust, axis=0) 沿着矩阵的列方向进行均值计算 对各列求均值
centroids[cent,:] = mean(ptsInClust, axis=0) #assign centroid to mean
return centroids, clusterAssment
# return b,a,c,d
# if __name__ == '__main__':
# dataset = loadDataSet('E:\\Python36\\testSet.txt')
# centroids, clusterAssment = kMeans(dataset,4)
# print(centroids, clusterAssment)
# print(centroids, clusterAssment)
# return b
帮助理解的变量b,a,c,d
>>> b
array([[False],
[False],
[ True],
[False],
[False],
.
.
.
.
[False]], dtype=bool)
>>> a
(array([ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 66,
70, 74, 78], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64))
>>> c
array([ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 66,
70, 74, 78], dtype=int64)
>>> d
matrix([[ 4.838138, -1.151539],
[ 0.450614, -3.302219],
[ 3.165506, -3.999838],
[ 0.704199, -0.479481],
[ 2.943496, -3.357075],
[ 2.190101, -1.90602 ],
[ 2.592976, -2.054368],
[ 0.939512, -4.023563],
[ 4.372646, -0.822248],
[ 2.83652 , -2.658556],
[ 3.367037, -3.184789],
[ 3.091414, -3.815232],
[ 2.127073, -2.98368 ],
[ 2.624081, -3.260715],
[ 1.864375, -3.176309],
[ 3.491078, -3.947487],
[ 2.280615, -2.559444],
[ 3.031012, -3.620252],
[ 4.668892, -2.562705],
[ 4.479332, -1.764772]])
>>>
结果给出了4个质心>>> centroids, clusterAssment= kmeans.kMeans(dataset,4)
[[-3.75936799 -1.09260055]
[-4.02437446 -3.08040251]
[-4.21145728 4.13632112]
[ 1.2397954 2.19994591]]
[[-3.018601 -0.7139044 ]
[-3.02164253 -3.178224 ]
[-2.64711247 3.04478547]
[ 2.5346791 0.55190468]]
[[-3.083058 -0.4696565 ]
[-2.9890561 -3.0737179 ]
[-2.45009747 2.89275747]
[ 2.82102866 0.39129192]]
1.3、k-均值聚类的缺点
1、存在局部最小值,而非全局最小值
2、由于簇数k是由用户自己指定的,不知道是否合理
3、不知道生成的簇是否是最优的,误差距离最小的
4、最初的质心是随机生成的
2、对k-均值聚类的改进
2.1改进途径一
将具有最大SSE(Sum of squared Error)的簇进行划分成两个簇,将该簇包含的点筛选出来进行k-均值算法(这里k=2)
,此时会增加一个簇,所以就需要对其中两个簇进行合并。
合并的方法可以是:
- 合并两个最近的质心(计算所有质心两两之间的距离)
- 合并两个使得SSE增加最小的质心(计算合并前后的SSE值,首先是合并后<合并前,其次是合并后的SSE增量最小)
2.2改进途径二
二分k-均值算法
将所有点看成一个簇
当簇数目小于K时
对于每一个簇
计算总误差
在给定的簇上面进行K-均值聚类(k=2)
计算将该簇一分为二之后的总误差
选择使得误差最小的那个簇进行划分操作
2.2.1代码实现
def biKmeans(dataSet, k, distMeas=distEclud):
dataSet = mat(dataSet)
m = shape(dataSet)[0]
#簇分配矩阵
clusterAssment = mat(zeros((m,2)))
#计算每列的均值,初始化质心,将所有点看成一个簇
'''
>>> mean(dataset,axis=0)
matrix([[-0.15772275, 1.22533012]])
>>> mean(dataset,axis=0).tolist()
[[-0.15772275000000002, 1.2253301166666664]]
>>> mean(dataset,axis=0).tolist()[0]
[-0.15772275000000002, 1.2253301166666664]
'''
centroid0 = mean(dataSet, axis=0).tolist()[0]
#创建质心列表
centList =[centroid0] #create a list with one centroid
#计算每个数据点到质心的欧氏距离,将之存储在簇分配矩阵的第二列
for j in range(m):#calc initial Error
clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
#当簇的个数小于用户指定的个数时
while (len(centList) < k):
#将距离平方和设置为无穷大
lowestSSE = inf
#遍历每个簇
for i in range(len(centList)):
#找到数据集中所有属于簇i的点,取出来
ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]#get the data points currently in cluster i
#重新计算该簇的质心和簇分配列表
centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
#计算所有点到该质心的距离之和
sseSplit = sum(splitClustAss[:,1])#compare the SSE to the currrent minimum
#统计不属于该簇的点距离该质心的距离之和
sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
print ("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
#如果分配之后的误差小于最初的误差,保存此次划分
if (sseSplit + sseNotSplit) < lowestSSE:
#将该簇更新为最佳划分簇
bestCentToSplit = i
#将该质心作为最佳新质心
bestNewCents = centroidMat
#将质心对应所有点的距离作为最佳簇分配列表
bestClustAss = splitClustAss.copy()
#将该簇的SSE作为最小SSE
lowestSSE = sseSplit + sseNotSplit
#更新簇的分配结果
#使用kmeans函数并且指定簇数为2时会得到两个编号为0和1的簇
#取出簇编号为1的簇编号更新为簇列表的长度
#新增的簇编号
bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) #change 1 to 3,4, or whatever
#编号为0的簇,为最佳分裂簇的编号
#另外一个簇的编号为被分割的簇的编号
bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
print ('the bestCentToSplit is: ',bestCentToSplit)
print ('the len of bestClustAss is: ', len(bestClustAss))
#添加分裂的质心到质心列表中
#更新被分割的编号的质心
centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]#replace a centroid with two best centroids
#添加新的簇质心
centList.append(bestNewCents[1,:].tolist()[0])
#更新原来的簇分配列表
clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss#reassign new clusters, and SSE
return mat(centList), clusterAssment
2.2.2运行:
>>> centlist,mynewassements = kmeans.biKmeans(dataset,3)
[[-4.42137474 -2.52685636]
[-1.1132596 -4.05926245]]
[[-2.94737575 3.3263781 ]
[ 1.23710375 0.17480612]]
sseSplit, and notSplit: 570.722757425 0.0
the bestCentToSplit is: 0
the len of bestClustAss is: 60
[[-0.91346951 3.95162102]
[-3.38936877 3.79724626]]
[[-1.71356433 3.538928 ]
[-3.47615207 3.23528529]]
[[-1.76576557 3.39794014]
[-3.58362738 3.28784469]]
sseSplit, and notSplit: 22.9717718963 532.659806789
[[ 1.23912026 0.33664571]
[-0.60107285 -3.21699603]]
[[ 2.93386365 3.12782785]
[-0.45965615 -2.7782156 ]]
sseSplit, and notSplit: 68.6865481262 38.0629506357
the bestCentToSplit is: 1
the len of bestClustAss is: 40
>>> centlist
matrix([[-2.94737575, 3.3263781 ],
[ 2.93386365, 3.12782785],
[-0.45965615, -2.7782156 ]])
>>> mynewassements
matrix([[ 1.00000000e+00, 1.45461050e-01],
[ 0.00000000e+00, 6.80213825e-01],
[ 2.00000000e+00, 1.02184582e+00],
[ 1.00000000e+00, 1.34548760e+00],
[ 0.00000000e+00, 1.35376464e+00],
[ 2.00000000e+00, 3.87167519e+00],
[ 1.00000000e+00, 8.37259951e-01],
[ 0.00000000e+00, 2.20116272e-01],
[ 2.00000000e+00, 3.53809057e+00],
[ 1.00000000e+00, 7.44081160e+00],
[ 0.00000000e+00, 5.28070040e+00],
[ 2.00000000e+00, 2.56674394e-02],
[ 1.00000000e+00, 1.11946529e+00],
2.2.3画出二分-k均值质心分配图
#二分-k均值
import matplotlib.pyplot as pl
def plotBi():
dataMat = mat(loadDataSet('E:\\Python36\\testSet2.txt'))
centroids, clusterAssment = biKmeans(dataMat, 4)
pl.plot(centroids[:, 0], centroids[:, 1], 'ro')
pl.plot(dataMat[:, 0], dataMat[:, 1], 'go')
pl.show()
上一篇: 分级聚类、K均值聚类
下一篇: K均值聚类