学习笔记【机器学习重点与实战】——10 聚类算法实现与实战
程序员文章站
2022-07-14 19:29:22
...
1 K-Means算法实现
K-Means算法过程伪代码表示如下:
创建k个点作为起始质心(经常是随机选择)
当任意一个点的簇分配结果发生改变时
    对数据集中的每个数据点
        对每个质心
            计算质心与数据点之间的距离
        将数据点分配到距其最近的簇
    对每一个簇,计算簇中所有点的均值并将均值作为质心
实现代码如下:
def distEclud(vecA, vecB):
    """
    Euclidean distance between two row vectors.

    Parameters
    ----------
    :param vecA: first vector (1 x n matrix)
    :param vecB: second vector (1 x n matrix)

    Returns
    -------
    Euclidean distance as a scalar float.
    """
    diff = vecA - vecB
    # element-wise square, then total — equivalent to linalg.norm(vecA - vecB)
    return sqrt(power(diff, 2).sum())
def randCent(dataSet, k):
    """
    Create k random centroids inside the bounding box of the data.

    Parameters
    ----------
    :param dataSet: sample set (m x n matrix)
    :param k: number of centroids to create

    Returns
    -------
    :param centroids: k x n matrix of random centroids
    """
    numFeat = shape(dataSet)[1]             # number of features
    centroids = mat(zeros((k, numFeat)))    # pre-allocated centroid matrix
    # draw each feature uniformly between its observed min and max
    for feat in range(numFeat):
        lo = min(dataSet[:, feat])
        span = float(max(dataSet[:, feat]) - lo)
        centroids[:, feat] = mat(lo + span * random.rand(k, 1))
    return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """
    K-means clustering.

    Parameters
    ----------
    :param dataSet: sample set (m x n matrix)
    :param k: number of clusters
    :param distMeas: distance function (default: Euclidean)
    :param createCent: factory for the initial centroids (default: random)

    Returns
    -------
    :param centroids: k x n matrix of final centroids
    :param clusterAssment: m x 2 matrix (assigned centroid index, squared distance)
    """
    m = shape(dataSet)[0]                 # number of samples
    clusterAssment = mat(zeros((m, 2)))   # (centroid index, squared distance to it)
    centroids = createCent(dataSet, k)    # k initial centroids
    clusterChanged = True                 # loop until no assignment changes
    while clusterChanged:
        clusterChanged = False
        for i in range(m):                # assign each sample ...
            minDist = inf
            minIndex = -1
            for j in range(k):            # ... to its nearest centroid
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True     # at least one sample moved cluster
            clusterAssment[i, :] = minIndex, minDist ** 2
        # recompute every centroid as the mean of its members
        # (per-iteration debug print of centroids removed)
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # FIX: an empty cluster would make mean() return NaN and corrupt
            # the centroid matrix for the rest of the run; keep the previous
            # centroid instead so the algorithm stays well-defined.
            if len(ptsInClust) > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
sklearn中代码实现如下:
# scikit-learn equivalent: KMeans with plain random centroid initialisation
# (init='random' matches the randCent-style seeding, not k-means++)
from sklearn.cluster import KMeans
model = KMeans(n_clusters=4, init='random')
2 二分K-Means算法实现
二分K-Means算法过程伪代码表示如下:
将所有点看成一个簇
当簇数目小于 k 时
    对于每一个簇
        计算总误差
        在给定的簇上面进行 KMeans 聚类(k=2)
        计算将该簇一分为二之后的总误差
    选择使得误差最小的那个簇进行划分操作
实现代码如下:
def biKmeans(dataSet, k, distMeas=distEclud):
    """
    Bisecting K-means clustering.

    Starts with a single cluster and repeatedly 2-way-splits the cluster
    whose split yields the lowest total SSE, until k clusters exist.

    Parameters
    ----------
    :param dataSet: sample set (m x n matrix)
    :param k: target number of clusters
    :param distMeas: distance function (default: Euclidean)

    Returns
    -------
    :param centroids: k x n matrix of centroids
    :param clusterAssment: m x 2 matrix (assigned centroid index, squared distance)
    """
    m = shape(dataSet)[0]                 # number of samples
    clusterAssment = mat(zeros((m, 2)))   # (centroid index, squared distance)
    # FIX: mean(...) returns a 1 x n matrix and indexing it with [0] keeps it
    # a matrix, so centList mixed matrices with plain lists and mat(centList)
    # failed at the end.  Convert to a plain list of floats instead.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]                # centroid list starts with one centroid
    for j in range(m):                    # initial squared error of every sample
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataSet[j, :]) ** 2
    while (len(centList) < k):            # keep splitting until k clusters
        lowestSSE = inf                   # best (lowest) total SSE seen so far
        for i in range(len(centList)):
            # samples currently assigned to cluster i
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:, 0].A == i)[0], :]
            # try a 2-means split of this cluster
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
            # SSE of the split cluster after splitting
            sseSplit = sum(splitClustAss[:, 1])
            # SSE of all the other samples (unchanged by this split)
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            # keep the split that minimises the total error
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # relabel the split result: sub-cluster 1 becomes a brand-new cluster id,
        # sub-cluster 0 keeps the id of the cluster that was split
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        # update the split centroid and append the new one
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # write the new assignments back for the samples of the split cluster
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return mat(centList), clusterAssment
3 聚类实战
3.1 K-Means
使用sklearn.datasets.make_blobs生成三维聚类数据集,并做适当变换。通过KMeans聚类,使用k-means++初始化方法来对生成数据集进行聚类,实现代码如下:
import numpy as np
import matplotlib.colors
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, adjusted_mutual_info_score,\
    adjusted_rand_score, silhouette_score
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

if __name__ == "__main__":
    # Build the sample sets
    N = 400       # number of samples
    centers = 4   # number of clusters
    # 3-D Gaussian data set with four clusters
    data, y = ds.make_blobs(N, n_features=3, centers=centers, random_state=0)
    # same generator but with unequal per-cluster standard deviations
    data2, y2 = ds.make_blobs(N, n_features=3, centers=centers, cluster_std=(1,2.5,0.5,2), random_state=0)
    # unbalanced data set: 100/50/20/5 samples per cluster
    data3 = np.vstack((data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5]))
    y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5)
    # data set obtained by linearly transforming (rotating/stretching) `data`
    m = np.array(((1, 1, 1), (1, 6, 3), (6, 2, 1)))
    data_r = data.dot(m)
    # pairs of (ground truth, clustered) panels share the same data set
    data_list = data, data, data_r, data_r, data2, data2, data3, data3
    y_list = y, y, y, y, y2, y2, y3, y3
    titles = '原始数据', 'KMeans++聚类', '旋转后数据', '旋转后KMeans++聚类', '方差不相等数据', '方差不相等KMeans++聚类', '数量不相等数据', '数量不相等KMeans++聚类'
    # KMeans model using k-means++ centroid initialisation
    model = KMeans(n_clusters=4, init='k-means++')
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    cm = matplotlib.colors.ListedColormap(list('rgbm'))
    fig = plt.figure(figsize=(8, 9), facecolor='w')
    # NOTE: the loop variable `y` shadows the outer ground-truth array `y`;
    # each iteration rebinds it to the matching entry of y_list.
    for i, (x, y, title) in enumerate(zip(data_list, y_list, titles), start=1):
        if i % 2 == 1:
            # odd panel: plot the ground-truth labels
            y_pred = y
        else:
            # even panel: fit the clustering and plot its predictions
            y_pred = model.fit_predict(x)
        # print the evaluation metrics for this panel
        print(i, title)
        print('Homogeneity(均一性):', homogeneity_score(y, y_pred))
        print('Completeness(完整性):', completeness_score(y, y_pred))
        print('V measure:', v_measure_score(y, y_pred))
        print('AMI(调整互信息):', adjusted_mutual_info_score(y, y_pred))
        print('ARI(调整兰德指数):', adjusted_rand_score(y, y_pred))
        print('Silhouette(轮廓系数):', silhouette_score(x, y_pred), '\n')
        # 3-D scatter plot of this panel
        ax = fig.add_subplot(4, 2, i, projection= '3d')
        ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y_pred, s=10, cmap=cm, edgecolors='none')
        plt.title(title)
    plt.tight_layout(2, rect=(0, 0, 1, 0.97))  # leave room for the suptitle
    plt.suptitle('数据分布对KMeans聚类的影响', fontsize=18)
    plt.show()
评价指标输出如下:
1 原始数据
Homogeneity(均一性): 1.0
Completeness(完整性): 1.0
V measure: 1.0
AMI(调整互信息): 1.0
ARI(调整兰德指数): 1.0
Silhouette(轮廓系数): 0.585991021926362
2 KMeans++聚类
Homogeneity(均一性): 0.9755386902247056
Completeness(完整性): 0.9756970727365012
V measure: 0.9756178750526207
AMI(调整互信息): 0.9753363987158269
ARI(调整兰德指数): 0.9801537796138619
Silhouette(轮廓系数): 0.5893028393427763
3 旋转后数据
Homogeneity(均一性): 1.0
Completeness(完整性): 1.0
V measure: 1.0
AMI(调整互信息): 1.0
ARI(调整兰德指数): 1.0
Silhouette(轮廓系数): 0.4826402978130291
4 旋转后KMeans++聚类
Homogeneity(均一性): 0.750073193909673
Completeness(完整性): 0.7520308201484618
V measure: 0.751050731384886
AMI(调整互信息): 0.7480061623345577
ARI(调整兰德指数): 0.6650917159405569
Silhouette(轮廓系数): 0.5561094523058618
5 方差不相等数据
Homogeneity(均一性): 1.0
Completeness(完整性): 1.0
V measure: 1.0
AMI(调整互信息): 1.0
ARI(调整兰德指数): 1.0
Silhouette(轮廓系数): 0.44077853624904384
6 方差不相等KMeans++聚类
Homogeneity(均一性): 0.7996534861783875
Completeness(完整性): 0.8082360836131177
V measure: 0.8039218787894781
AMI(调整互信息): 0.7979960910662527
ARI(调整兰德指数): 0.7860791946016837
Silhouette(轮廓系数): 0.47746468201023695
7 数量不相等数据
Homogeneity(均一性): 1.0
Completeness(完整性): 1.0
V measure: 1.0
AMI(调整互信息): 1.0
ARI(调整兰德指数): 1.0
Silhouette(轮廓系数): 0.4350603729416132
8 数量不相等KMeans++聚类
Homogeneity(均一性): 0.9126743491313174
Completeness(完整性): 0.7091078879830717
V measure: 0.798115265191655
AMI(调整互信息): 0.7027483564436865
ARI(调整兰德指数): 0.6313988407011663
Silhouette(轮廓系数): 0.408504798563812
生成图像如下:
3.2 层次聚类
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import sklearn.datasets as ds
import warnings
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, silhouette_score


def expand(low, high, rate=0.05):
    """Widen the interval [low, high] by `rate` on each side (plot margins).

    FIX: the original script called an undefined name `extend(...)` here
    while the later sections call `expand(...)`; this reconstructs the
    missing helper (margin rate assumed 5% — TODO confirm against the
    original course code).
    """
    delta = (high - low) * rate
    return low - delta, high + delta


if __name__ == '__main__':
    warnings.filterwarnings(action='ignore', category=UserWarning)
    # Build the sample sets
    N = 400       # number of samples
    centers = 4   # number of clusters
    np.random.seed(0)  # fixed seed so every run generates the same data
    # 2-D Gaussian data set with four clusters of unequal spread
    data1, y1 = ds.make_blobs(n_samples=N, n_features=2, centers=((-1, 1), (1, 1), (1, -1), (-1, -1)),
                              cluster_std=(0.1, 0.2, 0.3, 0.4), random_state=0)
    data1 = np.array(data1)
    n_noise = int(0.1*N)             # number of noise points
    r = np.random.rand(n_noise, 2)   # 2-D uniform noise
    data_min1, data_min2 = np.min(data1, axis=0)
    data_max1, data_max2 = np.max(data1, axis=0)
    # rescale the noise to stay inside the data range
    r[:, 0] = r[:, 0] * (data_max1-data_min1) + data_min1
    r[:, 1] = r[:, 1] * (data_max2-data_min2) + data_min2
    # append the noise to the data set; noise gets its own label 4
    data1_noise = np.concatenate((data1, r), axis=0)
    y1_noise = np.concatenate((y1, [4]*n_noise))
    # two interleaving half-moon data sets
    data2, y2 = ds.make_moons(n_samples=N, noise=.05)
    data2 = np.array(data2)
    n_noise = int(0.1 * N)           # number of noise points
    r = np.random.rand(n_noise, 2)   # 2-D uniform noise
    data_min1, data_min2 = np.min(data2, axis=0)
    data_max1, data_max2 = np.max(data2, axis=0)
    # rescale the noise to stay inside the data range
    r[:, 0] = r[:, 0] * (data_max1 - data_min1) + data_min1
    r[:, 1] = r[:, 1] * (data_max2 - data_min2) + data_min2
    # append the noise; noise gets its own label 3
    data2_noise = np.concatenate((data2, r), axis=0)
    y2_noise = np.concatenate((y2, [3] * n_noise))
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    cm = mpl.colors.ListedColormap(['r', 'g', 'b', 'm', 'c'])
    plt.figure(figsize=(10, 8), facecolor='w')
    plt.cla()
    # linkage (merge) strategies to compare
    linkages = ("ward", "complete", "average")
    for index, (n_clusters, data, y) in enumerate(((4, data1, y1), (4, data1_noise, y1_noise),
                                                   (2, data2, y2), (2, data2_noise, y2_noise))):
        # first column: ground-truth labels
        plt.subplot(4, 4, 4*index+1)
        plt.scatter(data[:, 0], data[:, 1], c=y, s=12, edgecolors='k', cmap=cm)
        plt.title('Prime' + '\n'
                  + 'AMI:' + str("%.2f" % adjusted_mutual_info_score(y,y))
                  + ' ARI:' + str("%.2f" % adjusted_rand_score(y, y))
                  + ' S:' + str("%.2f" % silhouette_score(data, y)), fontsize=12)
        plt.grid(b=True, ls=':')
        data_min1, data_min2 = np.min(data, axis=0)
        data_max1, data_max2 = np.max(data, axis=0)
        plt.xlim(expand(data_min1, data_max1))  # FIX: was undefined `extend`
        plt.ylim(expand(data_min2, data_max2))
        # remaining columns: one subplot per linkage strategy
        for i, linkage in enumerate(linkages):
            # agglomerative clustering with Euclidean affinity
            ac = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=linkage)
            ac.fit(data)            # fit the clustering
            y_pred = ac.labels_     # predicted labels
            plt.subplot(4, 4, i+2+4*index)
            plt.scatter(data[:, 0], data[:, 1], c=y_pred, s=12, edgecolors='k', cmap=cm)
            plt.title(linkage + '\n'
                      + 'AMI:' + str("%.2f" % adjusted_mutual_info_score(y,y_pred))
                      + ' ARI:' + str("%.2f" % adjusted_rand_score(y, y_pred))
                      + ' S:' + str("%.2f" % silhouette_score(data, y_pred)), fontsize=12)
            plt.grid(b=True, ls=':')
            plt.xlim(expand(data_min1, data_max1))  # FIX: was undefined `extend`
            plt.ylim(expand(data_min2, data_max2))
    plt.suptitle('层次聚类的不同合并策略', fontsize=15)
    plt.tight_layout(0.5, rect=(0, 0, 1, 0.95))
    plt.show()
生成图像如下:
3.3 密度聚类
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score, silhouette_score


def expand(low, high, rate=0.05):
    """Widen the interval [low, high] by `rate` on each side (plot margins).

    FIX: the original script called `expand(...)` without defining it; this
    reconstructs the missing helper (margin rate assumed 5% — TODO confirm
    against the original course code).
    """
    delta = (high - low) * rate
    return low - delta, high + delta


if __name__ == "__main__":
    N = 1000  # number of samples
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]  # cluster centres
    # 2-D Gaussian data set around the given centres
    data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    # standardise the features (zero mean, unit variance)
    data = StandardScaler().fit_transform(data)
    # DBSCAN parameter grid: (epsilon, min_samples)
    params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(9, 7), facecolor='w')
    plt.suptitle('DBSCAN聚类', fontsize=15)
    for i in range(6):
        eps, min_samples = params[i]  # parameters for this panel
        # build and fit the DBSCAN model
        model = DBSCAN(eps=eps, min_samples=min_samples)
        model.fit(data)
        y_hat = model.labels_  # cluster labels (-1 = noise)
        # boolean mask of the core samples
        core_indices = np.zeros_like(y_hat, dtype=bool)
        core_indices[model.core_sample_indices_] = True
        # cluster statistics; -1 marks noise, so it is not counted as a cluster
        y_unique = np.unique(y_hat)
        n_clusters = y_unique.size - (1 if -1 in y_hat else 0)
        print(y_unique, '聚类簇的个数为:', n_clusters)
        plt.subplot(2, 3, i+1)
        clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
        for k, clr in zip(y_unique, clrs):
            cur = (y_hat == k)
            if k == -1:
                # noise points drawn small and black
                plt.scatter(data[cur, 0], data[cur, 1], s=10, c='k')
                continue
            plt.scatter(data[cur, 0], data[cur, 1], s=15, c=clr, edgecolors='k')
            # core samples drawn larger, as diamonds
            plt.scatter(data[cur & core_indices][:, 0], data[cur & core_indices][:, 1], s=40, c=clr, marker='d', edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)  # FIX: `expand` was undefined
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.plot()
        plt.grid(b=True, ls=':', color='#606060')
        # FIX: '\e' is an invalid escape sequence (DeprecationWarning);
        # '\\e' produces the identical runtime string.
        plt.title('$\\epsilon$ = %.1f m = %d,聚类数目:%d \n AMI: %.2f ARI: %.2f S:%.2f' %
                  (eps, min_samples, n_clusters, adjusted_mutual_info_score(y, y_hat),
                   adjusted_rand_score(y, y_hat), silhouette_score(data, y_hat)), fontsize=12)
    plt.tight_layout()
    plt.subplots_adjust(top=0.87)  # leave room for the suptitle
    plt.show()
生成图像如下:
3.4 谱聚类
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import SpectralClustering
from sklearn.metrics import euclidean_distances


def expand(low, high, rate=0.05):
    """Widen the interval [low, high] by `rate` on each side (plot margins).

    FIX: the original script called `expand(...)` without defining it; this
    reconstructs the missing helper (margin rate assumed 5% — TODO confirm
    against the original course code).
    """
    delta = (high - low) * rate
    return low - delta, high + delta


if __name__ == "__main__":
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    # concentric-circle data set: three rings of radius 1, 2 and 3
    t = np.arange(0, 2*np.pi, 0.1)
    data1 = np.vstack((np.cos(t), np.sin(t))).T
    data2 = np.vstack((2*np.cos(t), 2*np.sin(t))).T
    data3 = np.vstack((3*np.cos(t), 3*np.sin(t))).T
    data = np.vstack((data1, data2, data3))
    n_clusters = 3  # number of clusters
    # pairwise squared Euclidean distances
    m = euclidean_distances(data, squared=True)
    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle('谱聚类', fontsize=16)
    clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))
    # sweep the kernel bandwidth s over 6 values from 0.01 to 1
    for i, s in enumerate(np.logspace(-2, 0, 6)):
        # Gaussian affinity matrix (+1e-6 keeps it strictly positive).
        # NOTE(review): m already holds *squared* distances, so m ** 2 makes
        # this exp(-d^4 / s^2) rather than the usual exp(-d^2 / s^2) —
        # preserved as in the original; verify against the lecture code.
        af = np.exp(-m ** 2 / (s ** 2)) + 1e-6
        # spectral clustering on the precomputed affinity matrix
        model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed', assign_labels='kmeans', random_state=1)
        y_hat = model.fit_predict(af)
        plt.subplot(2, 3, i+1)
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0], data[cur, 1], s=40, c=clr, edgecolors='k')
        x1_min, x2_min = np.min(data, axis=0)
        x1_max, x2_max = np.max(data, axis=0)
        x1_min, x1_max = expand(x1_min, x1_max)  # FIX: `expand` was undefined
        x2_min, x2_max = expand(x2_min, x2_max)
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(b=True, ls=':', color='#808080')
        # FIX: '\s' is an invalid escape sequence (DeprecationWarning);
        # '\\s' produces the identical runtime string.
        plt.title('$\\sigma$ = %.2f' % s, fontsize=13)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)  # leave room for the suptitle
    plt.show()
生成图像如下:
4 参考
- 机器学习升级版视频 - 邹博
- 《机器学习实战》第10章 利用K-均值聚类算法对未标注数据分组
===========文档信息============
学习笔记由博主整理编辑,供非商用学习交流用
如本文涉及侵权,请随时留言博主,必妥善处置
版权声明:非商用*转载-保持署名-注明出处
署名(BY) :dkjkls(dkj卡洛斯)
文章出处:http://blog.csdn.net/dkjkls