
Hierarchical Clustering and K-Means Clustering


Case Study

Suppose you have a collection of blog data. Cluster the blogs by their content, using both hierarchical clustering and K-means clustering to see how the blogs group together.

Suppose you have already crawled the relevant data through an API and arranged it into the following format:

blogname,china,kids,music,yahoo,search,engine,google,operating,system,python
Read/WriteWeb,5,20,15,0,8,5,1,30,4,5
Search Engine ,1,0,0,3,10,18,8,2,5,1
Operating System,5,0,1,0,7,10,30,12,7,3
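
If you have not yet built such a word-count matrix, a minimal sketch of how it could be assembled is shown below; the blog_texts dictionary and the vocab list are hypothetical stand-ins for whatever your crawler or API client actually returns.

import re
from collections import Counter

# Hypothetical crawled content: blog name -> raw text of its posts
blog_texts = {
	'Read/WriteWeb': 'google kids music china operating system ...',
	'Search Engine': 'search engine yahoo google ...',
}
# Columns of the matrix (hypothetical vocabulary)
vocab = ['china','kids','music','yahoo','search','engine','google','operating','system','python']

with open('blogdata_sample.txt','w',encoding='utf-8') as f:
	f.write('blogname,' + ','.join(vocab) + '\n')
	for name,text in blog_texts.items():
		counts = Counter(re.findall(r'[a-z]+', text.lower()))  # word counts for this blog
		f.write(name + ',' + ','.join(str(counts[w]) for w in vocab) + '\n')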

Logic Flow Diagram

[Figure: flow diagram of the hierarchical clustering and K-means clustering steps]

Python Code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time: 2019-11-17 16:12
#@Author: gaoll

import time
import random
import math
import pandas as pd 
from pandas import Series,DataFrame
import numpy as np
import matplotlib.pyplot as plt


# Distance metric: the closer the value is to 0, the more similar the two vectors.
# Pearson correlation distance, i.e. the cosine formula applied to mean-centred vectors.
def pearson(v1,v2):
	N = len(v1)
	# Simple sums
	sum1 = sum(v1)
	sum2 = sum(v2)
	# Sums of squares
	sum1sq = sum([np.power(v,2) for v in v1])
	sum2sq = sum([np.power(v,2) for v in v2])
	# Sum of products
	pdsum = sum([v1[i]*v2[i] for i in range(N)])
	# Pearson correlation coefficient
	num = pdsum - (sum1*sum2)/N
	den = math.sqrt((sum1sq-np.power(sum1,2)/N) * (sum2sq-np.power(sum2,2)/N))

	if den == 0:
		return 0
	return 1 - num/den
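
# Optional sanity check (added remark, not in the original post): with numpy the same
# distance can be computed as 1 - np.corrcoef(v1, v2)[0, 1], up to floating-point
# rounding, provided both vectors have non-zero variance.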

# A 'cluster' node type
class bicluster:
	"""A cluster node: its vector vec, the two merged sub-clusters left and right, and their distance"""
	def __init__(self,vec,left=None,right=None,distance=0.0,id=None):
		self.vec = vec
		self.left = left
		self.right = right
		self.distance = distance
		self.id = id

# Hierarchical (agglomerative) clustering
def hcluster(rows,distance=pearson):
	# Cache of distances already computed, keyed by the pair of cluster ids
	distance_dict = {}
	currentclustid = -1
	# Initial clusters
	cluster = [bicluster(rows[i],id=i) for i in range(len(rows))] # before clustering starts, every row vector is its own cluster; id is the row number
	# Main loop: keep merging the closest pair until a single cluster is left
	while len(cluster)>1:
		# Track the best (closest) pair of clusters
		best_cluster = (0,1)
		lowest_distance = distance(cluster[0].vec,cluster[1].vec)

		for i in range(len(cluster)):
			for j in range(i+1,len(cluster)):
				# Compute the distance and cache it under the pair of cluster ids
				if (cluster[i].id,cluster[j].id) not in distance_dict:
					d = distance(cluster[i].vec,cluster[j].vec)
					distance_dict[(cluster[i].id,cluster[j].id)] = d
				else:
					d = distance_dict[(cluster[i].id,cluster[j].id)]
				if d < lowest_distance:
					lowest_distance = d
					best_cluster = (i,j)

		left_clust = cluster[best_cluster[0]]
		right_clust = cluster[best_cluster[1]]
		new_vec = [ (left_clust.vec[i]+right_clust.vec[i])/2 for i in range(len(left_clust.vec))] # vector of the merged cluster (element-wise average)
		new_clust=bicluster(vec=new_vec,left=left_clust,right=right_clust,distance=lowest_distance,id=currentclustid)
		del cluster[best_cluster[1]] # remove the two clusters that were merged
		del cluster[best_cluster[0]] # note: delete the larger index best_cluster[1] first, then the smaller best_cluster[0]; the reverse order would shift indices and delete the wrong item (or go out of range)
		cluster.append(new_clust)  # append the merged cluster
		currentclustid -=1
	return cluster[0]
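
# Small helper (added, not part of the original post): dump the cluster tree as indented
# text, which is handy for a quick check before drawing the dendrogram.
def printclust(clust,labels=None,n=0):
	print(' '*n,end='')
	if clust.id<0:
		print('-')  # negative id: a merged branch node
	elif labels==None:
		print(clust.id)  # leaf node, no labels given: print the row id
	else:
		print(labels[clust.id])  # leaf node: print its label
	if clust.left!=None: printclust(clust.left,labels=labels,n=n+1)
	if clust.right!=None: printclust(clust.right,labels=labels,n=n+1)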

# Draw a dendrogram of the hierarchical clustering.
# Total height of the tree
def getheight(clust):
	# A leaf node has height 1; a branch node's height is the sum of its two branches' heights
	if clust.left==None and clust.right==None:
		return 1
	return getheight(clust.left) + getheight(clust.right)

# Total depth (horizontal extent) of the tree, i.e. the largest cumulative distance in the hierarchy
def getdepth(clust):
	# A leaf node has depth 0; a branch node's depth is its own distance plus the depth of its deeper branch
	if clust.left==None and clust.right==None:
		return 0
	return max(getdepth(clust.left), getdepth(clust.right)) + clust.distance

# Drawing a node at (x,y): (1) draw a vertical line spanning the two branches, from (x, y-getheight(left)/2) up to (x, y+getheight(right)/2);
# (2) draw horizontal lines of length clust.distance*scaling from that vertical line out to the two branches and recurse into them; leaf nodes just print their label text.
# In case the overall distances are too large or too small, every distance is multiplied by a scaling factor before drawing.

def drawnode(fig,clust,x,y,scaling,labels):
	#if clust.left == None and clust.right==None:
	if clust.id >=0:
		# Leaf node: print its label
		label = labels[clust.id]
		fig.text(x,y,label)
	else:
		# Branch node: draw the vertical line and the horizontal lines out to the left/right branches, then draw the branches
		h1 = getheight(clust.left)
		h2 = getheight(clust.right)
		top_y = y + h2/2
		bottom_y = y - h1/2
		# Vertical line
		fig.plot([x,x],[bottom_y,top_y])
		# Horizontal lines representing the merge distance
		d = clust.distance * scaling
		fig.plot([x,x+d],[top_y,top_y])
		fig.plot([x,x+d],[bottom_y,bottom_y])
		# Draw the branches
		drawnode(fig,clust.left,x+d,top_y,scaling,labels)
		drawnode(fig,clust.right,x+d,bottom_y,scaling,labels)

def drawdendrogram(clust,labels,tree_jpg='clusters_dendrogram.jpg'):
	width = 5 + 5
	height = getheight(clust) +5
	depth = getdepth(clust)
	scaling = float((width-5)/depth)

	fig = plt.figure(figsize=(width+3, height+3))
	ax = fig.add_subplot(1,1,1)
	ax.set_title('clusters dendrogram')
	ax.plot([0.1,0.5],[height/2,height/2])
	drawnode(ax,clust,0.5,height/2,scaling,labels) # start drawing from the root node

	fig.savefig(tree_jpg)
	plt.show()

# K-means clustering
def kcluster(rows,distance=pearson,k=3):
	N = len(rows[0])
	M = len(rows)
	# Minimum and maximum value of each column
	domain=[(0,0) for i in range(N)]
	for i in range(N):
		tmp_list =[]
		for row in rows:
			tmp_list.append(row[i])
		domain[i] = (min(tmp_list),max(tmp_list))

	# k randomly placed centroids, each coordinate drawn from the corresponding column's range
	clusters = [[domain[j][0] + random.random()*(domain[j][1]-domain[j][0]) for j in range(N)] for i in range(k)]

	# Assign every row to the nearest centroid, then move the centroids, for at most 100 iterations
	lastmatches = None
	for t in range(100):
		print('Iteration %d'%t) # print the iteration number
		bestmatches = [[] for i in range(k)] # best assignment found in this iteration
		for j in range(M):
			row = rows[j]
			bestmatch = 0
			best_distance = distance(row,clusters[0])
			for i in range(1,k):
				dist= distance(row,clusters[i]) # distance from this row to each centroid
				if dist < best_distance:
					bestmatch = i
					best_distance = dist
			bestmatches[bestmatch].append(j)  # row j joins its closest cluster

		# Stop as soon as the assignment no longer changes
		if bestmatches == lastmatches:
			break
		lastmatches = bestmatches
		# Move each centroid to the average of the rows assigned to it
		for i in range(k):
			avgs=[0]*N
			cnt = len(bestmatches[i])
			if cnt>0:
				for rowid in bestmatches[i]:
					for j in range(N):
						avgs[j] +=rows[rowid][j]
				for j in range(N):
					avgs[j] = avgs[j]/cnt
				clusters[i] = avgs  # an empty cluster keeps its previous centroid (avoids division by zero)
	return bestmatches
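
# Added remark: the centroids are initialised at random positions, so repeated runs can
# produce different groupings; for a reproducible run, call random.seed() with a fixed
# value before kcluster, or keep the best assignment over several restarts.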


if __name__ == '__main__':

	# Load the data
	filename = 'blogdata_sample.txt'
	df = pd.read_csv(filename,sep=',',header=0,index_col=0,na_filter=False,skip_blank_lines=True,encoding='utf-8') #nrows=3
	blognames,words = df.index.tolist(),df.columns.tolist()
	data = []
	for i in range(len(blognames)):
		p = df.iloc[i].tolist()
		data.append(p)

	# Hierarchical clustering and the dendrogram
	clust=hcluster(data)
	drawdendrogram(clust,labels=blognames)

	# K-means clustering
	k = 3
	kclust=kcluster(data,k=k)
	print(kclust)
	print([[blognames[id] for id in kclust[i]] for i in range(k)])

Results

#1. Hierarchical clustering dendrogram

[Figure: dendrogram of the blog clusters]

#2. K-means clustering
print(kclust)
[[2], [0, 1, 3, 5, 6], [4]]
#print([[blognames[id] for id in kclust[i]] for i in range(k)])
[['Read/WriteWeb'], ["John Battelle's Searchblog", 'Search Engine Watch Blog', 'Official Google Blog', 'Google Blogoscoped', 'Google Operating System'], ['Search Engine Roundtable']]
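
As a rough cross-check (not part of the original post), the same matrix can be fed to library implementations. The sketch below assumes scipy and scikit-learn are installed and reuses the data and blognames variables loaded in the script above; the exact grouping will not necessarily match, since the merge rule and the centroid initialisation differ.

from scipy.cluster.hierarchy import linkage, dendrogram
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Hierarchical clustering with average linkage and correlation distance
Z = linkage(data, method='average', metric='correlation')
dendrogram(Z, labels=blognames, orientation='left')
plt.show()

# K-means with k=3; labels_km[i] is the cluster index assigned to blog i
labels_km = KMeans(n_clusters=3, n_init=10).fit_predict(data)
print(labels_km)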

Done