KMeans聚类算法
程序员文章站
2022-03-10 10:29:49
...
一、fit_predict()
def fit_predict(self, X, y=None):
    """Fit the model on X and return the cluster label of every sample.

    Shorthand for calling ``fit(X)`` and then reading ``labels_`` from the
    object that ``fit`` returns. ``y`` is accepted but not used here.
    """
    fitted = self.fit(X)
    return fitted.labels_
def fit_predict(data)
data=文件所有数据
返回每个样本所属簇的编号(标签):标签值就是该样本被划分到的簇的索引。
def fit(self, X, y=None):
"""Compute k-means clustering."""
fit:对 X 执行 k-means 聚类,计算各个簇的中心。
def predict(self, X):
"""Predict the closest cluster each sample in X belongs to."""
predict:预测 X 中每个样本所属的最近的簇,返回对应簇的索引。
***KMeans计算方法:
默认采用欧氏距离:
// Euclidean distance between two tuples.
// Reads elements at indices 1..dimNum of each tuple; index 0 is not used
// by this function. dimNum is defined outside this block.
double getDistXY(const Tuple& t1, const Tuple& t2)
{
    double squaredSum = 0;
    int dim = 1;
    while (dim <= dimNum)
    {
        // Accumulate the squared per-dimension difference.
        squaredSum += (t1[dim] - t2[dim]) * (t1[dim] - t2[dim]);
        ++dim;
    }
    return sqrt(squaredSum);
}
kMeans代码解析笔记
import numpy as np
from sklearn.cluster import KMeans
def loadData(fileName):
    """Read a comma-separated text file of named numeric rows.

    Each non-blank line is expected to look like ``name,v1,v2,...``: the
    first field is kept as a label and every remaining field is parsed
    with ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A ``(data, cityName)`` tuple. ``data`` is a list of lists of floats
        (one inner list per line) and ``cityName`` is the list of first
        fields, both in file order.

    Raises:
        ValueError: If a field after the first cannot be parsed as a float.
    """
    cityName = []
    data = []
    # Read-only mode is sufficient (no write permission needed), and the
    # context manager closes the handle even if parsing fails part-way.
    with open(fileName, 'r') as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at EOF) so they
                # do not produce an empty name with an empty feature row.
                continue
            items = line.split(',')
            cityName.append(items[0])
            # Everything after the name becomes one row of the 2-D data list.
            data.append([float(value) for value in items[1:]])
    return data, cityName
if __name__ == "__main__":
    # Single source of truth for the number of clusters, so the model, the
    # grouping buckets and the report loop cannot drift apart.
    n_clusters = 4

    # Load the feature rows and the matching city names.
    data, cityName = loadData('city.txt')

    km = KMeans(n_clusters=n_clusters)
    # label[i] is the index of the cluster that row i of data was assigned to.
    label = km.fit_predict(data)

    # Sum each cluster centre across its columns: one total per cluster
    # (not one per city).
    expenses = np.sum(km.cluster_centers_, axis=1)
    print(expenses)

    # Group the city names by the cluster index they were assigned.
    CityCluster = [[] for _ in range(n_clusters)]
    for name, cluster in zip(cityName, label):
        CityCluster[cluster].append(name)

    # Report each cluster's total followed by its member cities.
    for expense, cities in zip(expenses, CityCluster):
        print('Expense:%.2f' % expense)
        print(cities)