欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

K均值聚类

程序员文章站 2022-07-14 11:41:10
...

K均值聚类,K-means clustering

作用:无监督的实现数据分类
操作方式:迭代地将距离较近的点聚集在一起,形成簇,每个簇的数据点聚为一类
过程:1)初始化质心,有几个类就需要用户设定几个质心,好的初始化质心可减少迭代次数;
           2)计算每个点到每个质心的距离(下面实验用的是欧氏距离);
           3)比较每个点到各个质心的距离,距离最小则该点聚类的类标号为该质心的标号(如下图);

                         K均值聚类
           4)这样得到K个簇,计算每个簇的均值,作为该簇的新质心;
           5)若K个新质心与K个原质心相同(没有质心改变),则聚类结束,否则,返回第2步。
以下是C++代码:
#include <iostream>
#include <vector>
#include <fstream>

using namespace std;

struct Point {
	float x = 0.;
	float y = 0.;
};

void loadData(string filename, vector<Point> &data){
	fstream file;
	file.open(filename, std::ios::in);
	if (!file.is_open())
	{
		cout << "ERROR!\n";
		exit(0);
	}
	while (!file.eof())
	{
		Point pt;
		char msg[20];
		memset(msg, 0, 20);
		file >> msg;
		pt.x = atof(msg);

		memset(msg, 0, 20);
		file >> msg;
		pt.y = atof(msg);
		data.push_back(pt);
	}
	file.close();
}

void initCenter(const vector<Point> &data, const int class_num, vector<Point> ¢er){
	center.resize(class_num);
	float max_x = -1000,min_x = 1000., y_aver = 0.;
	for (int i = 0; i < data.size(); i++)
	{
		max_x = max_x > data[i].x ? max_x : data[i].x;
		min_x = min_x < data[i].x ? min_x : data[i].x;
		y_aver += data[i].y / data.size();
	}
	for (int i = 0 ; i < class_num; i++)
	{
		center[i].x = min_x + (max_x - min_x) / class_num * i;
		center[i].y = y_aver;
	}
}

bool centerIsChanged(const vector<Point> &c1, const vector<Point> &c2){
	if (c1.size() != c2.size())
		return true;
	for (int i = 0; i < c1.size(); i++)
	{
		if (c1[i].x != c2[i].x || c1[i].y != c2[i].y)
			return true;
	}
	return false;
}

vector<vector<Point>> kmeansCluster(const vector<Point> &data, vector<Point> ¢er){
	vector<vector<float>> data_distance;
	// 计算各数据点到各个质心的距离
	for (int i = 0; i < center.size(); i++)
	{
		vector<float> distance_i;
		for (int j = 0; j < data.size(); j++)
		{
			distance_i.push_back(
				sqrt(pow(data[j].x - center[i].x, 2) + pow(data[j].y - center[i].y, 2)));
		}
		data_distance.push_back(distance_i);
	}
	vector<vector<Point>> cluster(center.size());//存储每个质心对应的簇
	for (int i = 0; i < data.size(); i++)
	{
		int min_index = 0;
		float min = data_distance[0][i];
		for (int j = 0; j < center.size(); j++)
		{
			if (data_distance[j][i] < min)
			{
				min = data_distance[j][i];
				min_index = j;//得到距离最小的类编号
			}
		}
		cluster[min_index].push_back(data[i]);
	}

	//根据聚类结果更新质心位置
	for (int i = 0; i < center.size(); i++)
	{
		center[i].x = 0;
		center[i].y = 0;
		for (int j = 0; j < cluster[i].size(); j++)
		{
			center[i].x += cluster[i][j].x / cluster[i].size();
			center[i].y += cluster[i][j].y / cluster[i].size();
		}
	}
	return cluster;
}

int main(){
	vector<Point> data;
	data.clear();
	loadData("data.txt", data);
	vector<Point> center(3);
	initCenter(data, 3, center);
	
	while (1)
	{
		vector<Point> center_temp;
		center_temp = center;
		kmeansCluster(data, center);
		if (!centerIsChanged(center, center_temp))
			break;
		cout << "迭代结果:\n";
		for (int i = 0; i < center.size(); i++)
			cout << center[i].x << "   " << center[i].y << endl;
	}

	/*while (1)
	{
		vector<vector<float>> distance_value;
		for (int i = 0; i < 3; i++)
		{
			vector<float> distance;
			for (int j = 0; j < data.size(); j++)
			{
				float dis = sqrt(pow(data[j].x - center[i].x, 2)
					+ pow(data[j].y - center[i].y, 2));
				distance.push_back(dis);
			}
			distance_value.push_back(distance);
		}
		vector<vector<Point>> cluster(3);
		for (int i = 0; i < data.size(); i++)
		{
			if (distance_value[0][i] < distance_value[1][i])
			{
				if (distance_value[0][i] < distance_value[2][i])
					cluster[0].push_back(data[i]);
			}
			else if (distance_value[1][i] < distance_value[2][i])
				cluster[1].push_back(data[i]);
			else
				cluster[2].push_back(data[i]);
		}
		vector<Point> new_center(3);
		for (int i = 0; i < 3; i++)
		{
			float x_aver = 0.;
			float y_aver = 0.;
			for (int j = 0; j < cluster[i].size(); j++)
			{
				x_aver += cluster[i][j].x / cluster[i].size();
				y_aver += cluster[i][j].y / cluster[i].size();
			}
			new_center[i].x = x_aver;
			new_center[i].y = y_aver;
		}
		int flag = 0;
		cout << "迭代结果:\n"; 
		for (int i = 0; i < 3; i++)
		{
			if (new_center[i].x == center[i].x && new_center[i].y == center[i].y)
			{
				flag ++;
			}
			center[i].x = new_center[i].x;
			center[i].y = new_center[i].y;
			cout << center[i].x << "   " << center[i].y << endl;
		}
		if (flag == 3)
			break;
	}*/
	
	return 0;
}
聚类结果为
          K均值聚类
迭代结果表示在散点图上为,红色点为聚类后的质心
           K均值聚类