模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法
程序员文章站
2022-05-20 19:34:35
...
题目:本次作业的实验要求是使用分解聚类法与 c-means(即 K-means)聚类法对 IRIS 数据集进行聚类。K-means 聚类代码摘录自网络,分解聚类法为原创。PS:因为时间紧,分解聚类法在进行第二次分解时偷懒了——只对第一次分解中分出去的那一类(标签 1)继续拆分,没有比较应该拆分哪一类,有缘人改改吧。
数据格式:(从下面的代码可知)data.txt 每行为一个样本,由空格分隔的 4 个浮点数特征组成;共 150 行,并按类别顺序排列(每类 50 行),代码中的准确度统计依赖这一顺序。
kmeans代码:
import math
from collections import defaultdict
import numpy as np
dataname = "data.txt"
def loadIRISdata(filename):
    """Load whitespace-separated numeric samples from a text file.

    Every non-blank line is parsed as one sample: its whitespace-separated
    fields are each converted to ``float``.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of samples in file order, each sample being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field is not a valid float literal.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            # split() without an argument tolerates tabs, repeated spaces and
            # trailing whitespace, and yields [] for blank lines, whereas
            # split(" ") produced empty fields that float() rejects.
            fields = line.split()
            if not fields:
                continue
            data.append([float(field) for field in fields])
    return data
def generateCenters(data):
    """Pick the initial cluster centers.

    The samples at indices 0, 50 and 100 serve as the three starting
    centers. The returned list holds the same row objects as ``data``;
    nothing is copied.
    """
    seed_rows = (0, 50, 100)
    return [data[row] for row in seed_rows]
def distance(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    Only the first four components of each vector are used.
    """
    squared_total = 0
    for axis in range(4):
        gap = a[axis] - b[axis]
        squared_total += gap * gap
    return math.sqrt(squared_total)
def point_avg(points):
    """Return the per-dimension mean of ``points``.

    The first four components of every point are averaged and each mean is
    rounded to eight decimal places.
    """
    count = float(len(points))
    center = []
    for axis in range(4):
        running = 0
        for sample in points:
            running += sample[axis]
        # Round through an 8-decimal string, exactly as the centers are
        # expected to be compared/printed downstream.
        center.append(float("%.8f" % (running / count)))
    return center
def updataCenters(data, assigments):
    """Recompute the three cluster centers from the current labels.

    Samples are grouped by their label in ``assigments`` (aligned with
    ``data``); the center of clusters 0, 1 and 2 is then the ``point_avg``
    of each group, returned in that order.
    """
    grouped = {}
    for label, sample in zip(assigments, data):
        grouped.setdefault(label, []).append(sample)
    # A cluster with no members is passed on as an empty list, as before.
    return [point_avg(grouped.get(cluster, [])) for cluster in range(3)]
def assignment(data, centers):
    """Label every sample with the index of its nearest center.

    Distances are measured with ``distance`` against centers 0, 1 and 2.
    On a tie the lowest index wins because only a strictly smaller distance
    replaces the current best. The returned list is aligned with ``data``.
    """
    labels = []
    for sample in data:
        nearest = 0
        best = float('inf')
        for idx in range(3):
            gap = distance(sample, centers[idx])
            if gap < best:
                nearest, best = idx, gap
        labels.append(nearest)
    return labels
def kmeans(data):
    """Cluster ``data`` into three groups and return ``(label, sample)`` pairs.

    Starting from the centers chosen by ``generateCenters``, the centers are
    recomputed and the samples relabelled until a relabelling pass leaves
    every label unchanged. At least one recompute/relabel pass always runs.
    """
    labels = assignment(data, generateCenters(data))
    while True:
        centers = updataCenters(data, labels)
        relabeled = assignment(data, centers)
        converged = relabeled == labels
        labels = relabeled
        if converged:
            break
    return list(zip(labels, data))
def acc(result):
    """Count samples whose cluster label matches their position-based class.

    Entries 0-49 of ``result`` are expected to carry label 0, entries 50-99
    label 1 and entries 100-149 label 2 (the label is element 0 of each
    entry). The tally is printed and returned as ``(matches, checked)``.
    """
    hits = 0
    checked = 0
    for expected in range(3):
        for offset in range(50):
            if result[expected * 50 + offset][0] == expected:
                hits += 1
            checked += 1
    print('sum:', hits, 'all:', checked)
    return hits, checked
if __name__ == "__main__":
    # Script entry point: cluster the data set with K-means, list the sample
    # indices of each cluster, then report the position-based accuracy.
    data = loadIRISdata(dataname)
    result = kmeans(data)
    for i in range(3):
        tag = 0  # number of indices printed on the current output row
        print('\n')
        print("第%d类数据有:" % (i+1))
        for index, (label, _) in enumerate(result):
            if label == i:
                print(index, end=' ')
                tag += 1
                # Break the row after 21 indices to keep the listing readable.
                if tag > 20:
                    print('\n')
                    tag = 0
    print('\n')
    correct, total = acc(result)
    # '%.2f' (two decimals) instead of '%2f', which only set a minimum width
    # of 2 and therefore printed six decimals.
    print('c-means准确度为:%.2f%%' % ((correct / total) * 100))
kmeans结果:
分解聚类代码:
import math
from collections import defaultdict
import numpy as np
dataname = "data.txt"
def loadIRISdata(filename):
    """Load whitespace-separated numeric samples from a text file.

    Every non-blank line is parsed as one sample: its whitespace-separated
    fields are each converted to ``float``.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of samples in file order, each sample being a list of floats.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field is not a valid float literal.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            # split() without an argument tolerates tabs, repeated spaces and
            # trailing whitespace, and yields [] for blank lines, whereas
            # split(" ") produced empty fields that float() rejects.
            fields = line.split()
            if not fields:
                continue
            data.append([float(field) for field in fields])
    return data
def E(N, N1, N2, a, b):
    """Return the split criterion ``(N1 * N2 / N) * ||a - b||^2``.

    Args:
        N: Sample count used as the normaliser.
        N1: Number of samples in the first group.
        N2: Number of samples in the second group.
        a: Mean vector of the first group (array-like).
        b: Mean vector of the second group (array-like).

    Returns:
        The criterion value as a plain Python ``float``.

    Raises:
        ZeroDivisionError: If ``N`` is zero.
    """
    diff = np.asarray(a, dtype=float) - np.asarray(b, dtype=float)
    # A 1-D dot product yields a scalar directly; this avoids np.matrix and
    # the float() conversion of a 1x1 two-dimensional result.
    return float(np.dot((N1 * N2) / N * diff, diff))
def avg(data, k, assignments):
    """Return the per-feature mean of the samples labelled ``k``.

    Args:
        data: Sequence of samples (each a sequence of numbers).
        k: Cluster label to select.
        assignments: Labels aligned with ``data``.

    Returns:
        The element-wise mean of the selected samples as computed by NumPy.
        If no sample carries label ``k`` the division is by a zero count, so
        callers should not rely on the result in that case.
    """
    # Pair samples with labels instead of indexing a fixed 150 positions, so
    # the function works for any data set size.
    members = [row for row, label in zip(data, assignments) if label == k]
    return np.sum(members, 0) / len(members)
def length(k, assignments):
    """Return how many entries of ``assignments`` equal the label ``k``.

    Args:
        k: Cluster label to count.
        assignments: Iterable of cluster labels.

    Returns:
        The number of matching labels as an ``int``.
    """
    # Count over the whole label list rather than a fixed 150 positions.
    return sum(1 for label in assignments if label == k)
def _split_cluster(data, assignments, source, target, label):
    """Greedily move samples from cluster ``source`` into cluster ``target``.

    Each pass tentatively moves every remaining ``source`` sample to
    ``target`` in turn, scores the resulting partition with ``E`` and restores
    the sample. The sample that produced the largest score exceeding the best
    score seen so far (across all passes) is then moved permanently. The loop
    ends when a pass finds no such sample. ``assignments`` is modified in
    place; ``label`` prefixes the progress line printed for each new maximum.
    """
    total = len(assignments)
    best_score = float('-inf')
    while True:
        best_index = None
        for i in range(total):
            if assignments[i] != source:
                continue
            assignments[i] = target  # tentative move
            # The target cluster holds at least sample i here, so its mean is
            # always defined and needs no empty-cluster fallback.
            mean_source = avg(data, source, assignments)
            mean_target = avg(data, target, assignments)
            score = E(total,
                      length(source, assignments),
                      length(target, assignments),
                      mean_source, mean_target)
            if score > best_score:
                best_index = i
                best_score = score
                print(label, best_score)
            assignments[i] = source  # undo the tentative move
        if best_index is None:
            # No move improved the criterion: it has reached its extremum.
            break
        assignments[best_index] = target


def decomposition_clustering(data, assignments):
    """Split the samples into three clusters by two successive divisions.

    First, samples are moved from cluster 0 into cluster 1 while the
    criterion ``E`` keeps increasing. Then the same procedure moves samples
    from cluster 1 into cluster 2; cluster 0 is not reconsidered in this
    second division.

    Args:
        data: Sequence of samples, aligned with ``assignments``.
        assignments: Mutable list of cluster labels, modified in place.

    Returns:
        The same ``assignments`` list after both divisions.
    """
    _split_cluster(data, assignments, 0, 1, 'max_1:')
    _split_cluster(data, assignments, 1, 2, 'max_2:')
    return assignments
def acc(result):
    """Tally how many samples landed in the cluster implied by their index.

    The label is element 0 of each entry of ``result``. Entry ``n`` (for
    ``n`` in 0..149) counts as correct when its label equals ``n // 50``.
    Prints the tally and returns it as ``(matches, checked)``.
    """
    matches = 0
    checked = 0
    for position in range(150):
        wanted = position // 50
        if result[position][0] == wanted:
            matches += 1
        checked += 1
    print('sum:', matches, 'all:', checked)
    return matches, checked
if __name__ == "__main__":
    # Script entry point: run the divisive clustering starting with every
    # sample in cluster 0, list the sample indices of each cluster, then
    # report the position-based accuracy.
    data = loadIRISdata(dataname)
    assignments = [0] * 150
    answer = decomposition_clustering(data, assignments)
    result = list(zip(answer, data))
    for i in range(3):
        tag = 0  # number of indices printed on the current output row
        print('\n')
        print("第%d类数据有:" % (i+1))
        for index, (label, _) in enumerate(result):
            if label == i:
                print(index, end=' ')
                tag += 1
                # Break the row after 21 indices to keep the listing readable.
                if tag > 20:
                    print('\n')
                    tag = 0
    print('\n')
    correct, total = acc(result)
    # '%.2f' (two decimals) instead of '%2f', which only set a minimum width
    # of 2 and therefore printed six decimals.
    print('分解聚类法准确度为:%.2f%%' % ((correct / total) * 100))
分解聚类结果: