欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

程序员文章站 2022-05-20 19:34:35
...

题目:本次作业的实验需求是使用分解聚类法与 C-means(即 K-means)聚类法对 IRIS 数据集进行聚类。K-means 聚类代码摘录自网上,分解聚类法为纯原创。PS:因为时间紧,分解聚类法在进行第二次分解时偷懒了(直接对第一次分出的那一类继续分解,没有比较应该分解哪一类)~~有缘人改改吧~~

数据格式:

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

kmeans代码:

import math
from collections import defaultdict
import numpy as np
# Path of the data file; the script entry point passes it to loadIRISdata().
dataname = "data.txt"
def loadIRISdata(filename):
    """Load whitespace-separated numeric rows from a text file.

    Each non-blank line is split on any run of whitespace (spaces or tabs)
    and every field is converted to ``float``.  Empty and whitespace-only
    lines are skipped.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list holding one list of floats per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            # split() without an argument tolerates repeated/trailing
            # separators, which split(" ") turned into unparsable fields.
            fields = line.split()
            if not fields:
                continue
            data.append([float(field) for field in fields])
    return data

def generateCenters(data):
    """Choose the initial cluster centres.

    The rows at positions 0, 50 and 100 of *data* are used as the three
    starting centres.  The returned list refers to those same row objects;
    nothing is copied.
    """
    # One seed row per cluster.
    seed_positions = (0, 50, 100)
    return [data[position] for position in seed_positions]

def distance(a ,b):
    """Return the Euclidean distance between *a* and *b*.

    Only the first four components of each vector are compared.
    """
    squared_total = 0
    for axis in range(4):
        delta = a[axis] - b[axis]
        squared_total += delta * delta
    return math.sqrt(squared_total)

def point_avg(points):
    """Return the per-dimension mean of *points*.

    The mean is taken over the first four components of every point and each
    component of the result is rounded to eight decimal places.
    """
    count = float(len(points))
    centroid = []
    for axis in range(4):
        running = 0
        for vector in points:
            running += vector[axis]
        # Round-trip through a string to keep exactly eight decimals.
        centroid.append(float("%.8f" % (running / count)))
    return centroid

def updataCenters(data, assigments):
    """Recompute the three cluster centres from the current labelling.

    Args:
        data: The sample vectors.
        assigments: Cluster label of each sample, aligned with *data*.

    Returns:
        A list with the mean vector of clusters 0, 1 and 2, in that order.
    """
    # Gather the samples that share a label.
    members_by_label = defaultdict(list)
    for label, sample in zip(assigments, data):
        members_by_label[label].append(sample)
    return [point_avg(members_by_label[label]) for label in range(3)]

def assignment(data, centers):
    """Label every sample with the index of its nearest centre.

    Args:
        data: The sample vectors.
        centers: The three current cluster centres.

    Returns:
        A list of labels (0, 1 or 2) aligned with *data*.  On equal
        distances the lowest centre index is kept.
    """
    labels = []
    for sample in data:
        nearest_index, nearest_distance = 0, float('inf')
        # Compare against each of the three centres; strictly smaller wins.
        for index in range(3):
            candidate = distance(sample, centers[index])
            if candidate < nearest_distance:
                nearest_index, nearest_distance = index, candidate
        labels.append(nearest_index)
    return labels

def kmeans(data):
    """Cluster *data* into three groups with the k-means iteration.

    Starting from the centres chosen by generateCenters(), samples are
    relabelled and centres recomputed until the labelling stops changing.

    Returns:
        A list of ``(label, sample)`` tuples in the order of *data*.
    """
    centers = generateCenters(data)
    current = assignment(data, centers)
    previous = None
    while current != previous:
        centers = updataCenters(data, current)
        # Keep the old labelling to detect convergence on the next check.
        previous, current = current, assignment(data, centers)
    return list(zip(current, data))

def acc(result):
    """Count how many of the 150 samples carry the expected label.

    Samples 0-49 are expected to be labelled 0, samples 50-99 labelled 1 and
    samples 100-149 labelled 2.  The counts are also printed.

    Args:
        result: Sequence of ``(label, sample)`` pairs.

    Returns:
        A ``(matches, examined)`` tuple.
    """
    matches = 0
    examined = 0
    for expected in range(3):
        for offset in range(50):
            if result[expected * 50 + offset][0] == expected:
                matches += 1
            examined += 1
    print('sum:', matches, 'all:', examined)
    return matches, examined

if __name__ == "__main__":
    # Cluster the data set, list the members of each cluster, then report
    # how many samples received the expected label.
    data = loadIRISdata(dataname)
    result = kmeans(data)
    for i in range(3):
        tag = 0  # number of indices printed on the current output line
        print('\n')
        print("第%d类数据有:" % (i+1))
        for index, (label, _sample) in enumerate(result):
            if label == i:
                print(index, end=' ')
                tag += 1
            if tag > 20:
                # Wrap the listing once 21 indices have been printed.
                print('\n')
                tag = 0
    print('\n')
    correct, total = acc(result)
    # "%.2f" prints two decimals; the former "%2f" only set a minimum width.
    print('c-means准确度为:%.2f%%' % ((correct/total)*100))

kmeans结果:

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

分解聚类代码:

import math
from collections import defaultdict
import numpy as np
# Path of the data file; the script entry point passes it to loadIRISdata().
dataname = "data.txt"
def loadIRISdata(filename):
    """Read a text file of whitespace-separated numbers into a list of rows.

    Lines are split on any run of whitespace, so repeated spaces, tabs and
    trailing blanks are accepted.  Lines without any field are ignored.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A list of rows, each a list of floats, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    data = []
    with open(filename, mode="r", encoding="utf-8") as rf:
        for line in rf:
            # split(" ") produced empty/newline-only fields on irregular
            # spacing and made float() fail; plain split() does not.
            fields = line.split()
            if fields:
                data.append([float(field) for field in fields])
    return data
def E(N, N1, N2, a, b):
    """Return the split criterion ``(N1 * N2 / N) * ||a - b||^2``.

    Args:
        N: Total number of samples.
        N1: Number of samples in the first group.
        N2: Number of samples in the second group.
        a: Mean vector of the first group (array-like).
        b: Mean vector of the second group (array-like).

    Returns:
        The criterion value as a plain Python ``float``.

    Raises:
        ZeroDivisionError: If ``N`` is the integer 0.
    """
    # Work on a flat float vector: avoids np.matrix and the deprecated
    # conversion of a 1x1 matrix to a scalar.
    diff = np.ravel(np.asarray(a, dtype=float) - np.asarray(b, dtype=float))
    return float(((N1 * N2) / N) * np.dot(diff, diff))
def avg(data, k, assignments):
    """Return the mean vector of the samples labelled *k*.

    Works for any number of samples; *data* and *assignments* are walked in
    lockstep.

    Args:
        data: The sample vectors.
        k: The class label to average.
        assignments: Class label of each sample, aligned with *data*.

    Returns:
        A NumPy array with the component-wise mean of the matching samples,
        or a scalar NaN when no sample carries label *k*.
    """
    members = [sample for sample, label in zip(data, assignments) if label == k]
    if not members:
        # An empty class has no mean; return NaN explicitly instead of
        # relying on a 0/0 division warning to produce it.
        return np.float64(np.nan)
    return np.sum(members, 0) / len(members)
def length(k, assignments):
    """Return how many entries of *assignments* equal the label *k*.

    Works for a labelling of any length.
    """
    return sum(1 for label in assignments if label == k)
def _split_class(data, assignments, source, target, label):
    """Greedily move samples from class *source* to class *target*.

    In every pass each sample of *source* is tentatively moved to *target*
    and the criterion E is evaluated.  The move that raises E above the best
    value seen so far is made permanent; the process stops after a pass in
    which no move improves E.  *assignments* is modified in place and each
    new best value is printed after *label*.
    """
    total = len(assignments)
    best = float('-inf')
    while True:
        best_index = None
        for i in range(total):
            if assignments[i] != source:
                continue
            assignments[i] = target  # tentative move
            score = E(total,
                      length(source, assignments),
                      length(target, assignments),
                      avg(data, source, assignments),
                      avg(data, target, assignments))
            if score > best:
                best_index = i
                best = score
                print(label, best)
            assignments[i] = source  # undo the tentative move
        if best_index is None:
            # E has reached its extremum: no single move improves it.
            break
        assignments[best_index] = target


def decomposition_clustering(data, assignments):
    """Split the samples into three classes by two successive decompositions.

    First class 0 is split into classes 0 and 1, then class 1 is split into
    classes 1 and 2.  The second step always splits class 1; it does not
    compare which class would be the better one to split.

    Args:
        data: The sample vectors.
        assignments: Initial class label of each sample; modified in place.

    Returns:
        The same *assignments* list, now holding the labels 0, 1 and 2.
    """
    _split_class(data, assignments, 0, 1, 'max_1:')
    _split_class(data, assignments, 1, 2, 'max_2:')
    return assignments
def acc(result):
    """Count the samples whose label matches their block of 50.

    The expected label of sample ``n`` (0 <= n < 150) is ``n // 50``.  The
    counts are printed as well as returned.

    Args:
        result: Sequence of ``(label, sample)`` pairs.

    Returns:
        A ``(matches, examined)`` tuple.
    """
    matches = 0
    examined = 0
    for position in range(150):
        expected = position // 50
        if result[position][0] == expected:
            matches += 1
        examined += 1
    print('sum:', matches, 'all:', examined)
    return matches, examined


if __name__ == "__main__":
    # Cluster the data set, list the members of each class, then report how
    # many samples received the expected label.
    data = loadIRISdata(dataname)
    # Every sample starts in class 0.
    assignments = [0] * len(data)
    answer = decomposition_clustering(data, assignments)
    result = list(zip(answer, data))
    for i in range(3):
        tag = 0  # number of indices printed on the current output line
        print('\n')
        print("第%d类数据有:" % (i+1))
        for index, (label, _sample) in enumerate(result):
            if label == i:
                print(index, end=' ')
                tag += 1
            if tag > 20:
                # Wrap the listing once 21 indices have been printed.
                print('\n')
                tag = 0
    print('\n')
    correct, total = acc(result)
    # "%.2f" prints two decimals; the former "%2f" only set a minimum width.
    print('分解聚类法准确度为:%.2f%%' % ((correct/total)*100))

分解聚类结果:

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

模式识别设计(Python编程):IRIS数据集的Kmeans聚类与分解聚类法

 

相关标签: python 模式识别