欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Python3机器学习实战ID3决策树实现+matplotlib绘制树形图

程序员文章站 2024-02-11 13:28:04
...

trees.py

import math
import operator


def calc_shannon_entropy(dataset):
    """
    计算香农熵,似乎是全类的标记
    熵越高,混合的数据越多
    另一种度量如基尼不纯度,简单来讲就是随机选子项度量误分概率
    """
    label_counts = {}
    for feature_vec in dataset:
        # 提取的就是那个分类的结果
        current_label = feature_vec[-1]
        # 计算一下各种结果一共有多少个
        label_counts[current_label] =  label_counts.get(current_label,0) + 1
    # 香农熵
    shannon_entropy = 0.
    for key in label_counts.keys():
        # 计算每一种结果的概率
        prob = float(label_counts[key]) / len(dataset)
        # 计算它的香农熵
        shannon_entropy -= prob * math.log(prob,2)
    return shannon_entropy

def create_dataset():
    """
    简单版的创建数据
    数据是列表,各个特征长度一致,最后一个是类别标签
    """
    # 可以把其中一个换成'maybe'试一试,熵明显会变大
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no'],
               ]
    labels = ['no surfacing', 'flippers']
    return  dataset, labels

def split_dataset(dataset,axis,value):
    """
    划分数据集
    """
    ret_dataset = []
    for feature_vec in dataset:
        if feature_vec[axis] == value:
            # 反正这这两句的意思就是剔除[axis]对应的元素
            reduced_feature_vec = feature_vec[:axis]
            reduced_feature_vec.extend(feature_vec[axis+1:])
            # 把剔除后的特征向量放进集合中
            ret_dataset.append(reduced_feature_vec)
    return ret_dataset

def choose_best_feature_to_split(dataset):
    """
    熵的计算理解:按各自占的比例一直往下算,最后一步shannon_entropy -= prob * math.log(prob,2)
    都计算完了再根据权值加回来
    返回划分数据集最佳的特征
    """
    # 减去类别标签,看有几个特征
    num_features = len(dataset[0]) - 1
    # 无序时候的基本熵
    base_entropy = calc_shannon_entropy(dataset)
    # 先任意初始化最佳信息增益和与之对应的最佳特征
    best_info_gain = 0.
    best_feature = -1
    # 尝试用每一个特征来划分
    for i in range(num_features):
        feature_list = [example[i] for example in dataset]
        # 每种分类去掉重复的特征
        unique_vals = set(feature_list)
        new_entropy = 0.
        # 对每一个唯一的特征划分数据集,计算新熵,对所有唯一的熵求和
        for value in unique_vals:
            # 开始划分
            sub_dataset = split_dataset(dataset, i, value)
            # 计算概率和新的熵
            prob = len(sub_dataset)/float(len(dataset))
            new_entropy += prob*calc_shannon_entropy(sub_dataset)
        # 信息增益是原本的熵-新的?
        info_gain = base_entropy - new_entropy
        # 假如熵大大减少了
        if(info_gain > best_info_gain):
            # 设定这个信息增益最佳,锁定最佳特征
            best_info_gain = info_gain
            best_feature = i
    # 返回的是最佳特征
    return best_feature

def majority_cnt(class_list):
    """多数表决,用于节点没有特征却有多分类的情况"""
    class_count = {}
    # 计算一下频率
    for vote in class_list:
        class_count[vote] = class_count.get(vote,0) + 1
    # 根据频率排序
    sorted_class_count = sorted(class_count.items(),key=operator.itemgetter(1),reverse=True)
    # 取频率最高的那个的第一个特征
    return sorted_class_count[0][0]

def create_tree(dataset,labels):
    """主方法"""
    # 取出类标签
    class_list = [example[-1] for example in dataset]
    # 如果数据中只含有一种类标签,返回该类
    if class_list.count(class_list[0]) == len (class_list):
        return class_list[0]
    # 如果数据没有特征了,但类别还有多种(因为上一关筛选掉了单种的),就进行多数表决
    if len (dataset[0]) == 1:
        return majority_cnt(class_list)
    # 选择最好的划分特征,并几下这个分类
    best_feature = choose_best_feature_to_split(dataset)
    # 这个特征叫什么名字
    best_feature_label = labels[best_feature]
    # 如果用的是递归,最内层就是最底层的那个分类
    tree = {best_feature_label:{}}
    # 用过这个特征了就删掉,不然会有重复
    del(labels[best_feature])
    # 相同特征只取一份
    feature_values = [example[best_feature] for example in dataset]
    unique_vals = set(feature_values)
    # 开始往底层递归了
    for value in unique_vals:
        # 除去刚那个已经删掉的特征剩下的子集
        sub_labels = labels[:]
        # 对子树进行操作
        tree[best_feature_label][value] = create_tree(split_dataset(dataset,best_feature,value),sub_labels)
    return tree

def store_tree(tree,filename):
    # 序列化,写到本地磁盘文件
    import pickle
    with open(filename,'wb') as f:
        pickle.dump(tree,f)

def grab_tree(filename):
    # 反序列化,从本地文件读出原有的对象
    import pickle
    with open(filename,'rb') as f:
        return pickle.load(f)


def main():
    dataset,labels = create_dataset()
    # print(dataset[0])
    # print(calc_shannon_entropy(dat))
    # a = split_dataset(dataset, 0, 1)
    # b = split_dataset(dataset, 0, 0)
    # c = choose_best_feature_to_split(dataset)
    t = create_tree(dataset,labels)
    print(t)
    store_tree(t,'tree.txt')
    a = grab_tree('tree.txt')
    print(a)

if __name__ == '__main__':
    main()


tree_plotter.py

import matplotlib.pyplot as plt

# 定义文本框和箭头格式
# 文档找不到
# 第一个参数是形状,第二个不知道
decision_node = dict(boxstyle="sawtooth", fc="0.8")
leaf_node = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def plot_node(node_text, center, parent, node_type):
    # create_plot.ax1 函数变量??
    create_plot.ax1.annotate(node_text,xy=parent,xycoords='axes fraction',xytext=center,textcoords='axes fraction',
                             va='center',ha='center',bbox=node_type,arrowprops=arrow_args)

def create_plot(tree):
    # 创建新图形并清空绘图区
    figure = plt.figure(1,facecolor='white')
    figure.clf()
    # 下面三行就绘制的那个示例
    # create_plot.ax1 = plt.subplot(111,frameon = False)
    # plot_node('a decision node',(0.5,0.1),(0.1,0.5),decision_node)
    # plot_node('a leaf node',(0.8,0.1),(0.3,0.8),leaf_node)

    axprops = dict(xticks=[],yticks=[])
    create_plot.ax1 = plt.subplot(111,frameon = False,**axprops)
    plot_tree.total_w = float(get_num_leafs(tree))
    plot_tree.total_d = float(get_tree_depth(tree))
    plot_tree.x_off = -0.5/plot_tree.total_w
    plot_tree.y_off = 1.
    plot_tree(tree,(0.5,1.),'')

    plt.show()

def plot_mid_text(center,parent,txt_string):
    # 中间文本的坐标,上减下加上下
    x_mid = (parent[0]-center[0])/2. + center[0]
    y_mid = (parent[1]-center[1])/2. + center[1]
    create_plot.ax1.text(x_mid,y_mid,txt_string)

def plot_tree(tree,parent,node_text):
    """晦涩"""
    # 计算叶子数量
    num_leafs = get_num_leafs(tree)
    depth = get_tree_depth(tree)
    first_str = list(tree.keys())[0]
    # 定位
    center = (plot_tree.x_off + (1.0+float(num_leafs))/2./plot_tree.total_w,plot_tree.y_off)
    # 中间的文本
    plot_mid_text(center,parent,node_text)
    # 节点
    plot_node(first_str,center,parent,decision_node)
    second_dict = tree[first_str]
    plot_tree.y_off -= 1./plot_tree.total_d
    # 开始画了,也是递归
    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            plot_tree(second_dict[key],center,str(key))
        else:
            plot_tree.x_off += 1./plot_tree.total_w
            plot_node(second_dict[key],(plot_tree.x_off,plot_tree.y_off),center,leaf_node)
            plot_mid_text((plot_tree.x_off,plot_tree.y_off),center,str(key))
    plot_tree.y_off += 1./plot_tree.total_d


def get_num_leafs(tree):
    """递归求叶子"""
    num_leafs = 0
    first_str = list(tree.keys())[0]
    second_dict = tree[first_str]
    for key in second_dict.keys():
        # 如果节点还是一个字典,就说明还可以继续
        if type(second_dict[key]).__name__ == 'dict':
            num_leafs += get_num_leafs(second_dict[key])
        else:
            # 每次发现一个节点就加一,最终的那个子叶也是加个1就跑了
            num_leafs +=1

    return num_leafs

def get_tree_depth(tree):
    """其实差不太多"""
    max_depth = 0
    first_str = list(tree.keys())[0]
    second_dict = tree[first_str]
    for key in second_dict.keys():
        if type(second_dict[key]).__name__ == 'dict':
            this_depth = 1 + get_tree_depth(second_dict[key])
        else:
            this_depth = 1
        if this_depth > max_depth:
            max_depth = this_depth
    return max_depth

def retrieve_tree(i):
    # 先弄两个树来检索,省得一直创建
    list_of_trees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                     {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0:'no', 1: 'yes'}},1:'no'}}}},
                     ]
    return list_of_trees[i]

def main():
    create_plot(retrieve_tree(0))
    print(get_num_leafs(retrieve_tree(0)))
    print(get_tree_depth(retrieve_tree(0)))

if __name__ == '__main__':
    main()


第一个文件增补:

def classify(tree,feature_labels,test_vec):
    """
    第一个节点为当前母节点,往下找子节点,找到最后就返回
    """
    first_str = list(tree.keys())[0]
    # 字典的值,下面主要是判断这个还是不是字典,不是就是最终类标签
    second_dict = tree[first_str]
    # 找特征标签中对应这个键的序号
    feature_index = feature_labels.index(first_str)
    # 如果第二个字典的值还是字典
    for key in second_dict.keys():
        # 判断是0还是1
        if test_vec[feature_index] == key:
            print(key)
            if type(second_dict[key]).__name__ == 'dict':
                class_label = classify(second_dict[key],feature_labels,test_vec)
            else:
                class_label = second_dict[key]
    return class_label

部分output

生成的决策树:{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

对应的树形图

Python3机器学习实战ID3决策树实现+matplotlib绘制树形图


Python3机器学习实战ID3决策树实现+matplotlib绘制树形图