
Implementing the ID3 Algorithm in Python, with Results on the Watermelon Dataset


The dataset and the points to watch out for are already posted in my first BP-algorithm article; please have a look there.
I won't analyse the code line by line here, since there are plenty of write-ups online; a quick sketch of the entropy and information-gain calculation that ID3 relies on follows below, and feel free to leave a comment if anything is unclear.
Give it a like and a follow if it helps.
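Before the full listing, here is a minimal sketch (not from the original post) of the two quantities ID3 computes: Ent(D) and Gain(D, a). The label and attribute values below are made-up toy data for illustration, not rows of the watermelon dataset:

import numpy as np
import pandas as pd

y = pd.Series(['好瓜', '好瓜', '坏瓜', '坏瓜', '好瓜'])        # toy class labels
p = y.value_counts(normalize=True)                              # class probabilities p_k
entropy_d = -(p * np.log2(p)).sum()                             # Ent(D) = -sum p_k * log2(p_k)

attr = pd.Series(['清晰', '清晰', '模糊', '清晰', '模糊'])       # one toy attribute column
cond_entropy = 0.0
for val, rate in attr.value_counts(normalize=True).items():     # rate = |Dv| / |D|
    sub = y[attr == val]
    ps = sub.value_counts(normalize=True)
    cond_entropy += rate * -(ps * np.log2(ps)).sum()

gain = entropy_d - cond_entropy                                  # Gain(D, a) = Ent(D) - sum |Dv|/|D| * Ent(Dv)
print(entropy_d, gain)                                           # ID3 splits on the attribute with the largest gain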
Now the full code:

# encoding:utf-8

import json

import numpy as np
import pandas as pd


class DecisionTree:
    def __init__(self):
        self.model = None

    def calEntropy(self, y):  # entropy of a label Series
        valRate = y.value_counts().apply(lambda x: x / y.size)  # class counts -> class probabilities p_k
        valEntropy = np.inner(valRate, np.log2(valRate)) * -1  # Ent(D) = -sum p_k * log2(p_k)
        return valEntropy

    def fit(self, xTrain, yTrain=None):
        if yTrain is None:  # if no labels are passed, use the last column as the class label
            yTrain = xTrain.iloc[:, -1]
            xTrain = xTrain.iloc[:, :-1]
        self.model = self.buildDecisionTree(xTrain, yTrain)
        return self.model

    def buildDecisionTree(self, xTrain, yTrain):
        propNamesAll = xTrain.columns
        yTrainCounts = yTrain.value_counts()
        if yTrainCounts.size == 1:  # all remaining samples share one class: return it as a leaf
            return yTrainCounts.index[0]
        if len(propNamesAll) == 0:  # no attributes left to split on: return the majority class
            return yTrainCounts.index[0]
        entropyD = self.calEntropy(yTrain)

        maxGain = None
        maxEntropyPropName = None
        for propName in propNamesAll:  # choose the attribute with the largest information gain
            propDatas = xTrain[propName]
            propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)  # value proportions |Dv| / |D|

            sumEntropyByProp = 0
            for propClass, dvRate in propClassSummary.items():
                yDataByPropClass = yTrain[xTrain[propName] == propClass]
                entropyDv = self.calEntropy(yDataByPropClass)
                sumEntropyByProp += entropyDv * dvRate
            gainEach = entropyD - sumEntropyByProp  # Gain(D, a) = Ent(D) - sum |Dv|/|D| * Ent(Dv)
            if maxGain is None or gainEach > maxGain:
                maxGain = gainEach
                maxEntropyPropName = propName
        propDatas = xTrain[maxEntropyPropName]
        propClassSummary = propDatas.value_counts().apply(lambda x: x / propDatas.size)  # value proportions of the chosen attribute

        retClassByProp = {}
        for propClass, dvRate in propClassSummary.items():  # build one subtree per value of the chosen attribute
            whichIndex = xTrain[maxEntropyPropName] == propClass
            xDataByPropClass = xTrain[whichIndex].drop(columns=[maxEntropyPropName])  # drop the attribute that has just been used
            yDataByPropClass = yTrain[whichIndex]
            retClassByProp[propClass] = self.buildDecisionTree(xDataByPropClass, yDataByPropClass)

        return {'Node': maxEntropyPropName, 'Edge': retClassByProp}

    def predictBySeries(self, modelNode, data):  # walk the tree for a single sample (a pd.Series)
        if not isinstance(modelNode, dict):  # leaf node: return the class label
            return modelNode
        nodePropName = modelNode['Node']
        prpVal = data.get(nodePropName)
        for edge, nextNode in modelNode['Edge'].items():
            if prpVal == edge:
                return self.predictBySeries(nextNode, data)
        return None  # the sample has an attribute value never seen during training

    def predict(self, data):
        if isinstance(data, pd.Series):  # single sample
            return self.predictBySeries(self.model, data)
        return data.apply(lambda d: self.predictBySeries(self.model, d), axis=1)  # DataFrame: predict row by row


dataTrain = pd.read_csv(r"C:\Users\杨涵文\PycharmProjects\BP算法\方法一\data\table_4.2.csv", encoding="utf-8")  # watermelon dataset 4.2; adjust the path to your own copy

decisionTree = DecisionTree()
treeData = decisionTree.fit(dataTrain)
print(pd.DataFrame({'预测值': decisionTree.predict(dataTrain), '正确值': dataTrain.iloc[:, -1]}))  # predictions on the training set vs. the true labels

print(json.dumps(treeData, ensure_ascii=False))  # print the learned tree as nested JSON
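For reference, the dumped model is just a nested dict: every internal node has the form {'Node': attribute, 'Edge': {value: subtree}} and every leaf is a class label. The attribute and value names in this sketch are hypothetical, not the actual output:

{'Node': '纹理',
 'Edge': {'清晰': {'Node': '根蒂', 'Edge': {'蜷缩': '是', '稍蜷': '是'}},
          '稍糊': '否',
          '模糊': '否'}}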

Run result:
[Screenshot of the program output from the original post.]
Pretty neat, isn't it? Personally I like the ID3 algorithm even better, haha, but judge for yourself.
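To classify a brand-new sample rather than the training rows, you can pass a single pd.Series to predict. This is only a usage sketch: the attribute names and values below are assumptions about the columns in table_4.2.csv, so replace them with the real ones from your CSV.

sample = pd.Series({'色泽': '青绿', '根蒂': '蜷缩', '敲声': '浊响',
                    '纹理': '清晰', '脐部': '凹陷', '触感': '硬滑'})  # hypothetical attribute names and values
print(decisionTree.predict(sample))  # the predicted class label, or None if a branch value was never seen in training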
