Implementing the ID3 Decision Tree Algorithm in Python
We build a decision tree from two features, hair length and voice pitch, using the following data (reproduced from the dataset used in the code below):

hair   voice   gender
long   coarse  male
short  coarse  male
short  coarse  male
long   fine    female
short  fine    female
short  coarse  female
long   coarse  female
long   coarse  female
Entropy measures how mixed (complex) the information in a dataset is: the larger the entropy, the more mixed the classes. For a dataset whose classes occur with proportions p1, ..., pn, the formula is:

H = -(p1*log2(p1) + p2*log2(p2) + ... + pn*log2(pn))
Information gain is the difference between two entropies: the entropy of the dataset before a split minus the conditional entropy that remains after splitting on a feature.
Taking the data above as sample D (3 males and 5 females out of 8 samples), its entropy is:
H(D)=-[3/8*log2(3/8)+5/8*log2(5/8)]=0.9544
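This value is easy to verify in Python (a quick check, not part of the original article):

from math import log2

H_D = -(3/8 * log2(3/8) + 5/8 * log2(5/8))
print(round(H_D, 4))  # 0.9544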
First, compute the conditional entropy when splitting by hair length. Splitting on hair yields a long-haired group (1 male, 3 females) and a short-haired group (2 males, 2 females), so:
H(D|A)=-4/8*(1/4*log2(1/4)+3/4*log2(3/4))-4/8*(2/4*log2(2/4)+2/4*log2(2/4))=0.9056
The information gain from splitting on hair first is therefore:
I(D,A)=H(D)-H(D|A)=0.0488
Next, compute the conditional entropy when splitting by voice pitch. Splitting on voice yields a coarse-voiced group (3 males, 3 females) and a fine-voiced group (2 females), so:
H(D|B)=-6/8*(3/6*log2(3/6)+3/6*log2(3/6))-2/8*(2/2*log2(2/2))=0.75
The information gain from splitting on voice first is therefore:
I(D,B)=H(D)-H(D|B)=0.2044
Since I(D,B) > I(D,A), we split on voice first: its larger information gain means it separates the samples better and is the more informative feature. Always choosing the feature with the largest information gain at each node is the core idea of the ID3 algorithm.
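Both gains can be checked the same way; the snippet below (illustrative, with my own helper entropy, not from the original article) reproduces the hand calculations:

from math import log2

def entropy(*counts):  # entropy of a class distribution, given raw class counts
    total = sum(counts)
    return -sum(c / total * log2(c / total) for c in counts if c)

H_D = entropy(3, 5)  # 0.9544
H_D_hair = 4/8 * entropy(1, 3) + 4/8 * entropy(2, 2)  # 0.9056
H_D_voice = 6/8 * entropy(3, 3) + 2/8 * entropy(2)  # 0.75
print(round(H_D - H_D_hair, 4))  # 0.0488
print(round(H_D - H_D_voice, 4))  # 0.2044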
Implementing the ID3 algorithm in Python:
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from math import log
import operator
def calcShannonEnt(dataSet):  # compute the Shannon entropy of the dataset
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:  # count the samples in each class
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)  # H = -sum(p * log2(p))
    return shannonEnt
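# Sanity check (illustrative, not in the original article): an even 50/50 class
# split has maximal entropy, so calcShannonEnt([['x', 'male'], ['x', 'female']])
# returns 1.0; on the full 8-sample dataset below it returns ~0.9544, matching
# the hand-computed H(D) above.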
def createDataSet1():  # create the sample dataset; columns: hair, voice, gender
    dataSet = [['long', 'coarse', 'male'],
               ['short', 'coarse', 'male'],
               ['short', 'coarse', 'male'],
               ['long', 'fine', 'female'],
               ['short', 'fine', 'female'],
               ['short', 'coarse', 'female'],
               ['long', 'coarse', 'female'],
               ['long', 'coarse', 'female']]
    labels = ['hair', 'voice']
    return dataSet, labels
def splitDataSet(dataSet, axis, value):  # keep the rows where feature `axis` equals `value`
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # drop the split feature's column
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
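# Example (illustrative): splitDataSet(dataSet, 1, 'fine') on the dataset above
# returns [['long', 'female'], ['short', 'female']] -- the two fine-voiced rows
# with the voice column removed.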
def chooseBestFeatureToSplit(dataSet):  # choose the feature with the largest information gain
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # H(D)
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:  # conditional entropy H(D|feature i)
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:  # the split that reduces entropy the most wins
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
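# On the sample dataset this returns 1 (voice): its gain of 0.2044 beats
# hair's 0.0488, exactly as in the hand calculation above.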
def majorityCnt(classList):  # majority vote; e.g. 2 males and 1 female -> 'male'
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
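# Example: majorityCnt(['male', 'male', 'female']) returns 'male'.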
def createTree(dataSet, labels):  # build the decision tree recursively
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all samples share one class: return a leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}  # the tree is stored as a nested dict
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
if __name__ == "__main__":
    dataSet, labels = createDataSet1()
    print(createTree(dataSet, labels))  # print the learned model
The output (dict key order may vary between runs) is:

{'voice': {'coarse': {'hair': {'long': 'female', 'short': 'male'}}, 'fine': 'female'}}

Drawn as a tree: the root splits on voice; a fine voice leads directly to 'female', while a coarse voice splits again on hair, with long hair classified as 'female' and short hair as 'male'.
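A tree in this nested-dict form can also be used for prediction with a small lookup helper. Below is a minimal sketch (the classify function is my addition, not part of the original article):

def classify(tree, labels, sample):  # walk the nested dict until a leaf is reached
    if not isinstance(tree, dict):  # leaf node: return the class string
        return tree
    feature = next(iter(tree))  # e.g. 'voice'
    value = sample[labels.index(feature)]  # the sample's value for that feature
    return classify(tree[feature][value], labels, sample)

# Example: classify(myTree, ['hair', 'voice'], ['short', 'coarse']) returns 'male'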