ML algorithm2 决策树
程序员文章站
2022-05-21 23:29:55
...
注释都写在代码里面了
有些是直接复制粘贴的 有些是抄写的 对于我这个初学者这一阶段 我认为首先要做到理解 才能继续往下学习 所以很少会直接手撕代码 一是对python的熟练程度不够 二是对算法理解不完全正确 所以借抄代码的名义巩固对算法的理解 岂不美哉?
import math
def createDataSet():
    """Build the toy training set.

    Returns (dataSet, featureNames): each dataSet row is
    [feature0, feature1, classLabel], and featureNames labels the two
    feature columns ('no surfacing' = survives without surfacing,
    'flippers' = has flippers).
    """
    rows = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return rows, ['no surfacing', 'flippers']
def Entropy(dataSet):
    """Compute the Shannon entropy of the class labels in dataSet.

    dataSet: list of rows; the last element of each row is the class label.
    Returns the entropy in bits (0.0 for an empty or single-class data set).

    Fixes over the original: dict.get replaces the `not in keys()`
    double lookup, the redundant float() cast is dropped (Py3 `/` is
    true division), and math.log2 replaces math.log(prob, 2).
    """
    num = len(dataSet)
    # Count how many rows carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # last column is the class label
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    entropy = 0.0
    for count in labelCounts.values():
        prob = count / num
        entropy -= prob * math.log2(prob)  # base-2 log -> entropy in bits
    return entropy
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature at column `axis` equals `value`.

    The matched column itself is removed from every returned row, so the
    result is ready for the next round of splitting.
    """
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
#myData,myFeatureNames = createDataSet()
#print(splitDataSet(myData,0,1)) #axis = 0,且这个特征的值=1
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index whose split yields the largest information gain.

    Returns -1 when no split strictly improves on the base entropy.
    """
    baseEntropy = Entropy(dataSet)  # entropy before any split
    total = len(dataSet)
    bestInfoGain, bestFeature = 0.0, -1
    for col in range(len(dataSet[0]) - 1):  # last column is the class label
        # Weighted entropy after partitioning on feature `col`.
        weightedEntropy = 0.0
        for value in {row[col] for row in dataSet}:
            subset = splitDataSet(dataSet, col, value)
            weightedEntropy += (len(subset) / float(total)) * Entropy(subset)
        gain = baseEntropy - weightedEntropy  # information gain of this split
        if gain > bestInfoGain:
            bestInfoGain, bestFeature = gain, col
    return bestFeature
#index = chooseBestFeatureToSplit(myData)
#print(index)
import operator
#对字典排序,取得最大值:将多数的类别标签作为“叶子节点”的类别
def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote).

    Ties break in favor of the label that reaches the maximum count first
    in insertion order — the same winner the original's stable descending
    sort produced.

    Fixes over the original: the leftover debug print is removed, the
    O(n log n) sort is replaced by a single max() over the counts, and
    dict.get replaces the `not in keys()` double lookup.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    return max(classCount, key=classCount.get)
#c = [1,1,1,0,0,2,2,2,2]
#print("the majorityClassCount is 2\n",majorityCnt(c))
def createTree(dataSet, featureNames):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows of feature values with the class label in the last column.
    featureNames: names of the feature columns, parallel to the rows.
    Returns either a bare class label (leaf) or
    {featureName: {featureValue: subtree}}.

    Fix over the original: the chosen feature name is removed from a *copy*
    of featureNames instead of `del`-ing it from the caller's list, so the
    argument is no longer mutated in place.
    """
    classList = [example[-1] for example in dataSet]
    # Leaf: every remaining row carries the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Leaf: only the label column is left -> fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestFeatureName = featureNames[bestFeature]
    myTree = {bestFeatureName: {}}  # nested-dict tree node
    # Copy-without-the-chosen-column; the caller's list is untouched.
    subNames = featureNames[:bestFeature] + featureNames[bestFeature + 1:]
    # One subtree per observed value of the chosen feature.
    for value in {example[bestFeature] for example in dataSet}:
        myTree[bestFeatureName][value] = createTree(
            splitDataSet(dataSet, bestFeature, value), subNames)
    return myTree
# Demo: build a decision tree from the toy data set and print it.
myData,myFeatureNames = createDataSet()
myTree = createTree(myData,myFeatureNames)
print(myTree)
print(myFeatureNames) # printed to show whether createTree altered the list (lists are passed by reference)
#%%
#测试算法:对于某个输入变量x,使用决策树,进行分类
def classify(inputTree, labels, testVec):
    """Classify testVec by walking the decision tree.

    inputTree: nested-dict tree as built by createTree.
    labels: feature names, used to map a tree node to a testVec index.
    testVec: feature values in the same order as labels.
    Returns the predicted class label.
    Raises KeyError if testVec holds a feature value the tree never saw
    (the original instead died with UnboundLocalError on that path).
    """
    firstStr = next(iter(inputTree))          # feature tested at this node
    secondDict = inputTree[firstStr]          # value -> subtree/label map
    featIndex = labels.index(firstStr)        # column of that feature in testVec
    # Direct lookup replaces the original scan over every key, which left
    # classLabel unbound when no key matched.
    subtree = secondDict[testVec[featIndex]]
    if isinstance(subtree, dict):
        return classify(subtree, labels, testVec)  # internal node: recurse
    return subtree                                  # leaf: the class label
# Demo: rebuild the tree, then classify one sample with it.
myData,myFeatureNames = createDataSet()
myTree = createTree(myData,myFeatureNames)
myData,myFeatureNames = createDataSet() # reload in case createTree modified the name list in place (lists are passed by reference)
class0 = classify(myTree,myFeatureNames,[1,0])
print(class0)
上一篇: 编程五大常用算法
下一篇: HNOI 2011 卡农 题解