
ML Algorithm 2: Decision Trees


All the explanations are written as comments in the code.

Some of this code is copied and pasted, and some transcribed by hand. At this beginner stage, I think I first have to understand the algorithm before I can move on, so I rarely write the code completely from scratch: my Python is not fluent enough, and my grasp of the algorithm is not yet fully correct. So I borrow code as a way to consolidate my understanding of the algorithm. Isn't that a fine arrangement?

import math

def createDataSet():   # create the toy data set (the last column is the class label)
    dataSet = [[1,1,'yes'],
               [1,1,'yes'],
               [1,0,'no'],
               [0,1,'no'],
               [0,1,'no']]
    featureNames = ['no surfacing','flippers'] # can it survive without surfacing? does it have flippers?
    return dataSet, featureNames

def Entropy(dataSet):
    num = len(dataSet)
    labelCounts = {}
    for featVec in dataSet: # count how many samples fall in each class
        currentLabel = featVec[-1]  # the last column is the class label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0  # first time we see this label
        labelCounts[currentLabel] += 1  # tally it
    entropy = 0.0 # Shannon entropy: -sum of p*log2(p) over all classes
    for key in labelCounts:
        prob = float(labelCounts[key])/num
        entropy -= prob * math.log(prob,2) # log base 2
    return entropy
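
As a quick sanity check: the sample set above has two 'yes' and three 'no' labels, so its entropy should come out to -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971. A commented-out test, assuming the two functions above:

#myData, myFeatureNames = createDataSet()
#print(Entropy(myData))  # expected: 0.9709505944546686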

def splitDataSet(dataSet,axis,value):
    returnDataSet = []
    for dataVec in dataSet:
        if dataVec[axis] == value:
            tempVec = dataVec[:axis]  # everything to the left of the axis column
            tempVec.extend(dataVec[axis + 1:])  # then everything to its right, skipping the axis column itself
            returnDataSet.append(tempVec)
    return returnDataSet

#myData,myFeatureNames = createDataSet()
#print(splitDataSet(myData,0,1)) # split on feature 0, keeping rows where its value == 1
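
If you uncomment the test above, the three rows whose first feature equals 1 are kept and that column is dropped, so the output should be:

#[[1, 'yes'], [1, 'yes'], [0, 'no']]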

def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1 # number of features (the last column is the label)
    baseEntropy = Entropy(dataSet)  # entropy of the whole data set before splitting
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featureList = [example[i] for example in dataSet] # collect all values of the i-th feature into a list
        uniqueVal = set(featureList) # use set() to remove duplicates
        newEntropy = 0.0
        for value in uniqueVal:
            subDataSet = splitDataSet(dataSet, i, value) # split on feature i at this particular value
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * Entropy(subDataSet)   # weighted sum of the subset entropies
        infoGain = baseEntropy - newEntropy  # information gain of splitting on feature i
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain         # keep the best split so far
            bestFeature = i
    return bestFeature

#index = chooseBestFeatureToSplit(myData)
#print(index)
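
Working through the sample data by hand: splitting on 'no surfacing' leaves subsets with weighted entropy 3/5 * 0.918 + 2/5 * 0 ≈ 0.551, an information gain of about 0.420, while splitting on 'flippers' gives 4/5 * 1.0 + 1/5 * 0 = 0.8, a gain of only about 0.171. So the commented test above should print 0.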

import operator

# sort the counts and take the maximum: the majority class label becomes the class of the "leaf node"
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount: classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    print(sortedClassCount)  # debug: class counts, most frequent first
    #sorted(classCount.items(), key=lambda x:x[1], reverse=True) # equivalent way to sort the dict
    return sortedClassCount[0][0] # return the majority class

#c = [1,1,1,0,0,2,2,2,2]
#print("the majorityClassCount is 2\n",majorityCnt(c))

def createTree(dataSet,featureNames):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList): # all classes identical: count() of the first class equals the list length
        return classList[0] # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1: # stop splitting when there are no more features in dataSet
        return majorityCnt(classList) # only the label column is left, so vote
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestFeatureName = featureNames[bestFeature]
    myTree = {bestFeatureName: {}}  # the tree is stored as nested dicts
    del(featureNames[bestFeature])  # remove bestFeatureName from the name list (this mutates the caller's list!)
    featureValues = [example[bestFeature] for example in dataSet]  # every value taken by the best feature, for branching
    uniqueVals = set(featureValues)
    for value in uniqueVals:
        subNames = featureNames[:]  # copy featureNames so recursive calls on sibling branches don't modify each other's list
        myTree[bestFeatureName][value] = createTree(splitDataSet(dataSet,bestFeature, value), subNames)
    return myTree

myData,myFeatureNames = createDataSet()
myTree = createTree(myData,myFeatureNames)
print(myTree)
print(myFeatureNames)  # the list ends up modified (!!! list arguments are passed by reference, so the del inside createTree changes it)
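
For the sample data, the printed tree should be the nested dict below: 'no surfacing' is tested at the root, and 'flippers' is only tested on the branch where 'no surfacing' equals 1.

#{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}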

#%%
# testing the algorithm: classify an input vector testVec by walking the decision tree
def classify(inputTree, labels, testVec):
    firstStr = next(iter(inputTree))      # root feature name of this (sub)tree
    secondDict = inputTree[firstStr]      # the dict of branches under that feature
    featIndex = labels.index(firstStr)    # column index of that feature in testVec
    classLabel = None                     # stays None if testVec holds a value the tree never saw
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]) == dict:   # internal node: recurse into the subtree
                classLabel = classify(secondDict[key],labels,testVec)
            else:                               # leaf node: its value is the class
                classLabel = secondDict[key]
    return classLabel


myData,myFeatureNames = createDataSet()
myTree = createTree(myData,myFeatureNames)
myData,myFeatureNames = createDataSet() # reload, because createTree modified myFeatureNames
class0 = classify(myTree,myFeatureNames,[1,0])
print(class0)
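
Tracing [1,0] by hand: 'no surfacing' = 1 descends into the 'flippers' subtree, and 'flippers' = 0 lands on the 'no' leaf, so the print above should output 'no'. One more test vector against the same tree:

class1 = classify(myTree,myFeatureNames,[1,1])
print(class1)  # expected: 'yes'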

 
