欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

机器学习实战-使用决策树预测隐形眼镜类型-python3

程序员文章站 2024-02-26 16:10:40
...
from math import log

数据集链接

计算给定数据集的香农熵

def calcShannonEnt(dataSet):
    numEntries=len(dataSet)
    labelCounts={}
    for featVec in dataSet:
        currentLabel=featVec[-1]
        #print('calcShannonEnt函数输出featVec:')
        #print(featVec)
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel]=0
        labelCounts[currentLabel]+=1
    shannonEnt=0.0
    for key in labelCounts:
        prob=float(labelCounts[key])/numEntries
        shannonEnt-=prob*log(prob,2)
    return shannonEnt

按照给定特征划分数据集

def splitDataSet(dataSet,axis,value):
    retDataSet=[]
    for featVec in dataSet:
        if featVec[axis]==value:
            reducedFeatVec=featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

选择最好的数据集划分方式

def chooseBestFeatureToSplit(dataSet):
    numFeatures=len(dataSet[0])-1#特征的数量(因每一行有一列类别,故减一)
    baseEntropy=calcShannonEnt(dataSet)
    bestInfoGain=0.0
    bestFeature=-1
    for i in range(numFeatures):
        featList=[example[i] for example in dataSet]
        #print('chooseBestFeatureToSplit函数输出featList:')
        #print(featList)
        uniqueVals=set(featList)
        newEntropy=0.0
        for value in uniqueVals:
            subDataSet=splitDataSet(dataSet,i,value)
            prob=len(subDataSet)/float(len(dataSet))
            newEntropy+=prob*calcShannonEnt(subDataSet)
        infoGain=baseEntropy-newEntropy
        if(infoGain>bestInfoGain):
            bestInfoGain=infoGain
            bestFeature=i
    return bestFeature

选取频率最高的类

import operator
def majorityCnt(classList):
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote]=0
        classCount[vote]+=1
    sortedClassCount=sorted(classCount.iteritems(),key=operator.itemgetter(1),reverse=Ture)
    return sortedClassCount[0][0]

创建决策树

def createTree(dataSet,labels):
    classList=[example[-1] for example in dataSet]
    if classList.count(classList[0])==len(classList):
        return classList[0]
    if len(dataSet[0])==1:
        return majorityCnt(classList)
    bestFeat=chooseBestFeatureToSplit(dataSet)
    bestFeatLabel=labels[bestFeat]
    myTree={bestFeatLabel:{}}
    #print(labels[bestFeat])
    del (labels[bestFeat])
    featValues=[example[bestFeat] for example in dataSet]
    uniqueVals=set(featValues)
    for value in uniqueVals:
        subLabels=labels[:]
        myTree[bestFeatLabel][value]=createTree(splitDataSet(dataSet,bestFeat,value),subLabels)
    return myTree

使用决策树分类

def classify(inputTree,featLabels,testVec):
    firstStr=list(inputTree.keys())[0]#源代码没有list,会产生错误:TypeError: 'dict_keys' object does not support indexing,这是由于python3改变了dict.keys,返回的是dict_keys对象,支持iterable 但不支持indexable,我们可以将其明确的转化成list:
    #print(firstStr)
    #print(featLabels)
    secondDict=inputTree[firstStr]
    featIndex=featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex]==key:
            if type(secondDict[key]).__name__=='dict':
                classLabel=classify(secondDict[key],featLabels,testVec)
            else:
                classLabel=secondDict[key]
    return classLabel

调用函数

fr=open('lenses.txt')
lenses=[inst.strip().split('\t') for inst in fr.readlines()]
#print(lenses)
lensesLabels=['age','prescript','astigmatic','tearRate']
lensesTree=createTree(lenses,lensesLabels)
lensesTree

lensesLabels=['age','prescript','astigmatic','tearRate']#直接用上边的lenseslabels会缺少属性,个人感觉是createtree函数中的del()的问题
#print(lensesLabels)
classify(lensesTree,lensesLabels,['presbyopic','hyper','no','normal'])

运行结果

机器学习实战-使用决策树预测隐形眼镜类型-python3

机器学习实战-使用决策树预测隐形眼镜类型-python3