Decision Trees from Li Hang's Statistical Learning Methods: Python Implementations
Compiled below are three implementations of the decision-tree algorithms from the book: ID3 on a toy data set, C4.5 on MNIST, and CART via scikit-learn.
ID3 implementation:
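At each node, ID3 splits on the feature A with the largest information gain over the current data set D (in the book's notation):

    g(D, A) = H(D) - H(D|A),    H(D) = -sum_k p_k * log2(p_k)

In the listing below, calcShannonEnt computes H(D) and chooseBestFeatureToSplit evaluates g(D, A) for every remaining feature.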
# coding=utf-8
from math import log
import time

def createDataSet():
    """Create a toy data set: 5 samples, 2 binary features, labels 'yes'/'no'."""
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
# Compute the Shannon entropy of a data set (the class label is the last column)
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
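# Worked example on the toy data above: the 5 labels are 2 'yes' and 3 'no', so
# calcShannonEnt(dataSet) = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971 bits.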
# Return the subset of dataSet whose feature at `axis` equals `value`,
# with that feature column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
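# Example: splitDataSet(dataSet, 0, 1) keeps the rows whose first feature is 1
# and drops that column, giving [[1, 'yes'], [1, 'yes'], [0, 'no']].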
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # empirical entropy H(D)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)  # conditional entropy H(D|A)
        infoGain = baseEntropy - newEntropy  # ID3 information gain g(D, A)
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
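# On the toy data: g(D, feature 0) ≈ 0.971 - 0.551 ≈ 0.420 and
# g(D, feature 1) ≈ 0.971 - 0.800 ≈ 0.171, so feature 0 ('no surfacing')
# wins the root split.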
# Because the recursion consumes one feature at every split, the features may
# run out before every subset is pure; in that case the node's class is
# decided by majority vote.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # max(classCount) alone would return the largest key, not the most
    # frequent class, so compare by count
    return max(classCount, key=classCount.get)
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # collect the 'yes'/'no' labels
    if classList.count(classList[0]) == len(classList):  # stop splitting if all samples share one class
        return classList[0]
    if len(dataSet[0]) == 1:  # all features have been used up
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del labels[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy so the recursion does not mutate the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
def main():
    data, labels = createDataSet()
    t1 = time.perf_counter()  # time.clock() was removed in Python 3.8
    myTree = createTree(data, labels[:])  # pass a copy: createTree deletes used labels
    t2 = time.perf_counter()
    print(myTree)
    print('execute for', t2 - t1)

if __name__ == '__main__':
    main()
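On the toy data the script prints the tree as a nested dict:

    {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

The listing stops at building the tree; to label a new sample you walk that dict. A minimal sketch (the classify helper is not part of the original listing; it raises KeyError on a feature value the tree never saw):

def classify(tree, featLabels, testVec):
    """Walk the nested dict until a leaf (a class string) is reached."""
    node = tree
    while isinstance(node, dict):
        featLabel = next(iter(node))             # feature tested at this node
        featIndex = featLabels.index(featLabel)  # its column in the sample
        node = node[featLabel][testVec[featIndex]]
    return node

# classify(myTree, ['no surfacing', 'flippers'], [1, 0]) -> 'no'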
C4.5 implementation:
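C4.5 differs from ID3 only in the splitting criterion: it uses the gain ratio, which divides the information gain by the entropy of the feature's own value distribution and thereby penalizes features with many distinct values:

    g_R(D, A) = g(D, A) / H_A(D)

In the code below, calc_ent_grap(A, D) computes g(D, A) and calc_ent(A) serves as H_A(D). Note that the script grows the tree over raw MNIST pixels in pure Python, so training takes a while.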
# encoding=utf-8
import cv2
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Binarization
def binaryzation(img):
    """
    cv2.threshold(src, thresh, maxval, type[, dst]) -> ret, dst
    src:    grayscale image
    thresh: threshold
    maxval: value assigned to pixels that pass the threshold
    type:   threshold type
    """
    cv_img = img.astype(np.uint8)
    cv2.threshold(cv_img, 50, 1, cv2.THRESH_BINARY_INV, cv_img)
    return cv_img
def binaryzation_features(trainset):
    features = []
    for img in trainset:
        img = np.reshape(img, (28, 28))
        cv_img = img.astype(np.uint8)
        img_b = binaryzation(cv_img)
        features.append(img_b)
    features = np.array(features)
    features = np.reshape(features, (-1, feature_len))  # flatten back to 784-dim 0/1 vectors
    return features
class Tree(object):
    def __init__(self, node_type, Class=None, feature=None):
        self.node_type = node_type  # node type: 'internal' or 'leaf'
        self.dict = {}  # keys are the possible values ai of feature Ag; values are the subtrees reached via ai
        self.Class = Class  # class of a leaf node; None for internal nodes
        self.feature = feature  # index of the feature this node splits on (the one maximizing the gain ratio)

    def add_tree(self, key, tree):
        self.dict[key] = tree

    def predict(self, features):
        if self.node_type == 'leaf' or (features[self.feature] not in self.dict):
            return self.Class  # None when an internal node meets an unseen feature value
        tree = self.dict.get(features[self.feature])
        return tree.predict(features)
# Compute the empirical entropy H(X) of array x
def calc_ent(x):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    return ent
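# Example: calc_ent(np.array([0, 0, 1, 1])) == 1.0 (two equally likely values).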
# Compute the conditional entropy H(Y|X)
def calc_condition_ent(x, y):
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        sub_y = y[x == x_value]
        temp_ent = calc_ent(sub_y)
        ent += (float(sub_y.shape[0]) / y.shape[0]) * temp_ent
    return ent
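# Example: for x = [0, 0, 1, 1] and y = [0, 1, 1, 1],
# H(Y|X) = 0.5 * H([0, 1]) + 0.5 * H([1, 1]) = 0.5 * 1.0 + 0.5 * 0.0 = 0.5.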
# Compute the information gain H(Y) - H(Y|X)
def calc_ent_grap(x, y):
    base_ent = calc_ent(y)
    condition_ent = calc_condition_ent(x, y)
    ent_grap = base_ent - condition_ent
    return ent_grap
# C4.5 algorithm
def recurse_train(train_set, train_label, features):
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # Step 1: if every instance in train_set belongs to the same class Ck, return a leaf
    label_set = set(train_label)
    if len(label_set) == 1:
        return Tree(LEAF, Class=label_set.pop())

    # Step 2: if the feature set is empty, return a leaf labelled with the majority class
    class_len = [(i, len(list(filter(lambda x: x == i, train_label)))) for i in range(class_num)]  # count of each class
    (max_class, max_len) = max(class_len, key=lambda x: x[1])
    if len(features) == 0:
        return Tree(LEAF, Class=max_class)

    # Step 3: compute the gain ratio and pick the feature that maximizes it
    max_feature = 0
    max_gda = 0
    D = train_label
    for feature in features:
        A = np.array(train_set[:, feature].flat)  # column `feature` of the training set
        gda = calc_ent_grap(A, D)  # information gain g(D, A)
        if calc_ent(A) != 0:  # divide by the split information: the only difference from ID3
            gda /= calc_ent(A)  # gain ratio = information gain / entropy of the feature
        if gda > max_gda:  # keep the feature with the largest gain ratio so far
            max_gda, max_feature = gda, feature

    # Step 4: if the best gain ratio is below the threshold, return a leaf
    if max_gda < epsilon:
        return Tree(LEAF, Class=max_class)

    # Step 5: split on the chosen feature and recurse on the non-empty subsets
    sub_features = list(filter(lambda x: x != max_feature, features))
    tree = Tree(INTERNAL, feature=max_feature)
    max_feature_col = np.array(train_set[:, max_feature].flat)
    feature_value_list = set(
        [max_feature_col[i] for i in range(max_feature_col.shape[0])])  # possible values of the chosen feature
    for feature_value in feature_value_list:
        index = []
        for i in range(len(train_label)):
            if train_set[i][max_feature] == feature_value:
                index.append(i)
        sub_train_set = train_set[index]
        sub_train_label = train_label[index]
        sub_tree = recurse_train(sub_train_set, sub_train_label, sub_features)
        tree.add_tree(feature_value, sub_tree)
    return tree
def train(train_set, train_label, features):
    return recurse_train(train_set, train_label, features)

def predict(test_set, tree):
    result = []
    for features in test_set:
        tmp_predict = tree.predict(features)  # may be None if the sample reaches an unseen branch
        result.append(tmp_predict)
    return np.array(result)
class_num = 10  # the MNIST data set has 10 labels: 0-9
feature_len = 784  # each MNIST image has 28*28 = 784 features (pixels)
epsilon = 0.001  # gain-ratio threshold
if __name__ == '__main__':
    print("Start read data...")
    time_1 = time.time()
    raw_data = pd.read_csv('../data/train.csv', header=0)  # read the CSV data
    data = raw_data.values
    imgs = data[:, 1:]
    features = binaryzation_features(imgs)  # binarize the images (important: accuracy is poor otherwise)
    labels = data[:, 0]
    # Hold out a random 33% of the data as a test set; the rest is the training set
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)
    # Build the decision tree with the C4.5 algorithm
    print('Start training...')
    tree = train(train_features, train_labels, list(range(feature_len)))
    print('Start predicting...')
    test_predict = predict(test_features, tree)
    for i in range(len(test_predict)):
        if test_predict[i] is None:  # unseen branch: substitute a dummy label so accuracy_score can run
            test_predict[i] = epsilon
    score = accuracy_score(test_labels, test_predict)
    print("The accuracy score is %f" % score)
CART (Gini) implementation:
# encoding=utf-8
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

if __name__ == '__main__':
    print("Start read data...")
    time_1 = time.time()
    raw_data = pd.read_csv('../data/train.csv', header=0)
    data = raw_data.values
    features = data[:, 1:]
    labels = data[:, 0]
    # Hold out a random 20% of the data as a test set; the rest is the training set
    train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=0)
    print('Start training...')
    # criterion can be 'gini' or 'entropy'; the default 'gini' corresponds to
    # CART, while 'entropy' uses information gain as in ID3
    clf = DecisionTreeClassifier(criterion='gini')
    clf.fit(train_features, train_labels)
    print('Start predicting...')
    test_predict = clf.predict(test_features)
    score = accuracy_score(test_labels, test_predict)
    print("The accuracy score is %f" % score)