The AdaBoost Ensemble Method
AdaBoost (adaptive boosting): every sample in the training data is assigned a weight, and these weights form the vector D. All weights start out equal. A weak classifier is first trained on the dataset and its error rate is computed; then a second weak classifier is trained on the same dataset, with the weights of the misclassified samples increased and the weights of the correctly classified samples decreased.
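For reference, the update rules that adaBoostTrainDS implements below are the standard AdaBoost ones: given a weak classifier's weighted error rate ε, its voting weight is alpha = 0.5 * ln((1 - ε) / ε); the weight of a correctly classified sample becomes D_i * e^(-alpha) / Sum(D), and the weight of a misclassified sample becomes D_i * e^(alpha) / Sum(D).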
Building a weak classifier from a decision stump: a decision stump makes its decision based on a single feature, i.e., it picks one value along one axis and splits the dataset on it. The value used to split the data is called the threshold, and it is chosen so that the stump's classification error is minimal. AdaBoost has to combine multiple decision stumps to classify the dataset correctly. Building a decision stump:
from numpy import *
# Build a weak classifier from a decision stump

# Load a small example dataset and its labels
def loadSimpData():
    dataMat = matrix([[1, 2.1], [2, 1.1], [1.3, 1], [1, 1], [2, 1]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return dataMat, classLabels

# Load a tab-delimited data file; the number of features is auto-detected and the last column is the class label
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t'))
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        for i in range(numFeat - 1):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
# Classify the data by comparing it against a threshold
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    retArray = ones((shape(dataMatrix)[0], 1))
    # 'lt' means less than: samples at or below the threshold are set to -1.0
    if threshIneq == 'lt':
        retArray[dataMatrix[:, dimen] <= threshVal] = -1.0
    else:
        retArray[dataMatrix[:, dimen] > threshVal] = -1.0
    return retArray
# Find the best decision stump. Idea: the feature, threshold, and inequality with the smallest weighted error define the stump's split; save that stump
def buildStump(dataArr, classLabels, D):
    # Convert the list to a matrix
    dataMatrix = mat(dataArr)
    # Column vector of labels
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    # Initialize the minimum error to positive infinity
    minError = inf
    # Loop over every feature
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # Loop over each step position (from -1 to numSteps, so the threshold can fall outside the feature range)
        for j in range(-1, int(numSteps) + 1):
            # Try both inequalities: 'lt' (less than) and 'gt' (greater than)
            for inequal in ['lt', 'gt']:
                # Threshold
                threshVal = rangeMin + float(j) * stepSize
                # Predicted classes
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0
                # Weighted error rate
                weightedError = D.T * errArr
                # print("split: dim %d, thresh %.2f, thresh ineqal %s, the weighted error %.3f" % (i, threshVal, inequal, weightedError))
                # Keep the stump with the smallest weighted error in the bestStump dict
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
"""
D = mat(ones((5, 1)) / 5)
print(D)
dataMat, labelMat = loadSimpData()
print(buildStump(dataMat, labelMat, D))
"""
Implementing the full AdaBoost algorithm: for a user-chosen number of iterations, find the best decision stump, recompute the sample weights, and update the aggregate class estimate. The loop exits when the training error reaches 0 or the iteration limit is hit. Code:
# Build the AdaBoost classifier from multiple weak classifiers: compute each stump's alpha and update the sample weights
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    # Array of decision stumps
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)
    # Aggregate class estimate for every sample
    aggClassEst = mat(zeros((m, 1)))
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # print("D:", D.T)
        # alpha = 0.5 * ln((1 - error) / error); max(error, 1e-16) prevents division by zero
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # print("classEst:", classEst.T)
        # Update the weights: the exponent is -alpha for correctly classified samples and alpha otherwise
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        # Accumulate the weighted class estimates
        aggClassEst += alpha * classEst
        # print("aggClassEst:", aggClassEst.T)
        # Mark misclassified samples with 1
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T, ones((m, 1)))
        # Training error rate of the combined classifier
        errorRate = aggErrors.sum() / m
        # print("errorRate:", errorRate)
        if errorRate == 0:
            break
    # aggClassEst is returned as well, so plotROC below can use the prediction strengths
    return weakClassArr, aggClassEst
"""
dataMat, labelMat = loadSimpData()
classifierArray = adaBoostTrainDS(dataMat, labelMat, 9)
print(classifierArray)
"""
Testing the algorithm: AdaBoost classifies by taking the weighted sum of each weak classifier's output and its alpha value; the sign of that sum is the final class.
# Test the algorithm: AdaBoost classification. dataToClass holds the samples to classify, classifierArray the trained weak classifiers
def adaClassify(dataToClass, classifierArray):
    dataMatrix = mat(dataToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    for i in range(len(classifierArray)):
        # Class estimate from a single stump
        classEst = stumpClassify(dataMatrix, classifierArray[i]['dim'], classifierArray[i]['thresh'], classifierArray[i]['ineq'])
        aggClassEst += classifierArray[i]['alpha'] * classEst
        # print(aggClassEst)
    return sign(aggClassEst)
"""
dataMat, labelMat = loadSimpData()
classifierArray = adaBoostTrainDS(dataMat, labelMat, 30)
aggClassESt = adaClassify([0, 0], classifierArray)
print(aggClassESt)
"""
Applying AdaBoost to the horse colic dataset: set the class labels to 1 and -1, train the classifier, and observe the error rate.
from Chapter7.boost import adaBoostTrainDS
from Chapter7.boost import adaClassify
# loadDataSet (which auto-detects the number of features and takes the last column as the class label) is reused from boost.py rather than redefined
from Chapter7.boost import loadDataSet
from numpy import *
"""
# 加载训练集,获得单层决策树数组
dataMat, labelMat = loadDataSet('E:\机器学习\machinelearninginaction\Ch05\horseColicTraining.txt')
classfierArray = adaBoostTrainDS(dataMat, labelMat, 10)
# 加载测试集,用于分类
testDataMat, testLabelMat = loadDataSet('E:\机器学习\machinelearninginaction\Ch05\horseColicTest.txt')
predicition = adaClassify(testDataMat, classfierArray)
print(predicition)
# 计算错误率
errArr = mat(ones((67, 1)))
errRate = float(errArr[predicition != mat(testLabelMat).T].sum()) / 67
print(errRate)
"""
Measuring classifier performance: class imbalance arises when the numbers of positive and negative examples in the training data differ greatly. The ROC curve is used to evaluate classifier performance. ROC stands for receiver operating characteristic; the x axis shows the false positive rate and the y axis the true positive rate. Ideally, the best classifier sits as far toward the top-left corner as possible. AUC is the area under the curve: a perfect classifier has an AUC of 1.0, while random guessing yields a diagonal ROC curve with an AUC of 0.5.
To build the ROC curve, the examples are sorted by prediction strength from lowest to highest.
# Plot the ROC curve, used to assess classifiers on imbalanced data, and compute the AUC
def plotROC(predStrengths, classLabels):
    import matplotlib.pyplot as plot
    # Drawing cursor; start from the top-right corner (1.0, 1.0)
    cur = (1.0, 1.0)
    # Accumulated height, used for the AUC
    ySum = 0.0
    # Number of actual positive examples
    numPosClas = sum(array(classLabels) == 1.0)
    # Step sizes along the y and x axes
    yStep = 1.0 / float(numPosClas)
    xStep = 1.0 / float(len(classLabels) - numPosClas)
    # Sort the prediction strengths from smallest to largest
    sortedIndicies = predStrengths.argsort()
    fig = plot.figure()
    fig.clf()
    # One row, one column, first subplot
    ax = plot.subplot(111)
    # index walks the examples in order of increasing prediction strength
    for index in sortedIndicies.tolist()[0]:
        # A positive example moves the cursor one step down the y axis, a negative one along the x axis
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            # Accumulate the height only on x-axis steps, for the AUC
            ySum += cur[1]
        # Draw one segment of the ROC curve
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        # Update the cursor position
        cur = (cur[0] - delX, cur[1] - delY)
    # Dashed diagonal: the ROC curve of random guessing
    ax.plot([0, 1], [0, 1], 'b--')
    plot.xlabel('False Positive Rate')
    plot.ylabel('True Positive Rate')
    plot.title('ROC curve for AdaBoost Horse Colic Detection System')
    # The first two values set the x-axis range, the last two the y-axis range
    ax.axis([0, 1, 0, 1])
    plot.show()
    print('the Area Under the Curve is: ', ySum * xStep)
dataMat, labelMat = loadDataSet(r'E:\机器学习\machinelearninginaction\Ch07\horseColicTraining2.txt')
classifierArray, aggClassEst = adaBoostTrainDS(dataMat, labelMat, 10)
plotROC(aggClassEst.T, labelMat)
Result: the script shows the ROC curve for the AdaBoost horse colic detection system and prints the area under the curve (figure omitted).
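As an optional cross-check (scikit-learn is an extra dependency, not used in the book's code), the AUC accumulated by plotROC can be compared against sklearn's implementation:

from numpy import array
from sklearn.metrics import roc_auc_score

# labelMat and aggClassEst come from the training run above;
# roc_auc_score treats the larger label (+1.0) as the positive class
print('sklearn AUC:', roc_auc_score(labelMat, array(aggClassEst).ravel()))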
Other classification performance metrics: precision (the fraction of samples predicted positive that are truly positive), recall (the fraction of true positives that are predicted positive), cost-sensitive decision control (choose the classifier with the minimum cost or minimum expected cost), and adjusting the numbers of positive and negative examples by oversampling and undersampling: when positives are scarce, oversample by duplicating existing positive examples, adding points similar to them, or adding interpolated points; when negatives are plentiful, undersample by deleting examples far from the decision boundary.
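As a minimal sketch (this helper is not part of the book's code; it assumes the +1/-1 label encoding used above), precision and recall can be computed from the predictions returned by adaClassify:

from numpy import mat, multiply

def precisionRecall(prediction, classLabels):
    # prediction is an m x 1 matrix of +1/-1 values, classLabels a list of +1/-1 labels
    labelMat = mat(classLabels).T
    TP = multiply(prediction == 1, labelMat == 1).sum()   # predicted positive, truly positive
    FP = multiply(prediction == 1, labelMat == -1).sum()  # predicted positive, truly negative
    FN = multiply(prediction == -1, labelMat == 1).sum()  # predicted negative, truly positive
    # max(..., 1) guards against division by zero when a class is never predicted
    precision = float(TP) / max(TP + FP, 1)
    recall = float(TP) / max(TP + FN, 1)
    return precision, recall

For example, on the horse colic test set above, precisionRecall(prediction, testLabelMat) returns the two rates.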