stack模型融合
程序员文章站
2022-07-12 12:08:25
...
#coding:utf-8
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from random import shuffle
import copy
import random
random.seed(42)
class Essemble(object):
    """Two-level stacking ensemble for a binary classification task.

    Each base classifier is turned into one meta-feature column: its
    out-of-fold predictions on the training rows (``trainDf``) and its
    fold-averaged predictions on the test rows (``testDf``).  A logistic
    regression is then trained on those columns (``LRstack``).
    """

    def __init__(self):
        # One column per base model, filled in by package().
        self.trainDf = pd.DataFrame()
        self.testDf = pd.DataFrame()

    @staticmethod
    def _loadData(path):
        """Read a CSV and return ``(attributes, labels)`` with -1 recoded as 0."""
        frame = pd.read_csv(path)
        labels = frame["label"].copy()
        labels.loc[labels == -1] = 0
        # NOTE(review): every column except the last one is used as a feature,
        # which assumes "label" is the last column of the CSV -- confirm.
        attrs = frame.iloc[:, :-1]
        return attrs, labels

    def loadTrainData(self, path="E:\\horseColicTraining2.csv"):
        """Load the training CSV into ``attr_train`` / ``label_train``.

        Args:
            path: file path or file-like object accepted by
                ``pandas.read_csv``; defaults to the original location.
        """
        self.attr_train, self.label_train = self._loadData(path)

    def loadTestData(self, path="E:\\horseColicTest2.csv"):
        """Load the test CSV into ``attr_test`` / ``label_test``.

        Args:
            path: file path or file-like object accepted by
                ``pandas.read_csv``; defaults to the original location.
        """
        self.attr_test, self.label_test = self._loadData(path)

    def Kfold(self, model, folds=5):
        """Produce out-of-fold predictions for ``model``.

        The training rows are shuffled and split into exactly ``folds``
        parts whose sizes differ by at most one.  For every part the model
        is refitted on the remaining rows and used to predict both the
        held-out part and the whole external test set.

        Args:
            model: object exposing ``fit(X, y)`` and ``predict(X)``.
            folds: number of folds, at least 2.

        Returns:
            ``(predResult, rtestByModel)``: a list with one held-out
            prediction per training row (in original row order), and a list
            with the per-row mean of the test-set predictions over all folds.

        Raises:
            ValueError: if ``folds`` is below 2 or there are fewer than two
                training rows (a fold would leave nothing to train on).
        """
        n_samples = self.attr_train.shape[0]
        if folds < 2:
            raise ValueError("folds must be at least 2")
        if n_samples < 2:
            raise ValueError("need at least 2 training rows for K-fold")

        # list(...) so the indices can be shuffled in place on Python 3 too.
        order = list(range(n_samples))
        shuffle(order)

        predResult = [-1] * n_samples  # held-out prediction per training row
        rtestLst = []                  # test-set predictions, one entry per fold

        # Spread the remainder over the first `extra` folds so that exactly
        # `folds` folds are produced.
        base, extra = divmod(n_samples, folds)
        start = 0
        for fold in range(folds):
            stop = start + base + (1 if fold < extra else 0)
            testIndex = order[start:stop]                # held-out rows
            trainIndex = order[:start] + order[stop:]    # rows used for fitting
            start = stop
            if not testIndex:
                # Only happens when folds > n_samples.
                continue

            # Positional indexing: `order` holds row positions, not labels.
            attr_train_sample = self.attr_train.iloc[trainIndex, :]
            label_train_sample = self.label_train.iloc[trainIndex]
            attr_test_sample = self.attr_train.iloc[testIndex, :]

            model.fit(attr_train_sample, label_train_sample)
            testpred = model.predict(attr_test_sample)
            # Prediction for the real external test set by this fold's model.
            rtestLst.append(model.predict(self.attr_test))

            for row, value in zip(testIndex, testpred):
                predResult[row] = value

        # Mean over folds of the test-set predictions.
        rtestByModel = np.asarray(rtestLst).mean(axis=0).tolist()
        return predResult, rtestByModel

    def package(self, model, key):
        """Store the meta-features of ``model`` under column ``key``.

        Also refits the model on the full training set and prints its
        stand-alone score via ``singleModel``.
        """
        predResult, rtestByModel = self.Kfold(model)
        self.trainDf[key] = pd.Series(predResult)
        self.testDf[key] = pd.Series(rtestByModel)
        self.singleModel(model, key)

    def singleModel(self, model, key):
        """Fit ``model`` on all training data, print and return its test AUC.

        NOTE: the score is computed from the hard class labels returned by
        ``predict``, not from probabilities or decision scores.
        """
        model.fit(self.attr_train, self.label_train)
        pred = model.predict(self.attr_test)
        auc = roc_auc_score(self.label_test, pred)
        # Parenthesised single-argument form prints identically on Python 2/3.
        print("single model of " + str(key) + ":" + str(auc))
        return auc

    def console(self):
        """Load both CSV files and build meta-features for every base model."""
        self.loadTrainData()
        self.loadTestData()
        base_models = [
            ("ada", AdaBoostClassifier(n_estimators=500)),
            ("dt_entropy", DecisionTreeClassifier(criterion="entropy")),
            ("dt_gini", DecisionTreeClassifier(criterion="gini")),
            ("gb", GradientBoostingClassifier(n_estimators=500)),
            ("rf", RandomForestClassifier(n_estimators=500)),
            ("svc", SVC(kernel="rbf")),
        ]
        for key, model in base_models:
            self.package(model, key)

    def LRstack(self):
        """Fit the logistic-regression stacker; print and return its test AUC."""
        logist = LogisticRegression()
        logist.fit(self.trainDf, self.label_train)
        pred = logist.predict(self.testDf)
        auc = roc_auc_score(self.label_test, pred)
        print(auc)
        return auc
###############################
# Script driver, executed at module level: console() loads both CSV files and
# builds one meta-feature column per base model; LRstack() then fits the
# logistic-regression stacker on those columns and prints its test AUC.
Eobject = Essemble()
Eobject.console()
Eobject.LRstack()
数据来自《机器学习实战》一书中的马疝病(horse colic)数据集;Stacking 模型融合的原理可参考知乎上的简介:https://zhuanlan.zhihu.com/p/25836678
上一篇: Python笔记——模块
推荐阅读
-
iOS中的应用启动原理以及嵌套模型开发示例详解
-
Java内存模型以及happens-before规则
-
Stack Overflow上59万浏览量的提问:为什么会发生ArrayIndexOutOfBoundsException?
-
C++高级编程之对象模型、const、new和delete及其重载讲解
-
模型自喷漆十大品牌,高达模型自喷漆品牌排名推荐
-
第14届高交会 云计算三网融合成IT展热点
-
C4D怎么创建圆凳模型? C4D凳子的制作方法
-
UG12.0怎么创建三维立体的卡管零件模型?
-
CAD中将图形从布局转换到模型中的方法
-
solidworks曲面模型上怎么刻字?solidworks曲面刻字的教程