欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页

stack模型融合

程序员文章站 2022-07-12 12:08:25
...
#coding:utf-8
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from random import shuffle
import copy
import random
random.seed(42)

class Essemble(object):
    """Two-level stacking ensemble for a binary classification data set.

    Level 1: each base model produces out-of-fold predictions for the
    training rows and fold-averaged predictions for the test rows (see
    ``Kfold``).  These are collected, one column per model, in
    ``trainDf`` / ``testDf``.

    Level 2: ``LRstack`` fits a logistic regression on ``trainDf`` and
    scores it on ``testDf``.

    The code is written to run unchanged on Python 2 and Python 3.
    """

    def __init__(self):
        # Stacking feature tables: one column per base model.
        self.trainDf = pd.DataFrame()
        self.testDf = pd.DataFrame()

    @staticmethod
    def _loadData(path):
        """Read a CSV file and return ``(attributes, labels)``.

        The ``label`` column is the target; labels equal to -1 are mapped
        to 0.  Every other column is treated as a feature.
        """
        df = pd.read_csv(path)
        # replace() returns a new Series, so the frame itself is never
        # modified through a possibly-copied column selection.
        labels = df["label"].replace(-1, 0)
        # Drop the target by name so it can never leak into the features,
        # wherever the column happens to sit in the file.
        attrs = df.drop("label", axis=1)
        return attrs, labels

    def loadTrainData(self, path="E:\\horseColicTraining2.csv"):
        """Load the training set into ``attr_train`` / ``label_train``.

        Args:
            path: CSV file containing a ``label`` column.
        """
        self.attr_train, self.label_train = self._loadData(path)

    def loadTestData(self, path="E:\\horseColicTest2.csv"):
        """Load the test set into ``attr_test`` / ``label_test``.

        Args:
            path: CSV file containing a ``label`` column.
        """
        self.attr_test, self.label_test = self._loadData(path)

    def Kfold(self, model, folds=5):
        """Produce stacking features for one base model via K-fold CV.

        The training rows are shuffled and split into ``folds`` contiguous
        chunks whose sizes differ by at most one.  For every chunk the
        model is refitted on the remaining rows and used to predict both
        the held-out chunk and the whole external test set.

        Args:
            model: object exposing ``fit(X, y)`` and ``predict(X)``.
            folds: requested number of folds; capped at the number of
                training rows.

        Returns:
            A pair ``(predResult, rtestByModel)``: ``predResult`` holds the
            out-of-fold prediction for each training row (in row order),
            ``rtestByModel`` the per-row mean of the test-set predictions
            over all folds.  Both are empty lists when there are no
            training rows.

        Raises:
            ValueError: if ``folds`` is smaller than 1.
        """
        if folds < 1:
            raise ValueError("folds must be at least 1, got %r" % (folds,))
        n_samples = self.attr_train.shape[0]
        if n_samples == 0:
            return [], []

        # Materialise the range: random.shuffle needs a mutable sequence.
        sampleIndex = list(range(n_samples))
        shuffle(sampleIndex)

        n_folds = min(folds, n_samples)
        # The first `extra` folds receive one additional row.
        base, extra = divmod(n_samples, n_folds)

        predResult = [-1] * n_samples      # out-of-fold prediction per row
        rtestLst = []                      # test-set predictions, one per fold
        start = 0
        for k in range(n_folds):
            stop = start + base + (1 if k < extra else 0)
            testIndex = sampleIndex[start:stop]                   # held-out rows
            trainIndex = sampleIndex[:start] + sampleIndex[stop:]  # fitting rows
            start = stop

            # The indices are row positions, hence positional .iloc.
            attr_train_sample = self.attr_train.iloc[trainIndex, :]
            label_train_sample = self.label_train.iloc[trainIndex]
            attr_test_sample = self.attr_train.iloc[testIndex, :]

            model.fit(attr_train_sample, label_train_sample)
            testpred = model.predict(attr_test_sample)
            # Prediction for the real external test set from this fold's fit.
            rtestLst.append(model.predict(self.attr_test))
            for row, pred in zip(testIndex, testpred):
                predResult[row] = pred

        # Mean over folds of the external test-set predictions.
        rtestByModel = np.asarray(rtestLst).mean(axis=0).tolist()
        return predResult, rtestByModel

    def package(self, model, key):
        """Store the stacking features of ``model`` under column ``key``.

        Also prints the score of the model when used on its own (see
        ``singleModel``), which refits ``model`` on the full training set.
        """
        predResult, rtestByModel = self.Kfold(model)
        self.trainDf[key] = pd.Series(predResult)
        self.testDf[key] = pd.Series(rtestByModel)
        self.singleModel(model, key)

    def singleModel(self, model, key):
        """Fit ``model`` on the full training set and print its test AUC.

        Note that the AUC is computed from the hard class predictions
        returned by ``predict``, not from scores or probabilities.
        """
        model.fit(self.attr_train, self.label_train)
        pred = model.predict(self.attr_test)
        auc = roc_auc_score(self.label_test, pred)
        print("single model of " + str(key) + ":" + str(auc))

    def console(self):
        """Load both data sets and run every base model through ``package``."""
        self.loadTrainData()
        self.loadTestData()

        ada = AdaBoostClassifier(n_estimators=500)
        self.package(ada, "ada")

        dt_entropy = DecisionTreeClassifier(criterion="entropy")
        self.package(dt_entropy, "dt_entropy")

        dt_gini = DecisionTreeClassifier(criterion="gini")
        self.package(dt_gini, "dt_gini")

        gb = GradientBoostingClassifier(n_estimators=500)
        self.package(gb, "gb")

        rf = RandomForestClassifier(n_estimators=500)
        self.package(rf, "rf")

        svc = SVC(kernel="rbf")
        self.package(svc, "svc")

    def LRstack(self):
        """Fit the logistic-regression meta-model and print its test AUC.

        Must be called after ``console`` (or equivalent ``package`` calls),
        which fills ``trainDf`` / ``testDf`` and loads the labels.
        """
        logist = LogisticRegression()
        logist.fit(self.trainDf, self.label_train)
        pred = logist.predict(self.testDf)
        auc = roc_auc_score(self.label_test, pred)
        print(auc)

###############################
# Script driver: build the ensemble, evaluate every base model (which also
# fills the stacking feature tables), then fit and score the
# logistic-regression meta-model on those features.
Eobject = Essemble()
Eobject.console()    # must run first: LRstack() reads the tables and labels it populates
Eobject.LRstack()

数据来自《机器学习实战》一书中的马疝病(horse colic)数据集;stacking 的原理简介可参考知乎专栏:https://zhuanlan.zhihu.com/p/25836678

相关标签: 模型融合