Ensemble

August 10, 2015

In previous posts we've looked at single classifiers in action. In this post let's assess an ensemble of classifiers combined by voting.
Logistic regression comes first.

In [1]:
%matplotlib inline

import numpy as np

import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
from sklearn.cross_validation import train_test_split

digits = datasets.load_digits()

# split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split( digits.data, 
                                                    digits.target,
                                                    test_size=0.33 )
Cs = np.logspace(-4., 4., 20)

logreg_cv = linear_model.LogisticRegressionCV( Cs=Cs )
logreg_cv.fit( X_train, y_train )

print( 'Logistic Regression: C:{0}, train score:{1}'.format( np.average( logreg_cv.C_ ),
                            logreg_cv.score( X_train, y_train ) ) )
Logistic Regression: C:3.05929854461, train score:0.991687448047
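
Note that with the default one-vs-rest scheme LogisticRegressionCV selects one value of C per class, which is why C_ is averaged in the print above. As a quick sanity check (a minimal sketch, not part of the original run; the number will vary with the random split), the model can also be scored on the held-out data:

# sketch: score the cross-validated model on the held-out split
print( 'Logistic Regression test score:{0}'.format(
        logreg_cv.score( X_test, y_test ) ) )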

Now we will set up a KNN classifier on the training set.

In [2]:
from sklearn import datasets, neighbors

# use default number of neighbors (5)
nbor = neighbors.KNeighborsClassifier()

# fit the predictor
nbor.fit( X_train, y_train )

print( 'KNN train score:{0}'.format( nbor.score( X_train, y_train ) ) )
KNN train score:0.995012468828
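
The default of five neighbours happens to work well here, but k is worth tuning. A minimal sketch of a small grid search over n_neighbors (using the 0.17-era grid_search module; the best k will depend on the split):

from sklearn.grid_search import GridSearchCV

# sketch: search a small range of k values with 5-fold cross-validation
param_grid = { 'n_neighbors': [1, 3, 5, 7, 9] }
grid = GridSearchCV( neighbors.KNeighborsClassifier(), param_grid, cv=5 )
grid.fit( X_train, y_train )
print( grid.best_params_, grid.best_score_ )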

The last one will be an SVM classifier with a linear kernel. This time we will deliberately use a "bad" (overly strong) value for the regularization parameter C.

In [3]:
from sklearn import svm

# deliberately poor choice: a small C means strong regularization,
# which with an L1 penalty drives many coefficients to zero
clf_svm = svm.LinearSVC( penalty='l1', loss='squared_hinge',
                         dual=False, tol=1e-3, C=1e-3 )
clf_svm.fit( X_train, y_train )

print( 'Lin SVC: train score:{0}'.format( clf_svm.score( X_train, y_train ) ) )
Lin SVC: train score:0.896924355777
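
To see how much that small C hurts, here is a quick sweep over C (a sketch; exact scores depend on the split). A small C means strong regularization, and with the L1 penalty it drives most pixel weights to zero:

# sketch: sweep C to show how strongly C=1e-3 regularizes the model
for C in [1e-3, 1e-2, 1e-1, 1.0]:
    s = svm.LinearSVC( penalty='l1', loss='squared_hinge',
                       dual=False, tol=1e-3, C=C ).fit( X_train, y_train )
    print( 'C={0}: train score {1:.4f}'.format( C, s.score( X_train, y_train ) ) )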

Let's define a voting classifier. The code is copied from the sklearn 0.17.0 source; credit goes to Sebastian Raschka @rasbt.

Comments and docstrings are removed for readability :)

In [4]:
import operator

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators

class VotingClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, clfs, voting='hard', weights=None):
        self.clfs = clfs
        self.named_clfs = {key:value for key,value in _name_estimators(clfs)}
        self.voting = voting
        self.weights = weights


    def fit(self, X, y):
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'\
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.weights and len(self.weights) != len(self.clfs):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d clfs'
                             % (len(self.weights), len(self.clfs)))

        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        self.clfs_ = []
        for clf in self.clfs:
            fitted_clf = clone(clf).fit(X, self.le_.transform(y))
            self.clfs_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.voting == 'soft':

            maj = np.argmax(self.predict_proba(X), axis=1)

        else:  # 'hard' voting
            predictions = self._predict(X)

            maj = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)

        maj = self.le_.inverse_transform(maj)
        return maj

    def predict_proba(self, X):
        avg = np.average(self._predict_probas(X), axis=0, weights=self.weights)
        return avg

    def transform(self, X):
        if self.voting == 'soft':
            return self._predict_probas(X)
        else:
            return self._predict(X)

    def get_params(self, deep=True):
        if not deep:
            return super(VotingClassifier, self).get_params(deep=False)
        else:
            out = self.named_clfs.copy()
            for name, step in six.iteritems(self.named_clfs):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out

    def _predict(self, X):
        # collect votes from the fitted clones created in fit()
        return np.asarray([clf.predict(X) for clf in self.clfs_]).T

    def _predict_probas(self, X):
        # stack each fitted clone's class-probability estimates
        return np.asarray([clf.predict_proba(X) for clf in self.clfs_])
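
The heart of hard voting is the np.bincount call in predict: each row of the predictions array holds one label-encoded vote per classifier, and the weighted bin counts pick the winning class. Here is that mechanic in isolation (a standalone sketch with made-up votes):

# sketch: the weighted hard-vote mechanic from predict(), in isolation.
# Each row is one sample; each column is one classifier's encoded vote.
votes = np.array([[0, 0, 1],
                  [1, 2, 1],
                  [0, 1, 2]])
weights = [1, 1, 2]  # the third classifier's vote counts double
maj = np.apply_along_axis(
    lambda x: np.argmax( np.bincount( x, weights=weights ) ),
    axis=1, arr=votes )
print( maj )  # -> [0 1 2]: the last row's tie is broken by the heavier third vote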

Let's put all of this together using the VotingClassifier.

In [5]:
estimators = ( logreg_cv, nbor, clf_svm )
vclf = VotingClassifier( clfs=estimators,
                        voting='hard',
                        weights=[1, 1, 1])

vclf.fit( X_train, y_train )

for e in _name_estimators( estimators ):
    print( '{0} score on test data: {1}'.format( e[0], e[1].score( X_test, y_test ) ) )

# final test
print( '\n' )
print( 'hard voting: Test score:{0}'.format( vclf.score( X_test, y_test ) ) )
logisticregressioncv score on test data: 0.939393939394
kneighborsclassifier score on test data: 0.974747474747
linearsvc score on test data: 0.877104377104


hard voting: Test score:0.951178451178
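
The equal-weight hard vote lands between its members: better than logistic regression and the weak SVM, though below KNN alone. Since KNN is the strongest member here, a natural follow-up (a sketch; the effect on the final score depends on the split) is to give its vote more weight. Note that voting='soft' would not work with this trio, because LinearSVC does not implement predict_proba.

# sketch: re-weight the hard vote towards the strongest member (KNN)
vclf_w = VotingClassifier( clfs=estimators,
                           voting='hard',
                           weights=[1, 2, 1] )
vclf_w.fit( X_train, y_train )
print( 'weighted hard voting: Test score:{0}'.format(
        vclf_w.score( X_test, y_test ) ) )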