Copyright (c) 2015, 2016 [Sebastian Raschka](sebastianraschka.com)

https://github.com/rasbt/python-machine-learning-book

[MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)

# Python Machine Learning - Code Examples

# Méthodes d'Ensemble : Combinaison de classifieurs

In [None]:
from IPython.display import Image
%matplotlib inline

In [None]:
# Version check: scikit-learn 0.18 moved some helpers to sklearn.model_selection
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

# Pourquoi les méthodes d'ensemble fonctionnent-elles ?
## Simulation des performances dans des conditions idéales (décorrélation des erreurs des classifieurs)

In [None]:
from scipy.misc import comb
import math

def ensemble_error(n_classifier, error):
    """Return the probability that a majority-vote ensemble is wrong.

    Sums the binomial probability of exactly ``k`` wrong base classifiers
    for every ``k`` from ``ceil(n_classifier / 2)`` up to ``n_classifier``.

    Parameters
    ----------
    n_classifier : int
        Number of base classifiers in the ensemble.
    error : float
        Error rate of each individual base classifier.

    Returns
    -------
    float
        Summed probability that at least ``ceil(n_classifier / 2)``
        base classifiers are wrong.
    """
    # math.ceil returns a float on Python 2, which range() rejects;
    # cast explicitly so the function works on Python 2 and 3 alike.
    k_start = int(math.ceil(n_classifier / 2.0))
    probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

In [None]:
from scipy.misc import comb
import math

def ensemble_error(n_classifier, error):
    """Compute the error rate of a majority vote over base classifiers.

    Adds up the binomial probabilities of ``wrong`` failing classifiers,
    for ``wrong`` ranging from ``ceil(n_classifier / 2)`` to
    ``n_classifier``, where each classifier fails with probability
    ``error``.
    """
    majority = int(math.ceil(n_classifier / 2.0))
    total = 0
    for wrong in range(majority, n_classifier + 1):
        right = n_classifier - wrong
        total += comb(n_classifier, wrong) * error**wrong * (1 - error)**right
    return total

In [None]:
# Ensemble error for 1, 10 and 100 base classifiers, each with a 25% error
# rate. print() is called as a function so the cell runs on Python 2 and 3
# and matches the print(...) calls used in the rest of the file.
print(ensemble_error(n_classifier=1, error=0.25))
print(ensemble_error(n_classifier=10, error=0.25))
print(ensemble_error(n_classifier=100, error=0.25))


In [None]:
import numpy as np

# Base error rates from 0.0 to 1.0 in steps of 0.01, and the matching
# error of an 11-classifier ensemble for each of them.
error_range = np.arange(0.0, 1.01, 0.01)
ens_errors = [
    ensemble_error(n_classifier=11, error=error) for error in error_range
]

In [None]:
import matplotlib.pyplot as plt

# Ensemble error as a function of the base error; the dashed line plots the
# base error against itself for comparison.
curves = [
    (ens_errors, {'label': 'Ensemble error'}),
    (error_range, {'linestyle': '--', 'label': 'Base error'}),
]
for y_values, line_options in curves:
    plt.plot(error_range, y_values, linewidth=2, **line_options)

plt.xlabel('Base error')
plt.ylabel('Base/Ensemble error')
plt.legend(loc='upper left')
plt.grid()
plt.tight_layout()
# To save the figure, uncomment:
# plt.savefig('./figures/ensemble_err.png', dpi=300)
plt.show()

# Une méthode simple de vote majoritaire 

In [None]:
import numpy as np


## Implémentation de la classe Vote Majoritaire

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator


class MajorityVoteClassifier(BaseEstimator,
                             ClassifierMixin):
    """A majority vote ensemble classifier.

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        Different classifiers for the ensemble.

    vote : str, {'classlabel', 'probability'} (default='classlabel')
        If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).

    weights : array-like, shape = [n_classifiers], optional (default=None)
        If a list of `int` or `float` values are provided, the classifiers
        are weighted by importance; Uses uniform weights if `weights=None`.

    """
    def __init__(self, classifiers, vote='classlabel', weights=None):

        self.classifiers = classifiers
        # Map auto-generated names to the classifiers; used by get_params
        # to expose nested "<name>__<param>" entries.
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """Fit a clone of every classifier on the label-encoded targets.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If `vote` is not 'probability' or 'classlabel', or if `weights`
            is given and its length differs from the number of classifiers.

        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        # Compare against None explicitly: truth-testing a NumPy array with
        # more than one element raises, and an empty list would otherwise
        # skip the length check silently.
        if (self.weights is not None
                and len(self.weights) != len(self.classifiers)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        # Encode the targets once instead of once per classifier.
        encoded_y = self.lablenc_.transform(y)
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, encoded_y)
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of samples to classify.

        Returns
        -------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.

        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            # Collect results from clf.predict calls; after the transpose
            # each row holds the votes of all classifiers for one sample.
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            # (Weighted) count of votes per encoded label, row by row.
            maj_vote = np.apply_along_axis(
                lambda x:
                np.argmax(np.bincount(x,
                                      weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map the encoded winners back to the original class labels.
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Input vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.

        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """Get classifier parameter names for GridSearch.

        With ``deep=False`` the constructor parameters are returned. With
        ``deep=True`` the result maps each classifier name to its estimator
        and each "<name>__<param>" key to that estimator's parameter value.
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)
        else:
            out = self.named_classifiers.copy()
            # Plain dict.items() works on Python 2 and 3 without six.
            for name, step in self.named_classifiers.items():
                for key, value in step.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out

## Combinaison de différents algorithmes par vote majoritaire


In [None]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
if Version(sklearn_version) < '0.18':
 from sklearn.cross_validation import train_test_split
else:
 from sklearn.model_selection import train_test_split

# Keep the iris rows from index 50 onwards and feature columns 1 and 2,
# re-encode the remaining class labels, then hold out half of the samples
# for testing.
iris = datasets.load_iris()
X = iris.data[50:, [1, 2]]
y = iris.target[50:]

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1)


### Performances des modèles isolés

In [None]:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import Pipeline
if Version(sklearn_version) < '0.18':
 from sklearn.cross_validation import cross_val_score
else:
 from sklearn.model_selection import cross_val_score

# The three base classifiers that are compared individually.
clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=0)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy',
                              random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and KNN are preceded by a StandardScaler step;
# the decision tree is used as is.
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(
        estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    summary = (scores.mean(), scores.std(), label)
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % summary)

### Performance de la combinaison

In [None]:
# Majority Rule (hard) Voting: combine the three base models and
# cross-validate the ensemble next to its members.
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])

clf_labels.append('Majority Voting')
all_clf = [pipe1, clf2, pipe3, mv_clf]

for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(
        estimator=clf, X=X_train, y=y_train, cv=10, scoring='roc_auc')
    summary = (scores.mean(), scores.std(), label)
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % summary)

# Evaluation de l'ensemble de classifieurs 

In [None]:
# The Area Under the ROC Curve (AUC) is a very popular performance metric

from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# One ROC curve per classifier, each fitted on the training split and
# scored on the test split.
colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']
for clf, label, clr, ls in zip(all_clf, clf_labels, colors, linestyles):
    # assuming the label of the positive class is 1
    y_pred = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
    roc_auc = auc(x=fpr, y=tpr)
    legend_entry = '%s (auc = %0.2f)' % (label, roc_auc)
    plt.plot(fpr, tpr, color=clr, linestyle=ls, label=legend_entry)

plt.legend(loc='lower right')
# Gray dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)

plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# Optional layout tweak / figure export:
# plt.tight_layout()
# plt.savefig('./figures/roc.png', dpi=300)
plt.show()

In [None]:
# Display the ensemble's parameters: each named member classifier plus its
# nested "<name>__<param>" entries (deep=True is the default).
mv_clf.get_params()

## Piste d'amélioration 

# Combinaison par bagging

## Sur les données des iris


In [None]:
from sklearn.datasets import load_digits, load_iris

#digits = load_digits()

# Load the full iris dataset and show the shapes of its data and target
# arrays. print() is called as a function so the cell runs on Python 2
# and 3, like the other print(...) calls in this file.
dataset = load_iris()
print(dataset.data.shape)
print(dataset.target.shape)

X, y = dataset.data, dataset.target

In [None]:
from sklearn.preprocessing import LabelEncoder
if Version(sklearn_version) < '0.18':
 from sklearn.cross_validation import train_test_split
else:
 from sklearn.model_selection import train_test_split

# Re-encode the class labels, then keep 40% of the samples for testing.
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=1)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Depth-1 decision tree used as the base learner.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1,
                              random_state=1)

# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot works with both old and new versions.
bag = BaggingClassifier(tree,
                        n_estimators=500,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

In [None]:
from sklearn.metrics import accuracy_score

# Single decision tree: fit, then score on the train and test splits.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
tree_report = 'Decision tree train/test accuracies %.3f/%.3f'
print(tree_report % (tree_train, tree_test))

# Bagging ensemble: same procedure on the same splits.
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
bag_report = 'Bagging train/test accuracies %.3f/%.3f'
print(bag_report % (bag_train, bag_test))

# Combinaison par Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Depth-1 decision tree used as the weak learner.
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=1,
                              random_state=0)

# The weak learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot works with both old and new versions.
ada = AdaBoostClassifier(tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=0)

In [None]:
# Single decision tree: fit, then score on the train and test splits.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
tree_report = 'Decision tree train/test accuracies %.3f/%.3f'
print(tree_report % (tree_train, tree_test))

# AdaBoost ensemble: same procedure on the same splits.
ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
ada_report = 'AdaBoost train/test accuracies %.3f/%.3f'
print(ada_report % (ada_train, ada_test))

# Reproduire les expériences ci-dessus et comparer les performances des approches de boosting et de bagging sur le jeu de données digits