{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Copyright (c) 2015, 2016 [Sebastian Raschka](sebastianraschka.com)\n", "\n", "https://github.com/rasbt/python-machine-learning-book\n", "\n", "[MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Python Machine Learning - Code Examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Méthodes d'Ensemble : Combinaison de classifieurs" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from IPython.display import Image\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Added version check for recent scikit-learn 0.18 checks\n", "from distutils.version import LooseVersion as Version\n", "from sklearn import __version__ as sklearn_version" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pourquoi les méthodes d'ensemble marchent ?\n", "## Simulation de performances dans des conditions idéales (décorrélation des erreurs classifieurs) " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from scipy.misc import comb\n", "import math\n", "\n", "def ensemble_error(n_classifier, error):\n", " k_start = math.ceil(n_classifier / 2.0)\n", " probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)\n", " for k in range(k_start, n_classifier + 1)]\n", " return sum(probs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from scipy.misc import comb\n", "import math\n", "\n", "def ensemble_error(n_classifier, error):\n", " k_start = int(math.ceil(n_classifier / 2.0))\n", " probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)\n", " for k in range(k_start, n_classifier + 1)]\n", " return sum(probs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print ensemble_error(n_classifier=1, error=0.25)\n", "print ensemble_error(n_classifier=10, error=0.25)\n", "print ensemble_error(n_classifier=100, error=0.25)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "\n", "error_range = np.arange(0.0, 1.01, 0.01)\n", "ens_errors = [ensemble_error(n_classifier=11, error=error)\n", " for error in error_range]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.plot(error_range, \n", " ens_errors, \n", " label='Ensemble error', \n", " linewidth=2)\n", "\n", "plt.plot(error_range, \n", " error_range, \n", " linestyle='--',\n", " label='Base error',\n", " linewidth=2)\n", "\n", "plt.xlabel('Base error')\n", "plt.ylabel('Base/Ensemble error')\n", "plt.legend(loc='upper left')\n", "plt.grid()\n", "plt.tight_layout()\n", "# plt.savefig('./figures/ensemble_err.png', dpi=300)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Une méthode simple de vote majoritaire " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Implémentation de la classe Vote Majoritaire" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", "from sklearn.base import ClassifierMixin\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.externals import six\n", "from sklearn.base import clone\n", "from sklearn.pipeline import _name_estimators\n", "import numpy as np\n", "import operator\n", "\n", "\n", "class MajorityVoteClassifier(BaseEstimator, \n", " ClassifierMixin):\n", " \"\"\" A majority vote ensemble classifier\n", "\n", " Parameters\n", " ----------\n", " classifiers : array-like, shape = [n_classifiers]\n", " Different classifiers for the ensemble\n", "\n", " vote : str, {'classlabel', 'probability'} (default='label')\n", " If 'classlabel' the prediction is based on the argmax of\n", " class labels. Else if 'probability', the argmax of\n", " the sum of probabilities is used to predict the class label\n", " (recommended for calibrated classifiers).\n", "\n", " weights : array-like, shape = [n_classifiers], optional (default=None)\n", " If a list of `int` or `float` values are provided, the classifiers\n", " are weighted by importance; Uses uniform weights if `weights=None`.\n", "\n", " \"\"\"\n", " def __init__(self, classifiers, vote='classlabel', weights=None):\n", "\n", " self.classifiers = classifiers\n", " self.named_classifiers = {key: value for key, value\n", " in _name_estimators(classifiers)}\n", " self.vote = vote\n", " self.weights = weights\n", "\n", " def fit(self, X, y):\n", " \"\"\" Fit classifiers.\n", "\n", " Parameters\n", " ----------\n", " X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n", " Matrix of training samples.\n", "\n", " y : array-like, shape = [n_samples]\n", " Vector of target class labels.\n", "\n", " Returns\n", " -------\n", " self : object\n", "\n", " \"\"\"\n", " if self.vote not in ('probability', 'classlabel'):\n", " raise ValueError(\"vote must be 'probability' or 'classlabel'\"\n", " \"; got (vote=%r)\"\n", " % self.vote)\n", "\n", " if self.weights and len(self.weights) != len(self.classifiers):\n", " raise ValueError('Number of classifiers and weights must be equal'\n", " '; got %d weights, %d classifiers'\n", " % (len(self.weights), len(self.classifiers)))\n", "\n", " # Use LabelEncoder to ensure class labels start with 0, which\n", " # is important for np.argmax call in self.predict\n", " self.lablenc_ = LabelEncoder()\n", " self.lablenc_.fit(y)\n", " self.classes_ = self.lablenc_.classes_\n", " self.classifiers_ = []\n", " for clf in self.classifiers:\n", " fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))\n", " self.classifiers_.append(fitted_clf)\n", " return self\n", "\n", " def predict(self, X):\n", " \"\"\" Predict class labels for X.\n", "\n", " Parameters\n", " ----------\n", " X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n", " Matrix of training samples.\n", "\n", " Returns\n", " ----------\n", " maj_vote : array-like, shape = [n_samples]\n", " Predicted class labels.\n", " \n", " \"\"\"\n", " if self.vote == 'probability':\n", " maj_vote = np.argmax(self.predict_proba(X), axis=1)\n", " else: # 'classlabel' vote\n", "\n", " # Collect results from clf.predict calls\n", " predictions = np.asarray([clf.predict(X)\n", " for clf in self.classifiers_]).T\n", "\n", " maj_vote = np.apply_along_axis(\n", " lambda x:\n", " np.argmax(np.bincount(x,\n", " weights=self.weights)),\n", " axis=1,\n", " arr=predictions)\n", " maj_vote = self.lablenc_.inverse_transform(maj_vote)\n", " return maj_vote\n", "\n", " def predict_proba(self, X):\n", " \"\"\" Predict class probabilities for X.\n", "\n", " Parameters\n", " ----------\n", " X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n", " Training vectors, where n_samples is the number of samples and\n", " n_features is the number of features.\n", "\n", " Returns\n", " ----------\n", " avg_proba : array-like, shape = [n_samples, n_classes]\n", " Weighted average probability for each class per sample.\n", "\n", " \"\"\"\n", " probas = np.asarray([clf.predict_proba(X)\n", " for clf in self.classifiers_])\n", " avg_proba = np.average(probas, axis=0, weights=self.weights)\n", " return avg_proba\n", "\n", " def get_params(self, deep=True):\n", " \"\"\" Get classifier parameter names for GridSearch\"\"\"\n", " if not deep:\n", " return super(MajorityVoteClassifier, self).get_params(deep=False)\n", " else:\n", " out = self.named_classifiers.copy()\n", " for name, step in six.iteritems(self.named_classifiers):\n", " for key, value in six.iteritems(step.get_params(deep=True)):\n", " out['%s__%s' % (name, key)] = value\n", " return out" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Combinaison de différents algorithmes par vote majoritaire\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn import datasets\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import LabelEncoder\n", "if Version(sklearn_version) < '0.18':\n", " from sklearn.cross_validation import train_test_split\n", "else:\n", " from sklearn.model_selection import train_test_split\n", "\n", "iris = datasets.load_iris()\n", "X, y = iris.data[50:, [1, 2]], iris.target[50:]\n", "le = LabelEncoder()\n", "y = le.fit_transform(y)\n", "\n", "X_train, X_test, y_train, y_test =\\\n", " train_test_split(X, y, \n", " test_size=0.5, \n", " random_state=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### Performances des modèles isolés" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "import numpy as np\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neighbors import KNeighborsClassifier \n", "from sklearn.pipeline import Pipeline\n", "if Version(sklearn_version) < '0.18':\n", " from sklearn.cross_validation import cross_val_score\n", "else:\n", " from sklearn.model_selection import cross_val_score\n", "\n", "clf1 = LogisticRegression(penalty='l2', \n", " C=0.001,\n", " random_state=0)\n", "\n", "clf2 = DecisionTreeClassifier(max_depth=1,\n", " criterion='entropy',\n", " random_state=0)\n", "\n", "clf3 = KNeighborsClassifier(n_neighbors=1,\n", " p=2,\n", " metric='minkowski')\n", "\n", "pipe1 = Pipeline([['sc', StandardScaler()],\n", " ['clf', clf1]])\n", "pipe3 = Pipeline([['sc', StandardScaler()],\n", " ['clf', clf3]])\n", "\n", "clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']\n", "\n", "print('10-fold cross validation:\\n')\n", "for clf, label in zip([pipe1, clf2, pipe3], clf_labels):\n", " scores = cross_val_score(estimator=clf,\n", " X=X_train,\n", " y=y_train,\n", " cv=10,\n", " scoring='roc_auc')\n", " print(\"ROC AUC: %0.2f (+/- %0.2f) [%s]\"\n", " % (scores.mean(), scores.std(), label))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Performance de la combinaison" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Majority Rule (hard) Voting\n", "\n", "mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])\n", "\n", "clf_labels += ['Majority Voting']\n", "all_clf = [pipe1, clf2, pipe3, mv_clf]\n", "\n", "\n", "for clf, label in zip(all_clf, clf_labels):\n", " scores = cross_val_score(estimator=clf,\n", " X=X_train,\n", " y=y_train,\n", " cv=10,\n", " scoring='roc_auc')\n", " print(\"ROC AUC: %0.2f (+/- %0.2f) [%s]\"\n", " % (scores.mean(), scores.std(), label))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Evaluation de l'ensemble de classifieurs " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# L'Aire Under the ROC Curve (AUC) est un critère de performance très populaire\n", "\n", "from sklearn.metrics import roc_curve\n", "from sklearn.metrics import auc\n", "\n", "colors = ['black', 'orange', 'blue', 'green']\n", "linestyles = [':', '--', '-.', '-']\n", "for clf, label, clr, ls \\\n", " in zip(all_clf,\n", " clf_labels, colors, linestyles):\n", "\n", " # assuming the label of the positive class is 1\n", " y_pred = clf.fit(X_train,\n", " y_train).predict_proba(X_test)[:, 1]\n", " fpr, tpr, thresholds = roc_curve(y_true=y_test,\n", " y_score=y_pred)\n", " roc_auc = auc(x=fpr, y=tpr)\n", " plt.plot(fpr, tpr,\n", " color=clr,\n", " linestyle=ls,\n", " label='%s (auc = %0.2f)' % (label, roc_auc))\n", "\n", "plt.legend(loc='lower right')\n", "plt.plot([0, 1], [0, 1],\n", " linestyle='--',\n", " color='gray',\n", " linewidth=2)\n", "\n", "plt.xlim([-0.1, 1.1])\n", "plt.ylim([-0.1, 1.1])\n", "plt.grid()\n", "plt.xlabel('False Positive Rate')\n", "plt.ylabel('True Positive Rate')\n", "\n", "# plt.tight_layout()\n", "# plt.savefig('./figures/roc.png', dpi=300)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mv_clf.get_params()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Piste d'amélioration " ] }, { "cell_type": "raw", "metadata": {}, "source": [ "NB : Bien sur vous pouver inclure tout ceci dans un grid search pour trouver les meilleurs paramètres de l'ensemble\n", "\n", "Par exemple :\n", " \n", "params = {'decisiontreeclassifier__max_depth': [1, 2],\n", " 'pipeline-1__clf__C': [0.001, 0.1, 100.0]}\n", "\n", "grid = GridSearchCV(estimator=mv_clf,\n", " param_grid=params,\n", " cv=10,\n", " scoring='roc_auc')\n", "grid.fit(X_train, y_train)\n", "\n", "print('Best parameters: %s' % grid.best_params_)\n", "print('Accuracy: %.2f' % grid.best_score_)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Combinaison par bagging" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sur les données des iris\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_digits, load_iris\n", "\n", "#digits = load_digits()\n", "\n", "dataset = load_iris()\n", "print dataset.data.shape\n", "print dataset.target.shape\n", "\n", "X, y= dataset.data, dataset.target" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.preprocessing import LabelEncoder\n", "if Version(sklearn_version) < '0.18':\n", " from sklearn.cross_validation import train_test_split\n", "else:\n", " from sklearn.model_selection import train_test_split\n", "\n", "le = LabelEncoder()\n", "y = le.fit_transform(y)\n", "\n", "X_train, X_test, y_train, y_test =\\\n", " train_test_split(X, y, \n", " test_size=0.40, \n", " random_state=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.ensemble import BaggingClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", "tree = DecisionTreeClassifier(criterion='entropy', \n", " max_depth=1,\n", " random_state=1)\n", "\n", "bag = BaggingClassifier(base_estimator=tree,\n", " n_estimators=500, \n", " max_samples=1.0, \n", " max_features=1.0, \n", " bootstrap=True, \n", " bootstrap_features=False, \n", " n_jobs=1, \n", " random_state=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", "\n", "tree = tree.fit(X_train, y_train)\n", "y_train_pred = tree.predict(X_train)\n", "y_test_pred = tree.predict(X_test)\n", "\n", "tree_train = accuracy_score(y_train, y_train_pred)\n", "tree_test = accuracy_score(y_test, y_test_pred)\n", "print('Decision tree train/test accuracies %.3f/%.3f'\n", " % (tree_train, tree_test))\n", "\n", "bag = bag.fit(X_train, y_train)\n", "y_train_pred = bag.predict(X_train)\n", "y_test_pred = bag.predict(X_test)\n", "\n", "bag_train = accuracy_score(y_train, y_train_pred) \n", "bag_test = accuracy_score(y_test, y_test_pred) \n", "print('Bagging train/test accuracies %.3f/%.3f'\n", " % (bag_train, bag_test))" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Explorer les performances obtenues en fonction \n", "- des paramètres de définition des arbres de décision (notamment max_depth)\n", "- des paramètres du bagging (notamment n_estimators)\n", "- du classifieur de base" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Combinaison par Boosting" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "\n", "tree = DecisionTreeClassifier(criterion='entropy', \n", " max_depth=1,\n", " random_state=0)\n", "\n", "ada = AdaBoostClassifier(base_estimator=tree,\n", " n_estimators=500, \n", " learning_rate=0.1,\n", " random_state=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tree = tree.fit(X_train, y_train)\n", "y_train_pred = tree.predict(X_train)\n", "y_test_pred = tree.predict(X_test)\n", "\n", "tree_train = accuracy_score(y_train, y_train_pred)\n", "tree_test = accuracy_score(y_test, y_test_pred)\n", "print('Decision tree train/test accuracies %.3f/%.3f'\n", " % (tree_train, tree_test))\n", "\n", "ada = ada.fit(X_train, y_train)\n", "y_train_pred = ada.predict(X_train)\n", "y_test_pred = ada.predict(X_test)\n", "\n", "ada_train = accuracy_score(y_train, y_train_pred) \n", "ada_test = accuracy_score(y_test, y_test_pred) \n", "print('AdaBoost train/test accuracies %.3f/%.3f'\n", " % (ada_train, ada_test))" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "Explorer les performances obtenues en fonction :\n", "- des paramètres de définition des arbres de décision (notamment max_depth)\n", "- des paramètres du boosting (notamment n_estimators)\n", "- du classifieur de base" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Reproduire les expérimentations ci-dessus et déterminer les performances comparées des approches de boosting et de bagging sur les données digits " ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 1 }