{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Copyright (c) 2015, 2016 [Sebastian Raschka](sebastianraschka.com)\n",
    "\n",
    "https://github.com/rasbt/python-machine-learning-book\n",
    "\n",
    "[MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Python Machine Learning - Code Examples"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Méthodes d'Ensemble : Combinaison de classifieurs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from IPython.display import Image\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Added version check for recent scikit-learn 0.18 checks\n",
    "from distutils.version import LooseVersion as Version\n",
    "from sklearn import __version__ as sklearn_version"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pourquoi les méthodes d'ensemble marchent ?\n",
    "## Simulation de performances dans des conditions idéales (décorrélation des erreurs classifieurs) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from scipy.misc import comb\n",
    "import math\n",
    "\n",
    "def ensemble_error(n_classifier, error):\n",
    "    k_start = math.ceil(n_classifier / 2.0)\n",
    "    probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)\n",
    "             for k in range(k_start, n_classifier + 1)]\n",
    "    return sum(probs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from scipy.misc import comb\n",
    "import math\n",
    "\n",
    "def ensemble_error(n_classifier, error):\n",
    "    k_start = int(math.ceil(n_classifier / 2.0))\n",
    "    probs = [comb(n_classifier, k) * error**k * (1-error)**(n_classifier - k)\n",
    "             for k in range(k_start, n_classifier + 1)]\n",
    "    return sum(probs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print ensemble_error(n_classifier=1, error=0.25)\n",
    "print ensemble_error(n_classifier=10, error=0.25)\n",
    "print ensemble_error(n_classifier=100, error=0.25)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "error_range = np.arange(0.0, 1.01, 0.01)\n",
    "ens_errors = [ensemble_error(n_classifier=11, error=error)\n",
    "              for error in error_range]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "plt.plot(error_range, \n",
    "         ens_errors, \n",
    "         label='Ensemble error', \n",
    "         linewidth=2)\n",
    "\n",
    "plt.plot(error_range, \n",
    "         error_range, \n",
    "         linestyle='--',\n",
    "         label='Base error',\n",
    "         linewidth=2)\n",
    "\n",
    "plt.xlabel('Base error')\n",
    "plt.ylabel('Base/Ensemble error')\n",
    "plt.legend(loc='upper left')\n",
    "plt.grid()\n",
    "plt.tight_layout()\n",
    "# plt.savefig('./figures/ensemble_err.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Une méthode simple de vote majoritaire "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Implémentation de la classe Vote Majoritaire"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.base import BaseEstimator\n",
    "from sklearn.base import ClassifierMixin\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.externals import six\n",
    "from sklearn.base import clone\n",
    "from sklearn.pipeline import _name_estimators\n",
    "import numpy as np\n",
    "import operator\n",
    "\n",
    "\n",
    "class MajorityVoteClassifier(BaseEstimator, \n",
    "                             ClassifierMixin):\n",
    "    \"\"\" A majority vote ensemble classifier\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    classifiers : array-like, shape = [n_classifiers]\n",
    "      Different classifiers for the ensemble\n",
    "\n",
    "    vote : str, {'classlabel', 'probability'} (default='label')\n",
    "      If 'classlabel' the prediction is based on the argmax of\n",
    "        class labels. Else if 'probability', the argmax of\n",
    "        the sum of probabilities is used to predict the class label\n",
    "        (recommended for calibrated classifiers).\n",
    "\n",
    "    weights : array-like, shape = [n_classifiers], optional (default=None)\n",
    "      If a list of `int` or `float` values are provided, the classifiers\n",
    "      are weighted by importance; Uses uniform weights if `weights=None`.\n",
    "\n",
    "    \"\"\"\n",
    "    def __init__(self, classifiers, vote='classlabel', weights=None):\n",
    "\n",
    "        self.classifiers = classifiers\n",
    "        self.named_classifiers = {key: value for key, value\n",
    "                                  in _name_estimators(classifiers)}\n",
    "        self.vote = vote\n",
    "        self.weights = weights\n",
    "\n",
    "    def fit(self, X, y):\n",
    "        \"\"\" Fit classifiers.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n",
    "            Matrix of training samples.\n",
    "\n",
    "        y : array-like, shape = [n_samples]\n",
    "            Vector of target class labels.\n",
    "\n",
    "        Returns\n",
    "        -------\n",
    "        self : object\n",
    "\n",
    "        \"\"\"\n",
    "        if self.vote not in ('probability', 'classlabel'):\n",
    "            raise ValueError(\"vote must be 'probability' or 'classlabel'\"\n",
    "                             \"; got (vote=%r)\"\n",
    "                             % self.vote)\n",
    "\n",
    "        if self.weights and len(self.weights) != len(self.classifiers):\n",
    "            raise ValueError('Number of classifiers and weights must be equal'\n",
    "                             '; got %d weights, %d classifiers'\n",
    "                             % (len(self.weights), len(self.classifiers)))\n",
    "\n",
    "        # Use LabelEncoder to ensure class labels start with 0, which\n",
    "        # is important for np.argmax call in self.predict\n",
    "        self.lablenc_ = LabelEncoder()\n",
    "        self.lablenc_.fit(y)\n",
    "        self.classes_ = self.lablenc_.classes_\n",
    "        self.classifiers_ = []\n",
    "        for clf in self.classifiers:\n",
    "            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))\n",
    "            self.classifiers_.append(fitted_clf)\n",
    "        return self\n",
    "\n",
    "    def predict(self, X):\n",
    "        \"\"\" Predict class labels for X.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n",
    "            Matrix of training samples.\n",
    "\n",
    "        Returns\n",
    "        ----------\n",
    "        maj_vote : array-like, shape = [n_samples]\n",
    "            Predicted class labels.\n",
    "            \n",
    "        \"\"\"\n",
    "        if self.vote == 'probability':\n",
    "            maj_vote = np.argmax(self.predict_proba(X), axis=1)\n",
    "        else:  # 'classlabel' vote\n",
    "\n",
    "            #  Collect results from clf.predict calls\n",
    "            predictions = np.asarray([clf.predict(X)\n",
    "                                      for clf in self.classifiers_]).T\n",
    "\n",
    "            maj_vote = np.apply_along_axis(\n",
    "                                      lambda x:\n",
    "                                      np.argmax(np.bincount(x,\n",
    "                                                weights=self.weights)),\n",
    "                                      axis=1,\n",
    "                                      arr=predictions)\n",
    "        maj_vote = self.lablenc_.inverse_transform(maj_vote)\n",
    "        return maj_vote\n",
    "\n",
    "    def predict_proba(self, X):\n",
    "        \"\"\" Predict class probabilities for X.\n",
    "\n",
    "        Parameters\n",
    "        ----------\n",
    "        X : {array-like, sparse matrix}, shape = [n_samples, n_features]\n",
    "            Training vectors, where n_samples is the number of samples and\n",
    "            n_features is the number of features.\n",
    "\n",
    "        Returns\n",
    "        ----------\n",
    "        avg_proba : array-like, shape = [n_samples, n_classes]\n",
    "            Weighted average probability for each class per sample.\n",
    "\n",
    "        \"\"\"\n",
    "        probas = np.asarray([clf.predict_proba(X)\n",
    "                             for clf in self.classifiers_])\n",
    "        avg_proba = np.average(probas, axis=0, weights=self.weights)\n",
    "        return avg_proba\n",
    "\n",
    "    def get_params(self, deep=True):\n",
    "        \"\"\" Get classifier parameter names for GridSearch\"\"\"\n",
    "        if not deep:\n",
    "            return super(MajorityVoteClassifier, self).get_params(deep=False)\n",
    "        else:\n",
    "            out = self.named_classifiers.copy()\n",
    "            for name, step in six.iteritems(self.named_classifiers):\n",
    "                for key, value in six.iteritems(step.get_params(deep=True)):\n",
    "                    out['%s__%s' % (name, key)] = value\n",
    "            return out"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Combinaison de différents algorithmes par vote majoritaire\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn import datasets\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "if Version(sklearn_version) < '0.18':\n",
    "    from sklearn.cross_validation import train_test_split\n",
    "else:\n",
    "    from sklearn.model_selection import train_test_split\n",
    "\n",
    "iris = datasets.load_iris()\n",
    "X, y = iris.data[50:, [1, 2]], iris.target[50:]\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(y)\n",
    "\n",
    "X_train, X_test, y_train, y_test =\\\n",
    "       train_test_split(X, y, \n",
    "                        test_size=0.5, \n",
    "                        random_state=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Performances des modèles isolés"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.neighbors import KNeighborsClassifier \n",
    "from sklearn.pipeline import Pipeline\n",
    "if Version(sklearn_version) < '0.18':\n",
    "    from sklearn.cross_validation import cross_val_score\n",
    "else:\n",
    "    from sklearn.model_selection import cross_val_score\n",
    "\n",
    "clf1 = LogisticRegression(penalty='l2', \n",
    "                          C=0.001,\n",
    "                          random_state=0)\n",
    "\n",
    "clf2 = DecisionTreeClassifier(max_depth=1,\n",
    "                              criterion='entropy',\n",
    "                              random_state=0)\n",
    "\n",
    "clf3 = KNeighborsClassifier(n_neighbors=1,\n",
    "                            p=2,\n",
    "                            metric='minkowski')\n",
    "\n",
    "pipe1 = Pipeline([['sc', StandardScaler()],\n",
    "                  ['clf', clf1]])\n",
    "pipe3 = Pipeline([['sc', StandardScaler()],\n",
    "                  ['clf', clf3]])\n",
    "\n",
    "clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']\n",
    "\n",
    "print('10-fold cross validation:\\n')\n",
    "for clf, label in zip([pipe1, clf2, pipe3], clf_labels):\n",
    "    scores = cross_val_score(estimator=clf,\n",
    "                             X=X_train,\n",
    "                             y=y_train,\n",
    "                             cv=10,\n",
    "                             scoring='roc_auc')\n",
    "    print(\"ROC AUC: %0.2f (+/- %0.2f) [%s]\"\n",
    "          % (scores.mean(), scores.std(), label))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Performance de la combinaison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Majority Rule (hard) Voting\n",
    "\n",
    "mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])\n",
    "\n",
    "clf_labels += ['Majority Voting']\n",
    "all_clf = [pipe1, clf2, pipe3, mv_clf]\n",
    "\n",
    "\n",
    "for clf, label in zip(all_clf, clf_labels):\n",
    "    scores = cross_val_score(estimator=clf,\n",
    "                             X=X_train,\n",
    "                             y=y_train,\n",
    "                             cv=10,\n",
    "                             scoring='roc_auc')\n",
    "    print(\"ROC AUC: %0.2f (+/- %0.2f) [%s]\"\n",
    "          % (scores.mean(), scores.std(), label))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluation de l'ensemble de classifieurs "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# L'Aire Under the ROC Curve (AUC) est un critère de performance très populaire\n",
    "\n",
    "from sklearn.metrics import roc_curve\n",
    "from sklearn.metrics import auc\n",
    "\n",
    "colors = ['black', 'orange', 'blue', 'green']\n",
    "linestyles = [':', '--', '-.', '-']\n",
    "for clf, label, clr, ls \\\n",
    "        in zip(all_clf,\n",
    "               clf_labels, colors, linestyles):\n",
    "\n",
    "    # assuming the label of the positive class is 1\n",
    "    y_pred = clf.fit(X_train,\n",
    "                     y_train).predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, thresholds = roc_curve(y_true=y_test,\n",
    "                                     y_score=y_pred)\n",
    "    roc_auc = auc(x=fpr, y=tpr)\n",
    "    plt.plot(fpr, tpr,\n",
    "             color=clr,\n",
    "             linestyle=ls,\n",
    "             label='%s (auc = %0.2f)' % (label, roc_auc))\n",
    "\n",
    "plt.legend(loc='lower right')\n",
    "plt.plot([0, 1], [0, 1],\n",
    "         linestyle='--',\n",
    "         color='gray',\n",
    "         linewidth=2)\n",
    "\n",
    "plt.xlim([-0.1, 1.1])\n",
    "plt.ylim([-0.1, 1.1])\n",
    "plt.grid()\n",
    "plt.xlabel('False Positive Rate')\n",
    "plt.ylabel('True Positive Rate')\n",
    "\n",
    "# plt.tight_layout()\n",
    "# plt.savefig('./figures/roc.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mv_clf.get_params()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Piste d'amélioration "
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "NB : Bien sur vous pouver inclure tout ceci dans  un grid search pour trouver les meilleurs paramètres de l'ensemble\n",
    "\n",
    "Par exemple  :\n",
    "    \n",
    "params = {'decisiontreeclassifier__max_depth': [1, 2],\n",
    "          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}\n",
    "\n",
    "grid = GridSearchCV(estimator=mv_clf,\n",
    "                    param_grid=params,\n",
    "                    cv=10,\n",
    "                    scoring='roc_auc')\n",
    "grid.fit(X_train, y_train)\n",
    "\n",
    "print('Best parameters: %s' % grid.best_params_)\n",
    "print('Accuracy: %.2f' % grid.best_score_)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combinaison par bagging"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sur les données des iris\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_digits, load_iris\n",
    "\n",
    "#digits = load_digits()\n",
    "\n",
    "dataset = load_iris()\n",
    "print dataset.data.shape\n",
    "print dataset.target.shape\n",
    "\n",
    "X, y= dataset.data, dataset.target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "if Version(sklearn_version) < '0.18':\n",
    "    from sklearn.cross_validation import train_test_split\n",
    "else:\n",
    "    from sklearn.model_selection import train_test_split\n",
    "\n",
    "le = LabelEncoder()\n",
    "y = le.fit_transform(y)\n",
    "\n",
    "X_train, X_test, y_train, y_test =\\\n",
    "            train_test_split(X, y, \n",
    "                             test_size=0.40, \n",
    "                             random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "tree = DecisionTreeClassifier(criterion='entropy', \n",
    "                              max_depth=1,\n",
    "                              random_state=1)\n",
    "\n",
    "bag = BaggingClassifier(base_estimator=tree,\n",
    "                        n_estimators=500, \n",
    "                        max_samples=1.0, \n",
    "                        max_features=1.0, \n",
    "                        bootstrap=True, \n",
    "                        bootstrap_features=False, \n",
    "                        n_jobs=1, \n",
    "                        random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "tree = tree.fit(X_train, y_train)\n",
    "y_train_pred = tree.predict(X_train)\n",
    "y_test_pred = tree.predict(X_test)\n",
    "\n",
    "tree_train = accuracy_score(y_train, y_train_pred)\n",
    "tree_test = accuracy_score(y_test, y_test_pred)\n",
    "print('Decision tree train/test accuracies %.3f/%.3f'\n",
    "      % (tree_train, tree_test))\n",
    "\n",
    "bag = bag.fit(X_train, y_train)\n",
    "y_train_pred = bag.predict(X_train)\n",
    "y_test_pred = bag.predict(X_test)\n",
    "\n",
    "bag_train = accuracy_score(y_train, y_train_pred) \n",
    "bag_test = accuracy_score(y_test, y_test_pred) \n",
    "print('Bagging train/test accuracies %.3f/%.3f'\n",
    "      % (bag_train, bag_test))"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "Explorer les performances obtenues en fonction \n",
    "- des paramètres de définition des arbres de décision (notamment  max_depth)\n",
    "- des paramètres du bagging (notamment n_estimators)\n",
    "- du classifieur de base"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Combinaison par Boosting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "\n",
    "tree = DecisionTreeClassifier(criterion='entropy', \n",
    "                              max_depth=1,\n",
    "                              random_state=0)\n",
    "\n",
    "ada = AdaBoostClassifier(base_estimator=tree,\n",
    "                         n_estimators=500, \n",
    "                         learning_rate=0.1,\n",
    "                         random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tree = tree.fit(X_train, y_train)\n",
    "y_train_pred = tree.predict(X_train)\n",
    "y_test_pred = tree.predict(X_test)\n",
    "\n",
    "tree_train = accuracy_score(y_train, y_train_pred)\n",
    "tree_test = accuracy_score(y_test, y_test_pred)\n",
    "print('Decision tree train/test accuracies %.3f/%.3f'\n",
    "      % (tree_train, tree_test))\n",
    "\n",
    "ada = ada.fit(X_train, y_train)\n",
    "y_train_pred = ada.predict(X_train)\n",
    "y_test_pred = ada.predict(X_test)\n",
    "\n",
    "ada_train = accuracy_score(y_train, y_train_pred) \n",
    "ada_test = accuracy_score(y_test, y_test_pred) \n",
    "print('AdaBoost train/test accuracies %.3f/%.3f'\n",
    "      % (ada_train, ada_test))"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "Explorer les performances obtenues en fonction :\n",
    "- des paramètres de définition des arbres de décision (notamment  max_depth)\n",
    "- des paramètres du boosting (notamment n_estimators)\n",
    "- du classifieur de base"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reproduire les expérimentations ci-dessus et déterminer les performances comparées des approches de boosting et de bagging sur les données digits  "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}