# Acknowledgment

Ce notebook utilise une partie d'un notebook de [Sebastian Raschka](https://sebastianraschka.com), Copyright (c) 2015, 2016.

Python Machine Learning - Code Examples

https://github.com/rasbt/python-machine-learning-book

[MIT License](https://github.com/rasbt/python-machine-learning-book/blob/master/LICENSE.txt)

# Bref panorama de méthodes de classification

### Table des matières

- [Initialisation et Chargement des Iris](#Initialisation-et-Chargement-des-Iris)
- [Le Perceptron](#Le-Perceptron)
- [Les K plus proches voisins](#K-plus-proches-voisins)
- [La Régression logistique](#Régression-logistique)
- [Machines à vecteurs de support](#Support-vector-machines)
- [Le kernel trick](#Le-kernel-trick)
- [Travail à réaliser](#Travail-à-réaliser)

# Initialisation et Chargement des Iris


In [None]:
from IPython.display import Image
%matplotlib inline

In [None]:
import sys
# Show the version of the Python interpreter running this notebook.
print(sys.version_info)

In [None]:
# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

In [None]:
import sklearn
from sklearn import datasets
# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# A bare name as the last expression makes the notebook display the object.
iris


Loading the Iris dataset from scikit-learn. Here, the third column represents the petal length, and the fourth column the petal width of the flower samples. The classes are already converted to integer labels where 0=Iris-Setosa, 1=Iris-Versicolor, 2=Iris-Virginica.

In [None]:
from sklearn import datasets
import numpy as np

# Reload the dataset, then keep only columns 2 and 3 (petal length and
# petal width) as features; the integer class labels become the target.
iris = datasets.load_iris()
X, y = iris.data[:, [2, 3]], iris.target

distinct_labels = np.unique(y)
print('Class labels:', distinct_labels)

Splitting data into 70% training and 30% test data:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible from one run to the next.
split_arrays = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_arrays

Standardizing the features:

In [None]:
from sklearn.preprocessing import StandardScaler

# Estimate the scaling statistics on the training set only, then apply the
# very same transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = (sc.transform(part) for part in (X_train, X_test))

# Le Perceptron

Training a perceptron on the standardized training data:

In [None]:
from sklearn.linear_model import Perceptron

# Hyper-parameters gathered in one place: iteration cap, learning rate and seed.
perceptron_params = dict(max_iter=40, eta0=0.1, random_state=0)
ppn = Perceptron(**perceptron_params)
# Kept as the cell's last expression so the notebook echoes the fitted model.
ppn.fit(X_train_std, y_train)

In [None]:
# Display the shape of the test-label array (the notebook echoes the cell's last expression).
y_test.shape

In [None]:
# Predict the held-out samples and count how many predictions disagree with
# the true labels.
y_pred = ppn.predict(X_test_std)
n_misclassified = (y_test != y_pred).sum()
print('Misclassified samples: %d' % n_misclassified)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy computed from the predictions made in the previous cell ...
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % test_accuracy)

# ... and the score reported by the estimator itself on the same test data.
print(ppn.score(X_test_std, y_test))

In [None]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings


def versiontuple(v):
    """Convert a dotted version string into a tuple of ints for comparison.

    Only the leading decimal digits of each dot-separated component are
    used, so pre-release / development suffixes (e.g. ``'1.22.0rc1'`` or
    ``'1.20.0.dev0+gabc'``) no longer raise ``ValueError``. Parsing stops at
    the first component that does not start with a digit.

    Parameters
    ----------
    v : str
        Version string such as ``'1.9.0'``.

    Returns
    -------
    tuple of int
        The numeric release components, e.g. ``(1, 9, 0)``.
    """
    import re  # local import keeps this helper self-contained

    release = []
    for component in v.split("."):
        leading_digits = re.match(r"[0-9]+", component.strip())
        if leading_digits is None:
            # Non-numeric component ('dev0', 'post1', ...): the numeric
            # release part of the version ends here.
            break
        release.append(int(leading_digits.group()))
    return tuple(release)


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot the decision regions of a classifier over two features.

    The classifier is evaluated on a regular grid that spans the range of
    the two feature columns (padded by 1 on each side); the predicted class
    of every grid point is drawn as a filled contour and the samples are
    scattered on top, one marker/color per class.

    Parameters
    ----------
    X : array
        Feature matrix; columns 0 and 1 are used as the plot axes, and
        ``classifier.predict`` is called on two-column inputs.
    y : array
        Class label of each row of ``X``. Markers and colors are defined
        for at most five distinct classes.
    classifier : object
        Fitted estimator exposing ``predict``.
    test_idx : iterable of int, optional
        Row positions in ``X`` to highlight as test samples (a ``range``,
        list or index array). ``None`` or an empty iterable draws no
        highlight.
    resolution : float
        Step of the evaluation grid along both axes.
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        # `color=` (rather than `c=` with a single RGBA tuple) avoids
        # matplotlib's ambiguity between one color and a sequence of values.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=colors[idx],
                    marker=markers[idx], label=cl)

    # highlight test samples
    # Materialize the indices once: this accepts range objects, lists and
    # NumPy index arrays alike, and makes the emptiness test unambiguous.
    test_rows = [] if test_idx is None else list(test_idx)
    if test_rows:
        X_test = X[test_rows, :]

        # Hollow black circles: `c=''` is rejected by current matplotlib,
        # so the empty face is requested explicitly.
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')

Plotting the decision regions of the trained perceptron on the combined (training + test) standardized data:

In [None]:
# Stack the training rows first and the test rows last, so the test samples
# occupy the tail of the combined arrays.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

# The positions of the test rows are derived from the actual split sizes
# instead of the hard-coded range(105, 150), so the cell stays correct for
# other datasets or test_size values.
plot_decision_regions(X=X_combined_std, y=y_combined,
                      classifier=ppn,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()

# K plus proches voisins

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 1-nearest-neighbour classifier; the Minkowski metric with p=2 is the
# Euclidean distance.
knn = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')
knn.fit(X_train_std, y_train)

print(knn.score(X_test_std, y_test))
# Test rows are the tail of the combined arrays; their positions are derived
# from the split sizes rather than hard-coded as range(105, 150).
plot_decision_regions(X_combined_std, y_combined,
                      classifier=knn,
                      test_idx=range(len(y_train), len(y_combined)))

plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
plt.show()

# Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=1000.0 (C is the inverse of the regularization
# strength, so this is a weakly regularized model).
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)


print('Accuracy: %.2f' % lr.score(X_test_std, y_test))


# Test rows are the tail of the combined arrays; their positions are derived
# from the split sizes rather than hard-coded as range(105, 150).
plot_decision_regions(X_combined_std, y_combined,
                      classifier=lr,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()

In [None]:
# Class-membership probabilities for the first test sample; the one-row
# slice keeps the 2-D shape that predict_proba expects.
first_test_sample = X_test_std[:1]
print(lr.predict_proba(first_test_sample))

# Uncomment to inspect the learned weights:
#print(lr.coef_)

# Support vector machines

## Le cas non linéairement séparable et les slack variables

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector machine with C=1.0.
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)


print('Accuracy: %.2f' % svm.score(X_test_std, y_test))


# Test rows are the tail of the combined arrays; their positions are derived
# from the split sizes rather than hard-coded as range(105, 150).
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()

## Le kernel trick


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Draw 200 two-dimensional standard-normal points with a fixed seed, then
# label a point +1 when exactly one of its coordinates is positive (XOR of
# the two sign tests) and -1 otherwise.
np.random.seed(0)
X_xor = np.random.randn(200, 2)
first_positive, second_positive = X_xor[:, 0] > 0, X_xor[:, 1] > 0
y_xor = np.where(first_positive ^ second_positive, 1, -1)

# One scatter call per class: (class value, color, marker, legend label).
for cls_value, cls_color, cls_marker, cls_label in ((1, 'b', 'x', '1'),
                                                    (-1, 'r', 's', '-1')):
    members = y_xor == cls_value
    plt.scatter(X_xor[members, 0],
                X_xor[members, 1],
                c=cls_color,
                marker=cls_marker,
                label=cls_label)

plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.legend(loc='best')
plt.tight_layout()
# plt.savefig('./figures/xor.png', dpi=300)
plt.show()

## Illustration sur le Ou Exclusif (XOR)

In [None]:
# Fit an RBF-kernel SVM on the XOR data (fit returns the estimator itself,
# so construction and training are chained).
svm = SVC(kernel='rbf', random_state=0, gamma=0.10, C=10.0).fit(X_xor, y_xor)

# No test_idx here: every sample is drawn with its class marker only.
plot_decision_regions(X_xor, y_xor, classifier=svm)

plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_xor.png', dpi=300)
plt.show()

In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM on the standardized Iris features, gamma=0.2 and C=1.0.
svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)


print('Accuracy: %.2f' % svm.score(X_test_std, y_test))


# Test rows are the tail of the combined arrays; their positions are derived
# from the split sizes rather than hard-coded as range(105, 150).
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()

In [None]:
# RBF-kernel SVM with gamma=100.0: a large gamma makes each training sample's
# influence very local, so the decision regions hug the training points.
svm = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
svm.fit(X_train_std, y_train)


print('Accuracy: %.2f' % svm.score(X_test_std, y_test))

# Test rows are the tail of the combined arrays; their positions are derived
# from the split sizes rather than hard-coded as range(105, 150).
plot_decision_regions(X_combined_std, y_combined,
                      classifier=svm,
                      test_idx=range(len(y_train), len(y_combined)))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_2.png', dpi=300)
plt.show()

# Travail à réaliser

Reproduisez pour les datasets suivants:
- [Iris](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html#sklearn.datasets.load_iris)
- [Digits](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) (en utilisant les données complètes)
- [Breast](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)

les expérimentations suivantes:

- Mise au point de plusieurs types de classifieurs (Perceptron, régression logistique, SVM, Knn). Pour chacun de ces types de classifieurs vous devrez :
 - Définir les hyper-paramètres à faire varier.
 - Évaluer et sélectionner par Grid-Search l'ensemble des configurations possibles, en utilisant la Validation Croisée à 3 plis pour l'évaluation de la performance en généralisation. Vous pourrez vous inspirer d'un code tel que [celui-ci](http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py) pour boucler sur les datasets et/ou les classifieurs.
- **Écrire sous forme d'un tableau récapitulatif les performances respectives (les meilleures obtenues) par chacun des modèles sur chacun des jeux de données (sur le test set).**
- Donner des conclusions sur les résultats obtenus quant à la performance, la stabilité, la robustesse des familles de classifieurs utilisées, et les paramètres optimaux de chaque type de modèle.


 
 


In [None]:
from sklearn import datasets

# Load the three datasets used in the assignment.
ir = datasets.load_iris()
dig = datasets.load_digits()
bc = datasets.load_breast_cancer()

# Print the shape of each data matrix, in the order iris, breast cancer, digits.
for bunch in (ir, bc, dig):
    print(bunch.data.shape)
