{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "id": "-ic5KTgSfB0f" }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "markdown", "metadata": { "id": "TKCFh-WhfGXS" }, "source": [ "## Dataset loading" ] }, { "cell_type": "markdown", "metadata": { "id": "xkqdgAwOfQpq" }, "source": [ "### Default datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "x6w8ioPjfF9c" }, "outputs": [], "source": [ "from sklearn.datasets import load_breast_cancer\n", "breast = load_breast_cancer()\n", "print(breast.DESCR)\n", "\n", "# Matrix of examples and associated labels\n", "X, y = breast.data, breast.target" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import load_digits\n", "digits = load_digits()\n", "print(digits.DESCR)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "plt.gray()\n", "plt.matshow(digits.images[2])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "id": "WY9VSfsMfbl5" }, "source": [ "### Da file svmlight" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Wv1Z4Un3fEmO" }, "outputs": [], "source": [ "#Let's load a dataset svmlight from a URL\n", "import urllib\n", "raw_data = urllib.request.urlopen(\"https://www.math.unipd.it/~aiolli/teaching/ml2122/tic-tac-toe.svmlight\")\n", "\n", "\n", "#SVMLIGHT\n", "from sklearn.datasets import load_svmlight_file\n", "X, y = load_svmlight_file(raw_data) #or \"filename\" if loaded from a file \n", "X = X.toarray() #Needed to convert a sparse matrix to a dense matrix (alternative todense)" ] }, { "cell_type": "markdown", "metadata": { "id": "SLDqwgbaiMh6" }, "source": [ "## Training and test set" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Dv39kAYoiQHP" }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" ] }, { "cell_type": "markdown", "metadata": { "id": "1cizhhgdgJfm" }, "source": [ "# Preprocessing" ] }, { "cell_type": "markdown", "metadata": { "id": "MpB__a_Xgkua" }, "source": [ "### Standard scaler\n", "\n", "\"Standardize\" features by removing mean and scaling to a unitary variance (mean 0, variance 1).\n", "\n", "### Minmax scaler\n", "By default, it scales features between 0 and 1 according to the following transformation\n", "\n", "```\n", "X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n", "X_scaled = X_std * (max - min) + min\n", "```\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ZaYuJPNSf8LJ" }, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n", "scaler = StandardScaler()\n", "#scaler = MinMaxScaler()\n", "scaler.fit(X_train)\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)" ] }, { "cell_type": "markdown", "metadata": { "id": "FnLVhmNEiWBY" }, "source": [ "## Evaluation (classification)\n", "\n", "$\\textit{accuracy} = \\frac{\\text{TP}+\\text{TN}}{\\text{TP}+\\text{TN}+\\text{FP}+\\text{FN}}$\n", "\n", "$\\textit{precision} = \\frac{\\text{TP}}{\\text{TP}+\\text{FP}}$\n", "\n", "$\\textit{recall} = \\frac{\\text{TP}}{\\text{TP}+\\text{FN}}$\n", "\n", "*AUC* = Area Under the ROC Curve" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Cwkdch0OgVfX" }, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score\n", "\n", "def evaluate(y_test, y_pred):\n", "\tprint(\"accuracy:\", accuracy_score(y_test, y_pred))\n", "\tprint(\"precision:\", precision_score(y_test, y_pred))\n", "\tprint(\"recall:\", recall_score(y_test, y_pred))\n", "\t#print \"AUC:\", roc_auc_score(y_test, y_pred)\n", "\tprint" ] }, { "cell_type": "markdown", "metadata": { "id": "qGzlW0klicp8" }, "source": [ "# Methods for classification" ] }, { "cell_type": "markdown", "metadata": { "id": "TKpTJkIUihiu" }, "source": [ "## Decision trees" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3aVCtV8qig-6" }, "outputs": [], "source": [ "from sklearn import tree\n", "clf_tree = tree.DecisionTreeClassifier()\n", "#DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False)\n", "\n", "clf_tree = clf_tree.fit(X_train, y_train)\n", "y_pred = clf_tree.predict(X_test)\n", "\n", "print(\"DT\")\n", "evaluate(y_test, y_pred)" ] }, { "cell_type": "markdown", "metadata": { "id": "Hyeu6tRXixe5" }, "source": [ "## Multi Layer Perceptron" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ijWxdFscixKv" }, "outputs": [], "source": [ "from sklearn.neural_network import MLPClassifier\n", "mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, alpha=1, tol=1e-8, learning_rate_init=.01)\n", "\n", "mlp.fit(X_train, y_train)\n", "y_pred = mlp.predict(X_test)\n", "\n", "print(\"MLP\")\n", "evaluate(y_test, y_pred)" ] }, { "cell_type": "markdown", "metadata": { "id": "2YS7rIYHi7gT" }, "source": [ "## Support Vector Machine" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "txPYpUrviwDw" }, "outputs": [], "source": [ "#SVM\n", "from sklearn import svm\n", "svc = svm.SVC(gamma=0.0001, C=100.0) #kernel : string, optional (default=rbf) linear, poly, rbf, sigmoid\n", "#SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)\n", "\n", "svc.fit(X_train, y_train)\n", "y_pred = svc.predict(X_test)\n", "\n", "print(\"SVM\")\n", "evaluate(y_test, y_pred)" ] }, { "cell_type": "markdown", "metadata": { "id": "gxjQLDnCjEZo" }, "source": [ "# Model selection" ] }, { "cell_type": "markdown", "metadata": { "id": "Hs02S_qhjKZt" }, "source": [ "## k-fold cross validation and grid search" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Q8bsUxyljAgC" }, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV, KFold\n", "from sklearn import metrics" ] }, { "cell_type": "markdown", "metadata": { "id": "255G5zG-jTWu" }, "source": [ "Dictionary containingthe parameter grid for SVM (linear kernel, RBF and custom) and NN" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "maC-atdcjPyF" }, "outputs": [], "source": [ "C_values = [2**i for i in range(-5,5)]\n", "p_grid = { \n", " \"svm\" : [{\"C\": C_values, \"kernel\": [\"rbf\"], \"gamma\" : [10**i for i in range(-4, 4)]},\n", "\t\t\t {\"C\": C_values, \"kernel\": [\"poly\"], \"degree\": [2+i for i in range(4)]}],\n", " \"precomputed\" : {\"C\": C_values, \"kernel\": [\"precomputed\"]},\n", "\t\t\t\"nn\" : {\"alpha\" : [10**i for i in range(-5,1)], 'hidden_layer_sizes': [(10,), (50,), (100,), (200,)]}\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "YM2pCKmKjXaa" }, "outputs": [], "source": [ "#from sklearn import svm\n", "#from sklearn.neural_network import MLPClassifier\n", "\n", "### hdden warnings of sklearn\n", "#import warnings\n", "#warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n", "\n", "kernel = \"precomputed\" #rbf/linear/poly\n", "\n", "#Example of linear kernel\n", "Klin = np.dot(X, X.T)\n", "\n", "### Test repeated N times\n", "skf = KFold(n_splits=5, shuffle=True, random_state=42)\n", "\n", "fold, accs = 1, []\n", "for train, test in skf.split(X, y):\n", "\tprint(\"FOLD:\", fold)\n", "\tX_train, X_test = X[train], X[test]\n", "\t\n", "\t### The content of X should be a custom kernel!\n", "\tif kernel == \"precomputed\": \n", "\t\tX_train, X_test = Klin[train][:,train], Klin[test][:,train]\n", "\t\n", "\t### Possible Preprocessing!!!\n", "\t\n", "\t### Validation of parameters with Grid Search\n", "\t#GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score='warn')\n", "\tclf = GridSearchCV(svm.SVC(kernel), param_grid=p_grid[\"precomputed\"], cv=5, scoring='accuracy') #SVM\n", "\t#clf = GridSearchCV(MLPClassifier(), param_grid=p_grid[\"nn\"], cv=5, scoring='accuracy') #NN\n", "\t\n", "\t### Training\n", "\tclf.fit(X_train, y[train])\n", "\t\n", "\t### Info about validation\n", "\t#print \"CV info:\", clf.cv_results_#.keys()\n", "\tprint(\"VALIDATION score:\", clf.best_score_)\n", "\tprint(\"BEST parameters:\", clf.best_params_)\n", "\t#clf.best_estimator_\n", "\t\n", "\t### score (not the prediction!) Useful to compute AUC (ranking metric)\n", "\t#y_pred = clf.decision_function(X_test) \n", "\t\n", "\ty_pred = clf.predict(X_test)\n", "\ty_true = y[test]\n", "\t\n", "\t### Training classification report and confusion matrix\n", "\tprint(metrics.classification_report(y_true, y_pred))\n", "\tprint(metrics.confusion_matrix(y_true, y_pred))\n", "\t\n", "\t#auc = roc_auc_score(y_true, y_pred) # AUC\n", "\tacc = accuracy_score(y_true, y_pred) # Accuracy\n", "\tprint(\"TEST score:{}\\n\".format(acc))\n", "\t\n", "\taccs.append(acc)\n", "\tfold += 1\n", "\n", "print(\"AVG ACCURACY:\", np.mean(accs), \"+-\", np.std(accs))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "CFDvC2KOVbSJ" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "Lab02_AA2021.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3.10.4 64-bit", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.10.4" }, "vscode": { "interpreter": { "hash": "6f59339097bb9ac4cf41ab9fa2e7f783ea6bb84442f6ce2c2671fa41ded377c2" } } }, "nbformat": 4, "nbformat_minor": 0 }