{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-ic5KTgSfB0f"
      },
      "outputs": [],
      "source": [
        "import numpy as np"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TKCFh-WhfGXS"
      },
      "source": [
        "## Dataset loading"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "xkqdgAwOfQpq"
      },
      "source": [
        "### Default datasets"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "x6w8ioPjfF9c"
      },
      "outputs": [],
      "source": [
        "from sklearn.datasets import load_breast_cancer\n",
        "breast = load_breast_cancer()\n",
        "print(breast.DESCR)\n",
        "\n",
        "# Matrix of examples and associated labels\n",
        "X, y = breast.data, breast.target"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from sklearn.datasets import load_digits\n",
        "digits = load_digits()\n",
        "print(digits.DESCR)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "plt.gray()\n",
        "plt.matshow(digits.images[2])\n",
        "plt.show()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WY9VSfsMfbl5"
      },
      "source": [
        "### Da file svmlight"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Wv1Z4Un3fEmO"
      },
      "outputs": [],
      "source": [
        "#Let's load a dataset svmlight from a URL\n",
        "import urllib\n",
        "raw_data = urllib.request.urlopen(\"https://www.math.unipd.it/~aiolli/teaching/ml2122/tic-tac-toe.svmlight\")\n",
        "\n",
        "\n",
        "#SVMLIGHT\n",
        "from sklearn.datasets import load_svmlight_file\n",
        "X, y = load_svmlight_file(raw_data) #or \"filename\" if loaded from a file \n",
        "X = X.toarray() #Needed to convert a sparse matrix to a dense matrix (alternative todense)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SLDqwgbaiMh6"
      },
      "source": [
        "## Training and test set"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Dv39kAYoiQHP"
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "1cizhhgdgJfm"
      },
      "source": [
        "# Preprocessing"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MpB__a_Xgkua"
      },
      "source": [
        "### Standard scaler\n",
        "\n",
        "\"Standardize\" features by removing mean and scaling to a unitary variance (mean 0, variance 1).\n",
        "\n",
        "### Minmax scaler\n",
        "By default, it scales features between 0 and 1 according to the following transformation\n",
        "\n",
        "```\n",
        "X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n",
        "X_scaled = X_std * (max - min) + min\n",
        "```\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZaYuJPNSf8LJ"
      },
      "outputs": [],
      "source": [
        "from sklearn.preprocessing import StandardScaler, MinMaxScaler\n",
        "scaler = StandardScaler()\n",
        "#scaler = MinMaxScaler()\n",
        "scaler.fit(X_train)\n",
        "X_train = scaler.transform(X_train)\n",
        "X_test = scaler.transform(X_test)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FnLVhmNEiWBY"
      },
      "source": [
        "## Evaluation (classification)\n",
        "\n",
        "$\\textit{accuracy} = \\frac{\\text{TP}+\\text{TN}}{\\text{TP}+\\text{TN}+\\text{FP}+\\text{FN}}$\n",
        "\n",
        "$\\textit{precision} = \\frac{\\text{TP}}{\\text{TP}+\\text{FP}}$\n",
        "\n",
        "$\\textit{recall} = \\frac{\\text{TP}}{\\text{TP}+\\text{FN}}$\n",
        "\n",
        "*AUC* = Area Under the ROC Curve"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Cwkdch0OgVfX"
      },
      "outputs": [],
      "source": [
        "from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score\n",
        "\n",
        "def evaluate(y_test, y_pred):\n",
        "\tprint(\"accuracy:\", accuracy_score(y_test, y_pred))\n",
        "\tprint(\"precision:\", precision_score(y_test, y_pred))\n",
        "\tprint(\"recall:\", recall_score(y_test, y_pred))\n",
        "\t#print \"AUC:\", roc_auc_score(y_test, y_pred)\n",
        "\tprint"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qGzlW0klicp8"
      },
      "source": [
        "# Methods for classification"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TKpTJkIUihiu"
      },
      "source": [
        "## Decision trees"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3aVCtV8qig-6"
      },
      "outputs": [],
      "source": [
        "from sklearn import tree\n",
        "clf_tree = tree.DecisionTreeClassifier()\n",
        "#DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False)\n",
        "\n",
        "clf_tree = clf_tree.fit(X_train, y_train)\n",
        "y_pred = clf_tree.predict(X_test)\n",
        "\n",
        "print(\"DT\")\n",
        "evaluate(y_test, y_pred)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Hyeu6tRXixe5"
      },
      "source": [
        "## Multi Layer Perceptron"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ijWxdFscixKv"
      },
      "outputs": [],
      "source": [
        "from sklearn.neural_network import MLPClassifier\n",
        "mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, alpha=1, tol=1e-8, learning_rate_init=.01)\n",
        "\n",
        "mlp.fit(X_train, y_train)\n",
        "y_pred = mlp.predict(X_test)\n",
        "\n",
        "print(\"MLP\")\n",
        "evaluate(y_test, y_pred)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2YS7rIYHi7gT"
      },
      "source": [
        "## Support Vector Machine"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "txPYpUrviwDw"
      },
      "outputs": [],
      "source": [
        "#SVM\n",
        "from sklearn import svm\n",
        "svc = svm.SVC(gamma=0.0001, C=100.0) #kernel : string, optional (default=rbf) linear, poly, rbf, sigmoid\n",
        "#SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,  gamma=0.001, kernel='rbf', max_iter=-1, probability=False,  random_state=None, shrinking=True, tol=0.001, verbose=False)\n",
        "\n",
        "svc.fit(X_train, y_train)\n",
        "y_pred = svc.predict(X_test)\n",
        "\n",
        "print(\"SVM\")\n",
        "evaluate(y_test, y_pred)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gxjQLDnCjEZo"
      },
      "source": [
        "# Model selection"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Hs02S_qhjKZt"
      },
      "source": [
        "## k-fold cross validation and grid search"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Q8bsUxyljAgC"
      },
      "outputs": [],
      "source": [
        "from sklearn.model_selection import GridSearchCV, KFold\n",
        "from sklearn import metrics"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "255G5zG-jTWu"
      },
      "source": [
        "Dictionary containingthe parameter grid for SVM (linear kernel, RBF and custom) and NN"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "maC-atdcjPyF"
      },
      "outputs": [],
      "source": [
        "C_values = [2**i for i in range(-5,5)]\n",
        "p_grid = { \n",
        "            \"svm\" : [{\"C\": C_values, \"kernel\": [\"rbf\"], \"gamma\" : [10**i for i in range(-4, 4)]},\n",
        "\t\t\t         {\"C\": C_values, \"kernel\": [\"poly\"], \"degree\": [2+i for i in range(4)]}],\n",
        "            \"precomputed\" : {\"C\": C_values, \"kernel\": [\"precomputed\"]},\n",
        "\t\t\t\"nn\" : {\"alpha\" : [10**i for i in range(-5,1)], 'hidden_layer_sizes': [(10,), (50,), (100,), (200,)]}\n",
        "         }"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YM2pCKmKjXaa"
      },
      "outputs": [],
      "source": [
        "#from sklearn import svm\n",
        "#from sklearn.neural_network import MLPClassifier\n",
        "\n",
        "### hdden warnings of sklearn\n",
        "#import warnings\n",
        "#warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n",
        "\n",
        "kernel = \"precomputed\" #rbf/linear/poly\n",
        "\n",
        "#Example of linear kernel\n",
        "Klin = np.dot(X, X.T)\n",
        "\n",
        "### Test repeated N times\n",
        "skf = KFold(n_splits=5, shuffle=True, random_state=42)\n",
        "\n",
        "fold, accs = 1, []\n",
        "for train, test in skf.split(X, y):\n",
        "\tprint(\"FOLD:\", fold)\n",
        "\tX_train, X_test = X[train],  X[test]\n",
        "\t\n",
        "\t### The content of  X should be a custom kernel!\n",
        "\tif kernel == \"precomputed\": \n",
        "\t\tX_train, X_test = Klin[train][:,train], Klin[test][:,train]\n",
        "\t\n",
        "\t### Possible Preprocessing!!!\n",
        "\t\n",
        "\t### Validation of parameters with Grid Search\n",
        "\t#GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score='warn')\n",
        "\tclf = GridSearchCV(svm.SVC(kernel), param_grid=p_grid[\"precomputed\"], cv=5, scoring='accuracy') #SVM\n",
        "\t#clf = GridSearchCV(MLPClassifier(), param_grid=p_grid[\"nn\"], cv=5, scoring='accuracy') #NN\n",
        "\t\n",
        "\t### Training\n",
        "\tclf.fit(X_train, y[train])\n",
        "\t\n",
        "\t### Info about validation\n",
        "\t#print \"CV info:\", clf.cv_results_#.keys()\n",
        "\tprint(\"VALIDATION score:\", clf.best_score_)\n",
        "\tprint(\"BEST parameters:\", clf.best_params_)\n",
        "\t#clf.best_estimator_\n",
        "\t\n",
        "\t### score (not the prediction!) Useful to compute AUC (ranking metric)\n",
        "\t#y_pred = clf.decision_function(X_test) \n",
        "\t\n",
        "\ty_pred = clf.predict(X_test)\n",
        "\ty_true = y[test]\n",
        "\t\n",
        "\t### Training classification report and confusion matrix\n",
        "\tprint(metrics.classification_report(y_true, y_pred))\n",
        "\tprint(metrics.confusion_matrix(y_true, y_pred))\n",
        "\t\n",
        "\t#auc = roc_auc_score(y_true, y_pred) # AUC\n",
        "\tacc = accuracy_score(y_true, y_pred) # Accuracy\n",
        "\tprint(\"TEST score:{}\\n\".format(acc))\n",
        "\t\n",
        "\taccs.append(acc)\n",
        "\tfold += 1\n",
        "\n",
        "print(\"AVG ACCURACY:\", np.mean(accs), \"+-\", np.std(accs))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CFDvC2KOVbSJ"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "collapsed_sections": [],
      "name": "Lab02_AA2021.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3.10.4 64-bit",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.4"
    },
    "vscode": {
      "interpreter": {
        "hash": "6f59339097bb9ac4cf41ab9fa2e7f783ea6bb84442f6ce2c2671fa41ded377c2"
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}