{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "# Application of Principal Component Analysis to the Iris dataset" ], "metadata": { "id": "b94slaKNpac5" } }, { "cell_type": "markdown", "source": [ "I apply PCA to the Iris dataset with different numbers of components, noting the effect on accuracy." ], "metadata": { "id": "MW-CAN33pxyR" } }, { "cell_type": "markdown", "source": [ "First of all, I import the necessary libraries." ], "metadata": { "id": "e4y6CSXhqCtg" } }, { "cell_type": "code", "source": [ "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "import pandas as pd\n", "from sklearn.datasets import load_iris\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.model_selection import cross_val_score\n", "from sklearn.linear_model import LogisticRegression" ], "metadata": { "id": "wGyrwmhbq5v5" }, "execution_count": 13, "outputs": [] }, { "cell_type": "markdown", "source": [ "I load the Iris dataset and show its first rows." ], "metadata": { "id": "Wl4yTdyOqQEn" } }, { "cell_type": "code", "source": [ "iris = load_iris()\n", "X = iris['data']\n", "y = iris['target']\n", "\n", "df = pd.DataFrame(iris['data'], columns=iris['feature_names'])\n", "df.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "_GpYFu3Rq57_", "outputId": "c74fb4a2-53c2-4879-c964-daea9d2a9007" }, "execution_count": 14, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)\n", "0                5.1               3.5                1.4               0.2\n", "1                4.9               3.0                1.4               0.2\n", "2                4.7               3.2                1.3               0.2\n", "3                4.6               3.1                1.5               0.2\n", "4                5.0               3.6                1.4               0.2" ], "text/html": [ "\n", "
\n", "  " ] }, "metadata": {}, "execution_count": 14 } ] }, { "cell_type": "markdown", "source": [ "I apply PCA and calculate the cross-validated accuracy for each number of components (from one to four) as follows:\n", "\n", "\n", "* create a pipeline that performs data standardization and PCA\n", "* run cross-validation with a logistic regression on the transformed data\n", "* print the mean accuracy\n", "\n", "This procedure is repeated four times, once for each number of components." ], "metadata": { "id": "Fw-x0gA-rB7w" } }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "t98efWm5lg1h", "outputId": "e3de2437-0664-4e86-b847-47330cffa564" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Number of components: 1. mean accuracy: 0.92000\n", "Number of components: 2. mean accuracy: 0.91333\n", "Number of components: 3. mean accuracy: 0.96000\n", "Number of components: 4. mean accuracy: 0.96000\n" ] } ], "source": [ "# Standardize the data, reduce it to n principal components, then score a\n", "# logistic regression on the reduced data with 5-fold cross-validation.\n", "for number_components in range(1, 5):\n", "    model = Pipeline([\n", "        ('scaler', StandardScaler()),\n", "        ('pca', PCA(n_components=number_components)),\n", "    ])\n", "\n", "    scores = cross_val_score(LogisticRegression(solver='lbfgs'), model.fit_transform(X), y, cv=5)\n", "\n", "    print(f\"Number of components: {number_components}. mean accuracy: {scores.mean():.5f}\")\n" ] } ] }