{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 95,
      "metadata": {
        "id": "nEBxinWfFnnJ"
      },
      "outputs": [],
      "source": [
        "import pandas as pd"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Import the dataset"
      ],
      "metadata": {
        "id": "Rx0dXECgSra-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset = pd.read_csv(\"./tested.csv\")\n",
        "dataset.info()"
      ],
      "metadata": {
        "id": "a9EWjl4NGwxl",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "0a889877-4bde-4108-9124-bbc0b83755b4"
      },
      "execution_count": 97,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 418 entries, 0 to 417\n",
            "Data columns (total 12 columns):\n",
            " #   Column       Non-Null Count  Dtype  \n",
            "---  ------       --------------  -----  \n",
            " 0   PassengerId  418 non-null    int64  \n",
            " 1   Survived     418 non-null    int64  \n",
            " 2   Pclass       418 non-null    int64  \n",
            " 3   Name         418 non-null    object \n",
            " 4   Sex          418 non-null    object \n",
            " 5   Age          332 non-null    float64\n",
            " 6   SibSp        418 non-null    int64  \n",
            " 7   Parch        418 non-null    int64  \n",
            " 8   Ticket       418 non-null    object \n",
            " 9   Fare         417 non-null    float64\n",
            " 10  Cabin        91 non-null     object \n",
            " 11  Embarked     418 non-null    object \n",
            "dtypes: float64(2), int64(5), object(5)\n",
            "memory usage: 39.3+ KB\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Split patterns to labels"
      ],
      "metadata": {
        "id": "DoshY8QWStiz"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "y = dataset[\"Survived\"]\n",
        "dataset = dataset.drop([\"Survived\", \"Embarked\", \"Ticket\", \"Name\", \"PassengerId\", \"Cabin\",], 1)\n",
        "dataset.head()"
      ],
      "metadata": {
        "id": "gpnu7pt6HDal",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 262
        },
        "outputId": "a3fd3795-147e-4b12-a217-f11df8a149e3"
      },
      "execution_count": 98,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only\n",
            "  \n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "   Pclass     Sex   Age  SibSp  Parch     Fare\n",
              "0       3    male  34.5      0      0   7.8292\n",
              "1       3  female  47.0      1      0   7.0000\n",
              "2       2    male  62.0      0      0   9.6875\n",
              "3       3    male  27.0      0      0   8.6625\n",
              "4       3  female  22.0      1      1  12.2875"
            ],
            "text/html": [
              "\n",
              "  <div id=\"df-0a933eeb-8bf7-4227-a183-4a8364e04951\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Pclass</th>\n",
              "      <th>Sex</th>\n",
              "      <th>Age</th>\n",
              "      <th>SibSp</th>\n",
              "      <th>Parch</th>\n",
              "      <th>Fare</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>3</td>\n",
              "      <td>male</td>\n",
              "      <td>34.5</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>7.8292</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>3</td>\n",
              "      <td>female</td>\n",
              "      <td>47.0</td>\n",
              "      <td>1</td>\n",
              "      <td>0</td>\n",
              "      <td>7.0000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2</td>\n",
              "      <td>male</td>\n",
              "      <td>62.0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>9.6875</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3</td>\n",
              "      <td>male</td>\n",
              "      <td>27.0</td>\n",
              "      <td>0</td>\n",
              "      <td>0</td>\n",
              "      <td>8.6625</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>3</td>\n",
              "      <td>female</td>\n",
              "      <td>22.0</td>\n",
              "      <td>1</td>\n",
              "      <td>1</td>\n",
              "      <td>12.2875</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0a933eeb-8bf7-4227-a183-4a8364e04951')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-0a933eeb-8bf7-4227-a183-4a8364e04951 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-0a933eeb-8bf7-4227-a183-4a8364e04951');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ]
          },
          "metadata": {},
          "execution_count": 98
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Vectorize categorial variables (Sex)"
      ],
      "metadata": {
        "id": "EChES9ddaBv5"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import OneHotEncoder\n",
        "import numpy as np\n",
        "\n",
        "encoder = OneHotEncoder()\n",
        "dataset[\"Sex\"] = encoder.fit_transform(dataset[\"Sex\"][:, np.newaxis]).toarray()\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tBDcqI2tXzMo",
        "outputId": "0f14aa47-7efe-4989-d5a0-355f12bd3bfa"
      },
      "execution_count": 99,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.\n",
            "  \"\"\"\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Check if there are missing values"
      ],
      "metadata": {
        "id": "07BoPiKiaELk"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "print(\"Missing values for every feature:\")\n",
        "dataset.shape[0] - dataset.notnull().sum()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PNpPhyVcaPMd",
        "outputId": "7895e316-ad98-4794-c768-57c5874096eb"
      },
      "execution_count": 103,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Missing values for every feature:\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "Pclass     0\n",
              "Sex        0\n",
              "Age       86\n",
              "SibSp      0\n",
              "Parch      0\n",
              "Fare       1\n",
              "dtype: int64"
            ]
          },
          "metadata": {},
          "execution_count": 103
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We fill the missing values by using a Simple Imputer (strategy -> mean)"
      ],
      "metadata": {
        "id": "qJcm6BcecJIf"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.impute import SimpleImputer\n",
        "\n",
        "def fillMissingValues(columnName):\n",
        "  imputer = SimpleImputer()\n",
        "  dataset[columnName] = imputer.fit_transform(dataset[columnName][:, np.newaxis])\n",
        "\n",
        "fillMissingValues(\"Age\")\n",
        "fillMissingValues(\"Fare\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yUE5p1qIa_Y1",
        "outputId": "434f3406-c379-46a9-9d02-6828445bc549"
      },
      "execution_count": 105,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.\n",
            "  \"\"\"\n",
            "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.\n",
            "  \"\"\"\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Split data in Train / Test"
      ],
      "metadata": {
        "id": "cp-PaN67czSx"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "X = dataset.loc[:,:]\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
      ],
      "metadata": {
        "id": "wFVt-2SqIQG-"
      },
      "execution_count": 106,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Evaluate KNeighbors with data not-preprocessed"
      ],
      "metadata": {
        "id": "MvDRqGw8OHb-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.neighbors import KNeighborsClassifier\n",
        "\n",
        "clf = KNeighborsClassifier()\n",
        "clf.fit(X_train, y_train)\n",
        "clf.score(X_test, y_test)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "QjxF4Ni3Ip6j",
        "outputId": "c92544be-3300-4705-ef4e-baf211a30c84"
      },
      "execution_count": 107,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.7238095238095238"
            ]
          },
          "metadata": {},
          "execution_count": 107
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now try to preprocess data using a scaler"
      ],
      "metadata": {
        "id": "95CUpSy0ksuC"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
        "from sklearn.pipeline import Pipeline\n",
        "\n",
        "# scaler = MinMaxScaler()\n",
        "scaler = StandardScaler()\n",
        "clf = Pipeline([('scaler', scaler), ('estimator', KNeighborsClassifier())])\n",
        "clf.fit(X_train, y_train)\n",
        "clf.score(X_test, y_test)"
      ],
      "metadata": {
        "id": "VCAUuuaeLttO",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "fac216b8-2a78-47b8-eaa6-f0b48ab1a7bd"
      },
      "execution_count": 108,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.9809523809523809"
            ]
          },
          "metadata": {},
          "execution_count": 108
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now we try to execute Cross-Validation"
      ],
      "metadata": {
        "id": "NsNm-lctK3j5"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.model_selection import cross_val_score\n",
        "\n",
        "cv_score = cross_val_score(clf, X_train, y_train)\n",
        "print(f\"Results of Cross-Validation: {cv_score}\")\n",
        "print(f\"Avg of the results: {cv_score.mean()}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "EdMZLVnwK6sJ",
        "outputId": "80b2647d-129e-42a4-bb32-1bbf5b85ec3f"
      },
      "execution_count": 109,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Results of Cross-Validation: [1.         0.98412698 1.         1.         0.96774194]\n",
            "Avg of the results: 0.990373783922171\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "JubkZ36lLVbu"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}