{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "772995b3-adfa-476c-8e0e-104f3a221518",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     PassengerId  Survived  Pclass  \\\n",
      "0              1         0       3   \n",
      "1              2         1       1   \n",
      "2              3         1       3   \n",
      "3              4         1       1   \n",
      "4              5         0       3   \n",
      "..           ...       ...     ...   \n",
      "886          887         0       2   \n",
      "887          888         1       1   \n",
      "888          889         0       3   \n",
      "889          890         1       1   \n",
      "890          891         0       3   \n",
      "\n",
      "                                                  Name     Sex   Age  SibSp  \\\n",
      "0                              Braund, Mr. Owen Harris    male  22.0      1   \n",
      "1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
      "2                               Heikkinen, Miss. Laina  female  26.0      0   \n",
      "3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
      "4                             Allen, Mr. William Henry    male  35.0      0   \n",
      "..                                                 ...     ...   ...    ...   \n",
      "886                              Montvila, Rev. Juozas    male  27.0      0   \n",
      "887                       Graham, Miss. Margaret Edith  female  19.0      0   \n",
      "888           Johnston, Miss. Catherine Helen \"Carrie\"  female   NaN      1   \n",
      "889                              Behr, Mr. Karl Howell    male  26.0      0   \n",
      "890                                Dooley, Mr. Patrick    male  32.0      0   \n",
      "\n",
      "     Parch            Ticket     Fare Cabin Embarked  \n",
      "0        0         A/5 21171   7.2500   NaN        S  \n",
      "1        0          PC 17599  71.2833   C85        C  \n",
      "2        0  STON/O2. 3101282   7.9250   NaN        S  \n",
      "3        0            113803  53.1000  C123        S  \n",
      "4        0            373450   8.0500   NaN        S  \n",
      "..     ...               ...      ...   ...      ...  \n",
      "886      0            211536  13.0000   NaN        S  \n",
      "887      0            112053  30.0000   B42        S  \n",
      "888      2        W./C. 6607  23.4500   NaN        S  \n",
      "889      0            111369  30.0000  C148        C  \n",
      "890      0            370376   7.7500   NaN        Q  \n",
      "\n",
      "[891 rows x 12 columns]\n"
     ]
    }
   ],
   "source": [
    "# Imports\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "import sklearn.metrics as metrics\n",
    "\n",
    "# Load the Titanic passenger records; the path is resolved relative to the working directory\n",
    "data_titanic = pd.read_csv(filepath_or_buffer=\"titanic_data.csv\")\n",
    "\n",
    "# Plain-text dump of the raw table (pandas abbreviates the middle rows)\n",
    "print(data_titanic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f229134f-2d20-4322-b58f-eb6278b51f40",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age     Fare Embarked\n",
      "0           0       3    male  22.0   7.2500        S\n",
      "1           1       1  female  38.0  71.2833        C\n",
      "2           1       3  female  26.0   7.9250        S\n",
      "3           1       1  female  35.0  53.1000        S\n",
      "4           0       3    male  35.0   8.0500        S\n",
      "..        ...     ...     ...   ...      ...      ...\n",
      "886         0       2    male  27.0  13.0000        S\n",
      "887         1       1  female  19.0  30.0000        S\n",
      "888         0       3  female   NaN  23.4500        S\n",
      "889         1       1    male  26.0  30.0000        C\n",
      "890         0       3    male  32.0   7.7500        Q\n",
      "\n",
      "[891 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "# Not all data is important for the training. PassengerId, Name, SibSp, Parch, Ticket and Cabin\n",
    "# are not used as features and are therefore dropped.\n",
    "# The columns are selected by name (not by position) so the cell does not depend on the column\n",
    "# order of the CSV; errors=\"ignore\" keeps the cell re-runnable once the columns are already gone.\n",
    "unused_columns = [\"PassengerId\", \"Name\", \"SibSp\", \"Parch\", \"Ticket\", \"Cabin\"]\n",
    "data_titanic = data_titanic.drop(columns=unused_columns, errors=\"ignore\")\n",
    "print(data_titanic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c5796121-905e-4841-8ec3-5227db142ee6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Survived      0\n",
       "Pclass        0\n",
       "Sex           0\n",
       "Age         177\n",
       "Fare          0\n",
       "Embarked      2\n",
       "dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the missing values per column (isna flags empty entries, sum adds them up column-wise)\n",
    "data_titanic.isna().sum()\n",
    "# Result: only the Age and the Embarked columns contain missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "1c09781a-7c0a-4785-a0bb-fb249e236b12",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Age average:  30\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Survived    0\n",
       "Pclass      0\n",
       "Sex         0\n",
       "Age         0\n",
       "Fare        0\n",
       "Embarked    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fill the gaps:\n",
    "# For the age: use the rounded mean age of all passengers (Series.mean skips NaN by default)\n",
    "age_average = round(data_titanic[\"Age\"].mean())\n",
    "print('Age average: ', age_average)\n",
    "\n",
    "# For the embarked: take the most frequently occurring value, available options: S, C and Q.\n",
    "# Series.mode ignores NaN and returns the modes sorted, so a tie is resolved deterministically\n",
    "# by taking the first entry (the previous hand-written comparison chain could overwrite its\n",
    "# own result when two ports had the same count).\n",
    "common_embarked = data_titanic[\"Embarked\"].mode().iloc[0]\n",
    "\n",
    "# Fill both columns in one vectorised step; columns not listed are left untouched\n",
    "data_titanic = data_titanic.fillna({\"Age\": age_average, \"Embarked\": common_embarked})\n",
    "\n",
    "# Check replacement: every column should now report 0 missing values\n",
    "data_titanic.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "b14df726-807f-4dea-a360-1691b58bece7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass  Sex   Age     Fare  Embarked\n",
      "0           0       3    0  22.0   7.2500         0\n",
      "1           1       1    1  38.0  71.2833         1\n",
      "2           1       3    1  26.0   7.9250         0\n",
      "3           1       1    1  35.0  53.1000         0\n",
      "4           0       3    0  35.0   8.0500         0\n",
      "..        ...     ...  ...   ...      ...       ...\n",
      "886         0       2    0  27.0  13.0000         0\n",
      "887         1       1    1  19.0  30.0000         0\n",
      "888         0       3    1  30.0  23.4500         0\n",
      "889         1       1    0  26.0  30.0000         1\n",
      "890         0       3    0  32.0   7.7500         2\n",
      "\n",
      "[891 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "# Gender and Embarked have to be replaced by values: Set male = 0, female = 1 and S = 0, C = 1, Q = 2\n",
    "# The columns are addressed by name rather than by position so the encoding cannot land on\n",
    "# the wrong column if the column layout changes. replace() leaves values that are not in the\n",
    "# mapping untouched, so re-running this cell on already encoded data is harmless.\n",
    "category_codes = {\n",
    "    \"Sex\": {\"male\": 0, \"female\": 1},\n",
    "    \"Embarked\": {\"S\": 0, \"C\": 1, \"Q\": 2},\n",
    "}\n",
    "data_titanic = data_titanic.replace(category_codes)\n",
    "\n",
    "# Check replacement\n",
    "print(data_titanic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6328a9b8-da13-4abe-b9f4-6f7b10bcc1d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "KNN_3: Accuracy:  0.672645739910314 , Recall:  0.6486486486486487 , Precision:  0.5052631578947369\n",
      "KNN_4: Accuracy:  0.6591928251121076 , Recall:  0.7021276595744681 , Precision:  0.3473684210526316\n",
      "KNN_5: Accuracy:  0.6816143497757847 , Recall:  0.6666666666666666 , Precision:  0.5052631578947369\n"
     ]
    }
   ],
   "source": [
    "# Run a KNN prediction\n",
    "\n",
    "# Fixed seed so the train/evaluation split (and therefore the reported scores) is reproducible\n",
    "RANDOM_SEED = 42\n",
    "\n",
    "# Split data into train and evaluation subsets: 75 % training, 25 % evaluation\n",
    "input_data = data_titanic.drop(columns=\"Survived\") # Input has to be without the \"Survived\" column\n",
    "target_data = data_titanic['Survived'] # KNN algorithm has to predict the survived statement\n",
    "input_train, input_evaluate, target_train, target_evaluate = train_test_split(\n",
    "    input_data, target_data, test_size = 0.25, random_state = RANDOM_SEED\n",
    ")\n",
    "\n",
    "# Train, evaluate and score one classifier for each of k = 3, 4 and 5\n",
    "for k in (3, 4, 5):\n",
    "    # Initialise and train\n",
    "    knn = KNeighborsClassifier(n_neighbors=k)\n",
    "    knn.fit(input_train, target_train)\n",
    "\n",
    "    # Perform evaluation\n",
    "    target_predicted = knn.predict(input_evaluate)\n",
    "\n",
    "    # Investigate accuracy, recall and precision.\n",
    "    # The sklearn metrics expect (y_true, y_pred); passing them the other way round\n",
    "    # exchanges recall and precision (accuracy is symmetric and unaffected).\n",
    "    accuracy = metrics.accuracy_score(target_evaluate, target_predicted)\n",
    "    recall = metrics.recall_score(target_evaluate, target_predicted)\n",
    "    precision = metrics.precision_score(target_evaluate, target_predicted)\n",
    "\n",
    "    print(f\"KNN_{k}: Accuracy: \", accuracy, \", Recall: \", recall, \", Precision: \", precision)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c88e5dc0-2358-4773-804c-d070b09dff4b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}