{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "772995b3-adfa-476c-8e0e-104f3a221518", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", ".. ... ... ... \n", "886 887 0 2 \n", "887 888 1 1 \n", "888 889 0 3 \n", "889 890 1 1 \n", "890 891 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", ".. ... ... ... ... \n", "886 Montvila, Rev. Juozas male 27.0 0 \n", "887 Graham, Miss. Margaret Edith female 19.0 0 \n", "888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n", "889 Behr, Mr. Karl Howell male 26.0 0 \n", "890 Dooley, Mr. Patrick male 32.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S \n", ".. ... ... ... ... ... \n", "886 0 211536 13.0000 NaN S \n", "887 0 112053 30.0000 B42 S \n", "888 2 W./C. 
6607 23.4500 NaN S \n", "889 0 111369 30.0000 C148 C \n", "890 0 370376 7.7500 NaN Q \n", "\n", "[891 rows x 12 columns]\n" ] } ],
"source": [
"# Third-party dependencies used by the cells of this notebook\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"import sklearn.metrics as metrics\n",
"\n",
"# Load the Titanic passenger records; the path is relative to the working directory\n",
"titanic_csv = \"titanic_data.csv\"\n",
"data_titanic = pd.read_csv(titanic_csv)\n",
"\n",
"# Show the raw table (pandas abbreviates long frames to their first and last rows)\n",
"print(data_titanic)" ] },
{ "cell_type": "code", "execution_count": 2, "id": "f229134f-2d20-4322-b58f-eb6278b51f40", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age Fare Embarked\n", "0 0 3 male 22.0 7.2500 S\n", "1 1 1 female 38.0 71.2833 C\n", "2 1 3 female 26.0 7.9250 S\n", "3 1 1 female 35.0 53.1000 S\n", "4 0 3 male 35.0 8.0500 S\n", ".. ... ... ... ... ... ...\n", "886 0 2 male 27.0 13.0000 S\n", "887 1 1 female 19.0 30.0000 S\n", "888 0 3 female NaN 23.4500 S\n", "889 1 1 male 26.0 30.0000 C\n", "890 0 3 male 32.0 7.7500 Q\n", "\n", "[891 rows x 6 columns]\n" ] } ],
"source": [ "# Not all data is important for the training. 
The PassengerId, the name, the number of siblings, the parch, the ticket number\n",
"# and the cabin number are not important and therefore they will be dropped\n",
"# Drop by column name instead of by position so that a changed CSV layout\n",
"# raises an error rather than silently removing the wrong columns\n",
"data_titanic = data_titanic.drop(columns=[\"PassengerId\", \"Name\", \"SibSp\", \"Parch\", \"Ticket\", \"Cabin\"])\n",
"print(data_titanic)" ] },
{ "cell_type": "code", "execution_count": 3, "id": "c5796121-905e-4841-8ec3-5227db142ee6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Survived 0\n", "Pclass 0\n", "Sex 0\n", "Age 177\n", "Fare 0\n", "Embarked 2\n", "dtype: int64" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ],
"source": [
"# Check for missing values in the data set\n",
"data_titanic.isnull().sum() # Counts the missing entries per column\n",
"# As can be seen, the Age and the Embarked columns have missing values" ] },
{ "cell_type": "code", "execution_count": 4, "id": "1c09781a-7c0a-4785-a0bb-fb249e236b12", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Age average: 30\n" ] }, { "data": { "text/plain": [ "Survived 0\n", "Pclass 0\n", "Sex 0\n", "Age 0\n", "Fare 0\n", "Embarked 0\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ],
"source": [
"# Fill the gaps\n",
"\n",
"# Age: replace missing values with the rounded mean age of all passengers\n",
"# (Series.mean skips missing values by default)\n",
"age_average = round(data_titanic[\"Age\"].mean())\n",
"print('Age average: ', age_average)\n",
"data_titanic[\"Age\"] = data_titanic[\"Age\"].fillna(age_average)\n",
"\n",
"# Embarked: replace missing values with the most frequently occurring port\n",
"# (available options: S, C and Q). mode() ignores missing values and returns\n",
"# tied values in sorted order, so taking the first entry is deterministic and\n",
"# does not rely on the row index labels being 0..n-1\n",
"common_embarked = data_titanic[\"Embarked\"].mode().iloc[0]\n",
"data_titanic[\"Embarked\"] = data_titanic[\"Embarked\"].fillna(common_embarked)\n",
"\n",
"# Check replacement: every column should now report 0 missing values\n",
"data_titanic.isnull().sum()" ] },
{ "cell_type": "code", "execution_count": 5, "id": "b14df726-807f-4dea-a360-1691b58bece7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age Fare Embarked\n", "0 0 3 0 22.0 7.2500 0\n", "1 1 1 1 38.0 71.2833 1\n", "2 1 3 1 26.0 7.9250 0\n", "3 1 1 1 35.0 53.1000 0\n", "4 0 3 0 35.0 8.0500 0\n", ".. ... ... ... ... ... 
...\n", "886 0 2 0 27.0 13.0000 0\n", "887 1 1 1 19.0 30.0000 0\n", "888 0 3 1 30.0 23.4500 0\n", "889 1 1 0 26.0 30.0000 1\n", "890 0 3 0 32.0 7.7500 2\n", "\n", "[891 rows x 6 columns]\n" ] } ],
"source": [
"# Sex and Embarked hold strings and have to be replaced by numbers:\n",
"# male = 0, female = 1 and S = 0, C = 1, Q = 2\n",
"category_codes = {\n",
"    \"Sex\": {\"male\": 0, \"female\": 1},\n",
"    \"Embarked\": {\"S\": 0, \"C\": 1, \"Q\": 2},\n",
"}\n",
"data_titanic = data_titanic.replace(category_codes)\n",
"\n",
"# Check replacement\n",
"print(data_titanic)" ] },
{ "cell_type": "code", "execution_count": null, "id": "6328a9b8-da13-4abe-b9f4-6f7b10bcc1d9", "metadata": {}, "outputs": [],
"source": [
"# Run a KNN prediction\n",
"\n",
"# Split data into train and evaluation subsets: 75 % train, 25 % evaluation\n",
"input_data = data_titanic.drop(columns=[\"Survived\"])  # Features: everything except the label\n",
"target_data = data_titanic[\"Survived\"]  # Label the classifier has to predict\n",
"input_train, input_evaluate, target_train, target_evaluate = train_test_split(\n",
"    input_data, target_data, test_size=0.25, random_state=42  # fixed seed keeps the split reproducible\n",
")\n",
"\n",
"# Train and evaluate one classifier for each of k = 3, 4 and 5\n",
"knn_models = {}  # fitted classifiers, keyed by their number of neighbours\n",
"for k in (3, 4, 5):\n",
"    # Initialise and train\n",
"    knn = KNeighborsClassifier(n_neighbors=k)\n",
"    knn.fit(input_train, target_train)\n",
"    knn_models[k] = knn\n",
"\n",
"    # Perform evaluation\n",
"    target_predicted = knn.predict(input_evaluate)\n",
"\n",
"    # Investigate accuracy, recall and precision.\n",
"    # The scikit-learn metric functions take (y_true, y_pred) in this order;\n",
"    # passing them the other way round exchanges recall and precision\n",
"    accuracy = metrics.accuracy_score(target_evaluate, target_predicted)\n",
"    recall = metrics.recall_score(target_evaluate, target_predicted)\n",
"    precision = metrics.precision_score(target_evaluate, target_predicted)\n",
"\n",
"    print(f\"KNN_{k}: Accuracy: \", accuracy, \", Recall: \", recall, \", Precision: \", precision)" ] },
{ "cell_type": "code", "execution_count": null, "id": "c88e5dc0-2358-4773-804c-d070b09dff4b", "metadata": {}, "outputs": [], "source": [] } ],
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }