{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "execution_count": 95, "metadata": { "id": "nEBxinWfFnnJ" }, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "source": [ "Import the dataset" ], "metadata": { "id": "Rx0dXECgSra-" } }, { "cell_type": "code", "source": [ "dataset = pd.read_csv(\"./tested.csv\")\n", "dataset.info()" ], "metadata": { "id": "a9EWjl4NGwxl", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0a889877-4bde-4108-9124-bbc0b83755b4" }, "execution_count": 97, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 418 entries, 0 to 417\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 PassengerId 418 non-null int64 \n", " 1 Survived 418 non-null int64 \n", " 2 Pclass 418 non-null int64 \n", " 3 Name 418 non-null object \n", " 4 Sex 418 non-null object \n", " 5 Age 332 non-null float64\n", " 6 SibSp 418 non-null int64 \n", " 7 Parch 418 non-null int64 \n", " 8 Ticket 418 non-null object \n", " 9 Fare 417 non-null float64\n", " 10 Cabin 91 non-null object \n", " 11 Embarked 418 non-null object \n", "dtypes: float64(2), int64(5), object(5)\n", "memory usage: 39.3+ KB\n" ] } ] }, { "cell_type": "markdown", "source": [ "Split patterns to labels" ], "metadata": { "id": "DoshY8QWStiz" } }, { "cell_type": "code", "source": [ "# The target column is kept aside as y before it is removed from the features.\n", "y = dataset[\"Survived\"]\n", "# Name the axis explicitly: passing it positionally to drop() is deprecated.\n", "dataset = dataset.drop(columns=[\"Survived\", \"Embarked\", \"Ticket\", \"Name\", \"PassengerId\", \"Cabin\"])\n", "dataset.head()" ], "metadata": { "id": "gpnu7pt6HDal", "colab": { "base_uri": "https://localhost:8080/", "height": 262 }, "outputId": "a3fd3795-147e-4b12-a217-f11df8a149e3" }, "execution_count": 98, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ 
"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only\n", " \n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " Pclass Sex Age SibSp Parch Fare\n", "0 3 male 34.5 0 0 7.8292\n", "1 3 female 47.0 1 0 7.0000\n", "2 2 male 62.0 0 0 9.6875\n", "3 3 male 27.0 0 0 8.6625\n", "4 3 female 22.0 1 1 12.2875" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PclassSexAgeSibSpParchFare
03male34.5007.8292
13female47.0107.0000
22male62.0009.6875
33male27.0008.6625
43female22.01112.2875
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 98 } ] }, { "cell_type": "markdown", "source": [ "Vectorize categorical variables (Sex)" ], "metadata": { "id": "EChES9ddaBv5" } }, { "cell_type": "code", "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "import numpy as np\n", "\n", "# Sex is binary here (male/female), so encode it as a single 0/1 indicator:\n", "# a two-column one-hot matrix cannot be stored in one DataFrame column.\n", "# With drop=\"if_binary\" the first category is dropped, so 1.0 marks the second one (male).\n", "encoder = OneHotEncoder(drop=\"if_binary\")\n", "# Double brackets select a 2-D frame (the input shape the encoder expects);\n", "# ravel() flattens the (n_samples, 1) result back into a single column.\n", "dataset[\"Sex\"] = encoder.fit_transform(dataset[[\"Sex\"]]).toarray().ravel()\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tBDcqI2tXzMo", "outputId": "0f14aa47-7efe-4989-d5a0-355f12bd3bfa" }, "execution_count": 99, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead.\n", " \"\"\"\n" ] } ] }, { "cell_type": "markdown", "source": [ "Check if there are missing values" ], "metadata": { "id": "07BoPiKiaELk" } }, { "cell_type": "code", "source": [ "print(\"Missing values for every feature:\")\n", "dataset.shape[0] - dataset.notnull().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PNpPhyVcaPMd", "outputId": "7895e316-ad98-4794-c768-57c5874096eb" }, "execution_count": 103, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Missing values for every feature:\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "Pclass 0\n", "Sex 0\n", "Age 86\n", "SibSp 0\n", "Parch 0\n", "Fare 1\n", "dtype: int64" ] }, "metadata": {}, "execution_count": 103 } ] }, { "cell_type": "markdown", "source": [ "We fill the missing values by using a Simple Imputer (strategy -> mean)" ], "metadata": { "id": "qJcm6BcecJIf" } }, { "cell_type": "code", "source": [ "from sklearn.impute import SimpleImputer\n", "\n", "def fillMissingValues(columnName):\n", "    \"\"\"Fill the missing values of one column of the global `dataset` in place.\n", "\n", "    columnName: label of the `dataset` column to impute with SimpleImputer's\n", "    default strategy (the column mean).\n", "    \"\"\"\n", "    imputer = SimpleImputer()\n", "    # [[columnName]] selects a 2-D frame, the input shape scikit-learn expects;\n", "    # ravel() flattens the (n_samples, 1) result back into a single column.\n", "    dataset[columnName] = imputer.fit_transform(dataset[[columnName]]).ravel()\n", "\n", "# NOTE: the imputer is fitted on the whole dataset, i.e. before any train/test split.\n", "fillMissingValues(\"Age\")\n", "fillMissingValues(\"Fare\")" ],
"metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yUE5p1qIa_Y1", "outputId": "434f3406-c379-46a9-9d02-6828445bc549" }, "execution_count": 105, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead.\n", " \"\"\"\n", "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead.\n", " \"\"\"\n" ] } ] }, { "cell_type": "markdown", "source": [ "Split data in Train / Test" ], "metadata": { "id": "cp-PaN67czSx" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import train_test_split\n", "X = dataset.loc[:,:]\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)" ], "metadata": { "id": "wFVt-2SqIQG-" }, "execution_count": 106, "outputs": [] }, { "cell_type": "markdown", "source": [ "Evaluate KNeighbors with data not-preprocessed" ], "metadata": { "id": "MvDRqGw8OHb-" } }, { "cell_type": "code", "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "clf = KNeighborsClassifier()\n", "clf.fit(X_train, y_train)\n", "clf.score(X_test, y_test)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "QjxF4Ni3Ip6j", "outputId": "c92544be-3300-4705-ef4e-baf211a30c84" }, "execution_count": 107, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.7238095238095238" ] }, "metadata": {}, "execution_count": 107 } ] }, { "cell_type": "markdown", "source": [ "Now try to preprocess data using 
a scaler" ], "metadata": { "id": "95CUpSy0ksuC" } }, { "cell_type": "code", "source": [ "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n", "from sklearn.pipeline import Pipeline\n", "\n", "# Scale the features inside the pipeline, then classify with KNN.\n", "# MinMaxScaler() is the alternative scaler to try here.\n", "scaler = StandardScaler()\n", "clf = Pipeline(\n", "    steps=[\n", "        (\"scaler\", scaler),\n", "        (\"estimator\", KNeighborsClassifier()),\n", "    ]\n", ")\n", "clf.fit(X_train, y_train)\n", "clf.score(X_test, y_test)" ], "metadata": { "id": "VCAUuuaeLttO", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "fac216b8-2a78-47b8-eaa6-f0b48ab1a7bd" }, "execution_count": 108, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9809523809523809" ] }, "metadata": {}, "execution_count": 108 } ] }, { "cell_type": "markdown", "source": [ "Now we try to execute Cross-Validation" ], "metadata": { "id": "NsNm-lctK3j5" } }, { "cell_type": "code", "source": [ "from sklearn.model_selection import cross_val_score\n", "\n", "# Cross-validate the current `clf` on the training split only.\n", "cv_score = cross_val_score(clf, X_train, y_train)\n", "print(\n", "    f\"Results of Cross-Validation: {cv_score}\",\n", "    f\"Avg of the results: {cv_score.mean()}\",\n", "    sep=\"\\n\",\n", ")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EdMZLVnwK6sJ", "outputId": "80b2647d-129e-42a4-bb32-1bbf5b85ec3f" }, "execution_count": 109, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Results of Cross-Validation: [1. 0.98412698 1. 1. 0.96774194]\n", "Avg of the results: 0.990373783922171\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "JubkZ36lLVbu" }, "execution_count": null, "outputs": [] } ] }