{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a40c3060",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn.datasets import fetch_openml\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from sklearn.preprocessing import StandardScaler"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "6ca0451b",
"metadata": {},
"outputs": [],
"source": [
"df_titanic = fetch_openml(\"titanic\", version = 1, as_frame = True)[\"data\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "108b699a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1309, 13)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0f68c698",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare',\n",
" 'cabin', 'embarked', 'boat', 'body', 'home.dest'],\n",
" dtype='object')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.columns"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "678f1623",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home.dest | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" Allen, Miss. Elisabeth Walton | \n",
" female | \n",
" 29.0000 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 24160 | \n",
" 211.3375 | \n",
" B5 | \n",
" S | \n",
" 2 | \n",
" NaN | \n",
" St Louis, MO | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" Allison, Master. Hudson Trevor | \n",
" male | \n",
" 0.9167 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 113781 | \n",
" 151.5500 | \n",
" C22 C26 | \n",
" S | \n",
" 11 | \n",
" NaN | \n",
" Montreal, PQ / Chesterville, ON | \n",
"
\n",
" \n",
" 2 | \n",
" 1.0 | \n",
" Allison, Miss. Helen Loraine | \n",
" female | \n",
" 2.0000 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 113781 | \n",
" 151.5500 | \n",
" C22 C26 | \n",
" S | \n",
" None | \n",
" NaN | \n",
" Montreal, PQ / Chesterville, ON | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" Allison, Mr. Hudson Joshua Creighton | \n",
" male | \n",
" 30.0000 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 113781 | \n",
" 151.5500 | \n",
" C22 C26 | \n",
" S | \n",
" None | \n",
" 135.0 | \n",
" Montreal, PQ / Chesterville, ON | \n",
"
\n",
" \n",
" 4 | \n",
" 1.0 | \n",
" Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | \n",
" female | \n",
" 25.0000 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 113781 | \n",
" 151.5500 | \n",
" C22 C26 | \n",
" S | \n",
" None | \n",
" NaN | \n",
" Montreal, PQ / Chesterville, ON | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass name sex age \\\n",
"0 1.0 Allen, Miss. Elisabeth Walton female 29.0000 \n",
"1 1.0 Allison, Master. Hudson Trevor male 0.9167 \n",
"2 1.0 Allison, Miss. Helen Loraine female 2.0000 \n",
"3 1.0 Allison, Mr. Hudson Joshua Creighton male 30.0000 \n",
"4 1.0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 \n",
"\n",
" sibsp parch ticket fare cabin embarked boat body \\\n",
"0 0.0 0.0 24160 211.3375 B5 S 2 NaN \n",
"1 1.0 2.0 113781 151.5500 C22 C26 S 11 NaN \n",
"2 1.0 2.0 113781 151.5500 C22 C26 S None NaN \n",
"3 1.0 2.0 113781 151.5500 C22 C26 S None 135.0 \n",
"4 1.0 2.0 113781 151.5500 C22 C26 S None NaN \n",
"\n",
" home.dest \n",
"0 St Louis, MO \n",
"1 Montreal, PQ / Chesterville, ON \n",
"2 Montreal, PQ / Chesterville, ON \n",
"3 Montreal, PQ / Chesterville, ON \n",
"4 Montreal, PQ / Chesterville, ON "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2f31b7e0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" name | \n",
" sex | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" ticket | \n",
" fare | \n",
" cabin | \n",
" embarked | \n",
" boat | \n",
" body | \n",
" home.dest | \n",
"
\n",
" \n",
" \n",
" \n",
" 1304 | \n",
" 3.0 | \n",
" Zabour, Miss. Hileni | \n",
" female | \n",
" 14.5 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 2665 | \n",
" 14.4542 | \n",
" None | \n",
" C | \n",
" None | \n",
" 328.0 | \n",
" None | \n",
"
\n",
" \n",
" 1305 | \n",
" 3.0 | \n",
" Zabour, Miss. Thamine | \n",
" female | \n",
" NaN | \n",
" 1.0 | \n",
" 0.0 | \n",
" 2665 | \n",
" 14.4542 | \n",
" None | \n",
" C | \n",
" None | \n",
" NaN | \n",
" None | \n",
"
\n",
" \n",
" 1306 | \n",
" 3.0 | \n",
" Zakarian, Mr. Mapriededer | \n",
" male | \n",
" 26.5 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2656 | \n",
" 7.2250 | \n",
" None | \n",
" C | \n",
" None | \n",
" 304.0 | \n",
" None | \n",
"
\n",
" \n",
" 1307 | \n",
" 3.0 | \n",
" Zakarian, Mr. Ortin | \n",
" male | \n",
" 27.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 2670 | \n",
" 7.2250 | \n",
" None | \n",
" C | \n",
" None | \n",
" NaN | \n",
" None | \n",
"
\n",
" \n",
" 1308 | \n",
" 3.0 | \n",
" Zimmerman, Mr. Leo | \n",
" male | \n",
" 29.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 315082 | \n",
" 7.8750 | \n",
" None | \n",
" S | \n",
" None | \n",
" NaN | \n",
" None | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass name sex age sibsp parch ticket \\\n",
"1304 3.0 Zabour, Miss. Hileni female 14.5 1.0 0.0 2665 \n",
"1305 3.0 Zabour, Miss. Thamine female NaN 1.0 0.0 2665 \n",
"1306 3.0 Zakarian, Mr. Mapriededer male 26.5 0.0 0.0 2656 \n",
"1307 3.0 Zakarian, Mr. Ortin male 27.0 0.0 0.0 2670 \n",
"1308 3.0 Zimmerman, Mr. Leo male 29.0 0.0 0.0 315082 \n",
"\n",
" fare cabin embarked boat body home.dest \n",
"1304 14.4542 None C None 328.0 None \n",
"1305 14.4542 None C None NaN None \n",
"1306 7.2250 None C None 304.0 None \n",
"1307 7.2250 None C None NaN None \n",
"1308 7.8750 None S None NaN None "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.tail()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c523519b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 1309 entries, 0 to 1308\n",
"Data columns (total 13 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 pclass 1309 non-null float64 \n",
" 1 name 1309 non-null object \n",
" 2 sex 1309 non-null category\n",
" 3 age 1046 non-null float64 \n",
" 4 sibsp 1309 non-null float64 \n",
" 5 parch 1309 non-null float64 \n",
" 6 ticket 1309 non-null object \n",
" 7 fare 1308 non-null float64 \n",
" 8 cabin 295 non-null object \n",
" 9 embarked 1307 non-null category\n",
" 10 boat 486 non-null object \n",
" 11 body 121 non-null float64 \n",
" 12 home.dest 745 non-null object \n",
"dtypes: category(2), float64(6), object(5)\n",
"memory usage: 115.4+ KB\n"
]
}
],
"source": [
"df_titanic.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ad1c6271",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" body | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1309.000000 | \n",
" 1046.000000 | \n",
" 1309.000000 | \n",
" 1309.000000 | \n",
" 1308.000000 | \n",
" 121.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 2.294882 | \n",
" 29.881135 | \n",
" 0.498854 | \n",
" 0.385027 | \n",
" 33.295479 | \n",
" 160.809917 | \n",
"
\n",
" \n",
" std | \n",
" 0.837836 | \n",
" 14.413500 | \n",
" 1.041658 | \n",
" 0.865560 | \n",
" 51.758668 | \n",
" 97.696922 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000 | \n",
" 0.166700 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 2.000000 | \n",
" 21.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 7.895800 | \n",
" 72.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 3.000000 | \n",
" 28.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 14.454200 | \n",
" 155.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 3.000000 | \n",
" 39.000000 | \n",
" 1.000000 | \n",
" 0.000000 | \n",
" 31.275000 | \n",
" 256.000000 | \n",
"
\n",
" \n",
" max | \n",
" 3.000000 | \n",
" 80.000000 | \n",
" 8.000000 | \n",
" 9.000000 | \n",
" 512.329200 | \n",
" 328.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass age sibsp parch fare \\\n",
"count 1309.000000 1046.000000 1309.000000 1309.000000 1308.000000 \n",
"mean 2.294882 29.881135 0.498854 0.385027 33.295479 \n",
"std 0.837836 14.413500 1.041658 0.865560 51.758668 \n",
"min 1.000000 0.166700 0.000000 0.000000 0.000000 \n",
"25% 2.000000 21.000000 0.000000 0.000000 7.895800 \n",
"50% 3.000000 28.000000 0.000000 0.000000 14.454200 \n",
"75% 3.000000 39.000000 1.000000 0.000000 31.275000 \n",
"max 3.000000 80.000000 8.000000 9.000000 512.329200 \n",
"\n",
" body \n",
"count 121.000000 \n",
"mean 160.809917 \n",
"std 97.696922 \n",
"min 1.000000 \n",
"25% 72.000000 \n",
"50% 155.000000 \n",
"75% 256.000000 \n",
"max 328.000000 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.describe()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "61b8168d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize = (8, 6))\n",
"sns.heatmap(df_titanic.corr(), cmap = \"BuPu\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7a74e552",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"male 843\n",
"female 466\n",
"Name: sex, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic[\"sex\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "e4b9b64c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize = (8, 6))\n",
"plt.pie(df_titanic[\"sex\"].value_counts(), explode = [0, 0.05], labels = [\"Female\", \"Male\"], colors = [\"#FF82AB\", \"#8E388E\"])\n",
"plt.legend()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f0e4d736",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"35.6"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.round((sum(df_titanic[\"sex\"] == \"female\") / (df_titanic.shape[0]) * 100), 2)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9187c02b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"64.4"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.round((sum(df_titanic[\"sex\"] == \"male\") / (df_titanic.shape[0]) * 100), 2)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ccbfa540",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"30.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.round(df_titanic[\"age\"].mean(), 0)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b68deced",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pclass 0\n",
"name 0\n",
"sex 0\n",
"age 263\n",
"sibsp 0\n",
"parch 0\n",
"ticket 0\n",
"fare 1\n",
"cabin 1014\n",
"embarked 2\n",
"boat 823\n",
"body 1188\n",
"home.dest 564\n",
"dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "10093c65",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.set()\n",
"missing_values = pd.DataFrame(df_titanic.isnull().sum()/len(df_titanic) * 100)\n",
"missing_values.plot(kind = \"bar\",\n",
" title = \"Percentage of Missing Values\",\n",
" ylabel = \"Percentage\",\n",
" color = \"#00CD66\")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "7ee19288",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Datatypes of Missing Values:\n",
"age : nan\n",
"fare : nan\n",
"cabin : None\n",
"embarked : nan\n",
"boat : None\n",
"body : nan\n",
"home.dest : None\n"
]
}
],
"source": [
"#Imputing Missing Values:\n",
"print(\"Datatypes of Missing Values:\")\n",
"for col in df_titanic.columns[df_titanic.isnull().any()]:\n",
" print(col, \":\", df_titanic[col][df_titanic[col].isnull()].values[0])"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "aee16690",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'age': {'missing_values': nan, 'strategy': 'mean'},\n",
" 'fare': {'missing_values': nan, 'strategy': 'mean'},\n",
" 'cabin': {'missing_values': None, 'strategy': 'most_frequent'},\n",
" 'embarked': {'missing_values': nan, 'strategy': 'most_frequent'},\n",
" 'boat': {'missing_values': None, 'strategy': 'most_frequent'},\n",
" 'body': {'missing_values': nan, 'strategy': 'mean'},\n",
" 'home.dest': {'missing_values': None, 'strategy': 'most_frequent'}}"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_parameters(df_titanic):\n",
" parameters = {}\n",
" for col in df_titanic.columns[df_titanic.isnull().any()]:\n",
" if df_titanic[col].dtype == \"float64\" or df_titanic[col].dtype == \"int64\" or df_titanic[col].dtype == \"int32\":\n",
" strategy = \"mean\"\n",
" else:\n",
" strategy = \"most_frequent\"\n",
" missing_values = df_titanic[col][df_titanic[col].isnull()].values[0]\n",
" parameters[col] = {\"missing_values\" : missing_values, \n",
" \"strategy\" : strategy}\n",
" return parameters\n",
"get_parameters(df_titanic)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "70ce99ac",
"metadata": {},
"outputs": [],
"source": [
"# Impute each column that has nulls with its own marker and strategy.\n",
"parameters = get_parameters(df_titanic)\n",
"for col, param in parameters.items():\n",
"    missing_values = param[\"missing_values\"]\n",
"    strategy = param[\"strategy\"]\n",
"    IMP = SimpleImputer(missing_values = missing_values, strategy = strategy)\n",
"    # fit_transform returns a 2-D (n_samples, 1) array; flatten it explicitly\n",
"    # instead of relying on pandas to squeeze it into a single column.\n",
"    df_titanic[col] = IMP.fit_transform(df_titanic[[col]]).ravel()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "2312e508",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"pclass 0\n",
"name 0\n",
"sex 0\n",
"age 0\n",
"sibsp 0\n",
"parch 0\n",
"ticket 0\n",
"fare 0\n",
"cabin 0\n",
"embarked 0\n",
"boat 0\n",
"body 0\n",
"home.dest 0\n",
"dtype: int64"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titanic.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "fa570c0a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of Null Values (after) : 0\n"
]
}
],
"source": [
"print(f'Number of Null Values (after) : {df_titanic.age.isnull().sum()}')"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "0e24e195",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['pclass', 'age', 'sibsp', 'parch', 'fare', 'body'], dtype='object')"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Normalizing the Data - MinMaxScaler:\n",
"n_cols = df_titanic.select_dtypes(include = [\"int64\", \"float64\", \"float32\"]).columns\n",
"n_cols"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "b5d7be2b",
"metadata": {},
"outputs": [],
"source": [
"# Fill any remaining nulls in the numeric columns with the column mean.\n",
"# Assign the result back in one step: df_titanic[col].fillna(..., inplace = True)\n",
"# is chained assignment, which newer pandas warns about and which leaves\n",
"# df_titanic unchanged under Copy-on-Write.\n",
"df_titanic[n_cols] = df_titanic[n_cols].fillna(df_titanic[n_cols].mean())"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "6956e3bd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" body | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 0.361169 | \n",
" 0.000 | \n",
" 0.000000 | \n",
" 0.412503 | \n",
" 0.488715 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 0.009395 | \n",
" 0.125 | \n",
" 0.222222 | \n",
" 0.295806 | \n",
" 0.488715 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.022964 | \n",
" 0.125 | \n",
" 0.222222 | \n",
" 0.295806 | \n",
" 0.488715 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.373695 | \n",
" 0.125 | \n",
" 0.222222 | \n",
" 0.295806 | \n",
" 0.409786 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 0.311064 | \n",
" 0.125 | \n",
" 0.222222 | \n",
" 0.295806 | \n",
" 0.488715 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1304 | \n",
" 1.0 | \n",
" 0.179540 | \n",
" 0.125 | \n",
" 0.000000 | \n",
" 0.028213 | \n",
" 1.000000 | \n",
"
\n",
" \n",
" 1305 | \n",
" 1.0 | \n",
" 0.372206 | \n",
" 0.125 | \n",
" 0.000000 | \n",
" 0.028213 | \n",
" 0.488715 | \n",
"
\n",
" \n",
" 1306 | \n",
" 1.0 | \n",
" 0.329854 | \n",
" 0.000 | \n",
" 0.000000 | \n",
" 0.014102 | \n",
" 0.926606 | \n",
"
\n",
" \n",
" 1307 | \n",
" 1.0 | \n",
" 0.336117 | \n",
" 0.000 | \n",
" 0.000000 | \n",
" 0.014102 | \n",
" 0.488715 | \n",
"
\n",
" \n",
" 1308 | \n",
" 1.0 | \n",
" 0.361169 | \n",
" 0.000 | \n",
" 0.000000 | \n",
" 0.015371 | \n",
" 0.488715 | \n",
"
\n",
" \n",
"
\n",
"
1309 rows × 6 columns
\n",
"
"
],
"text/plain": [
" pclass age sibsp parch fare body\n",
"0 0.0 0.361169 0.000 0.000000 0.412503 0.488715\n",
"1 0.0 0.009395 0.125 0.222222 0.295806 0.488715\n",
"2 0.0 0.022964 0.125 0.222222 0.295806 0.488715\n",
"3 0.0 0.373695 0.125 0.222222 0.295806 0.409786\n",
"4 0.0 0.311064 0.125 0.222222 0.295806 0.488715\n",
"... ... ... ... ... ... ...\n",
"1304 1.0 0.179540 0.125 0.000000 0.028213 1.000000\n",
"1305 1.0 0.372206 0.125 0.000000 0.028213 0.488715\n",
"1306 1.0 0.329854 0.000 0.000000 0.014102 0.926606\n",
"1307 1.0 0.336117 0.000 0.000000 0.014102 0.488715\n",
"1308 1.0 0.361169 0.000 0.000000 0.015371 0.488715\n",
"\n",
"[1309 rows x 6 columns]"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"minmax = MinMaxScaler()\n",
"df_titanic[n_cols] = minmax.fit_transform(df_titanic[n_cols])\n",
"df_titanic[n_cols]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "28746928",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" pclass | \n",
" age | \n",
" sibsp | \n",
" parch | \n",
" fare | \n",
" body | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1.309000e+03 | \n",
" 1.309000e+03 | \n",
" 1.309000e+03 | \n",
" 1.309000e+03 | \n",
" 1.309000e+03 | \n",
" 1.309000e+03 | \n",
"
\n",
" \n",
" mean | \n",
" 5.000329e-15 | \n",
" 1.832313e-16 | \n",
" -1.028801e-15 | \n",
" 3.833620e-17 | \n",
" 8.410215e-16 | \n",
" -2.912731e-17 | \n",
"
\n",
" \n",
" std | \n",
" 1.000382e+00 | \n",
" 1.000382e+00 | \n",
" 1.000382e+00 | \n",
" 1.000382e+00 | \n",
" 1.000382e+00 | \n",
" 1.000382e+00 | \n",
"
\n",
" \n",
" min | \n",
" -1.546098e+00 | \n",
" -2.307330e+00 | \n",
" -4.790868e-01 | \n",
" -4.449995e-01 | \n",
" -6.437751e-01 | \n",
" -5.402590e+00 | \n",
"
\n",
" \n",
" 25% | \n",
" -3.520907e-01 | \n",
" -6.119712e-01 | \n",
" -4.790868e-01 | \n",
" -4.449995e-01 | \n",
" -4.911082e-01 | \n",
" 3.201135e-17 | \n",
"
\n",
" \n",
" 50% | \n",
" 8.419164e-01 | \n",
" 1.302752e-16 | \n",
" -4.790868e-01 | \n",
" -4.449995e-01 | \n",
" -3.643001e-01 | \n",
" 3.201135e-17 | \n",
"
\n",
" \n",
" 75% | \n",
" 8.419164e-01 | \n",
" 3.974806e-01 | \n",
" 4.812878e-01 | \n",
" -4.449995e-01 | \n",
" -3.906640e-02 | \n",
" 3.201135e-17 | \n",
"
\n",
" \n",
" max | \n",
" 8.419164e-01 | \n",
" 3.891737e+00 | \n",
" 7.203909e+00 | \n",
" 9.956864e+00 | \n",
" 9.262219e+00 | \n",
" 5.652087e+00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" pclass age sibsp parch fare \\\n",
"count 1.309000e+03 1.309000e+03 1.309000e+03 1.309000e+03 1.309000e+03 \n",
"mean 5.000329e-15 1.832313e-16 -1.028801e-15 3.833620e-17 8.410215e-16 \n",
"std 1.000382e+00 1.000382e+00 1.000382e+00 1.000382e+00 1.000382e+00 \n",
"min -1.546098e+00 -2.307330e+00 -4.790868e-01 -4.449995e-01 -6.437751e-01 \n",
"25% -3.520907e-01 -6.119712e-01 -4.790868e-01 -4.449995e-01 -4.911082e-01 \n",
"50% 8.419164e-01 1.302752e-16 -4.790868e-01 -4.449995e-01 -3.643001e-01 \n",
"75% 8.419164e-01 3.974806e-01 4.812878e-01 -4.449995e-01 -3.906640e-02 \n",
"max 8.419164e-01 3.891737e+00 7.203909e+00 9.956864e+00 9.262219e+00 \n",
"\n",
" body \n",
"count 1.309000e+03 \n",
"mean -2.912731e-17 \n",
"std 1.000382e+00 \n",
"min -5.402590e+00 \n",
"25% 3.201135e-17 \n",
"50% 3.201135e-17 \n",
"75% 3.201135e-17 \n",
"max 5.652087e+00 "
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Normalizing the Data - StandardScaler:\n",
"stsc = StandardScaler()\n",
"df_titanic[n_cols] = stsc.fit_transform(df_titanic[n_cols])\n",
"df_titanic[n_cols].describe()"
]
},
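{
"cell_type": "code",
"execution_count": 56,
"id": "201e4ce9",
"metadata": {},
"outputs": [],
"source": [
"# So, after StandardScaler:\n",
"# - the mean of every scaled column is approximately zero (e.g. 5.000329e-15 for pclass; the residue is floating-point error).\n",
"# - the standard deviation (std) of every scaled column is approximately one (describe() reports 1.000382 because pandas uses the sample std, ddof = 1, while StandardScaler uses the population std)."
]
}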
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}