{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "3b7bcb5f", "metadata": {}, "outputs": [], "source": [ "#Correlation and Variance Threshold:\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.datasets import fetch_california_housing\n", "from sklearn.feature_selection import VarianceThreshold" ] }, { "cell_type": "code", "execution_count": 2, "id": "9c107a0a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeMedHouseVal
08.325241.06.9841271.023810322.02.55555637.88-122.234.526
18.301421.06.2381370.9718802401.02.10984237.86-122.223.585
27.257452.08.2881361.073446496.02.80226037.85-122.243.521
35.643152.05.8173521.073059558.02.54794537.85-122.253.413
43.846252.06.2818531.081081565.02.18146737.85-122.253.422
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", "\n", " Longitude MedHouseVal \n", "0 -122.23 4.526 \n", "1 -122.22 3.585 \n", "2 -122.24 3.521 \n", "3 -122.25 3.413 \n", "4 -122.25 3.422 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "california_housing = fetch_california_housing(as_frame = True)\n", "california_housing.frame.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "114b8b30", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitude
08.325241.06.9841271.023810322.02.55555637.88-122.23
18.301421.06.2381370.9718802401.02.10984237.86-122.22
27.257452.08.2881361.073446496.02.80226037.85-122.24
35.643152.05.8173521.073059558.02.54794537.85-122.25
43.846252.06.2818531.081081565.02.18146737.85-122.25
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", "\n", " Longitude \n", "0 -122.23 \n", "1 -122.22 \n", "2 -122.24 \n", "3 -122.25 \n", "4 -122.25 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "california_housing.data.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "8df57099", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 4.526\n", "1 3.585\n", "2 3.521\n", "3 3.413\n", "4 3.422\n", "Name: MedHouseVal, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "california_housing.target.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "16dfc417", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "# Correlation chart: pairwise correlations of every column in the full frame\n", "# (the features plus the MedHouseVal target), drawn as an annotated heatmap.\n", "corrCH = california_housing.frame.corr()\n", "fig, ax = plt.subplots(figsize=(10, 8))\n", "sns.heatmap(corrCH, annot=True, cmap=\"PiYG\", ax=ax)" ] }, { "cell_type": "code", "execution_count": 7, "id": "303c5a8f", "metadata": {}, "outputs": [], "source": [ "# Variance threshold: add a constant column to the frame (in place) so that\n", "# there is a zero-variance column for VarianceThreshold to reject.\n", "california_housing.frame[\"NewColumn\"] = 321" ] }, { "cell_type": "code", "execution_count": 8, "id": "e989ec2f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeMedHouseValNewColumn
08.325241.06.9841271.023810322.02.55555637.88-122.234.526321
18.301421.06.2381370.9718802401.02.10984237.86-122.223.585321
27.257452.08.2881361.073446496.02.80226037.85-122.243.521321
35.643152.05.8173521.073059558.02.54794537.85-122.253.413321
43.846252.06.2818531.081081565.02.18146737.85-122.253.422321
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", "\n", " Longitude MedHouseVal NewColumn \n", "0 -122.23 4.526 321 \n", "1 -122.22 3.585 321 \n", "2 -122.24 3.521 321 \n", "3 -122.25 3.413 321 \n", "4 -122.25 3.422 321 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The frame now carries the constant NewColumn as its last column.\n", "california_housing.frame.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "3f223cd1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, True, True, True, True, True, True,\n", " False])" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# threshold=0: the support mask follows the frame's column order and is False\n", "# for a rejected column -- here only the constant NewColumn.\n", "VarThresh = VarianceThreshold(threshold=0).fit(california_housing.frame)\n", "VarThresh.get_support()" ] }, { "cell_type": "code", "execution_count": 11, "id": "64290f19", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, False, True, True, True, True, True,\n", " False])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# threshold=0.5: the fourth column (AveBedrms) is rejected as well.\n", "# NOTE(review): the frame still contains the MedHouseVal target, so the filter\n", "# is evaluated on the label column too -- confirm that this is intended.\n", "VarThresh = VarianceThreshold(threshold=0.5).fit(california_housing.frame)\n", "VarThresh.get_support()" ] }, { "cell_type": "code", "execution_count": 12, "id": "bb2b4ad7", "metadata": {}, "outputs": [], "source": [ "#Chi Square and ANOVA F-value:\n", "from sklearn.datasets import load_iris\n", "from sklearn.feature_selection import SelectKBest\n", "from sklearn.feature_selection import SelectPercentile\n", "from sklearn.feature_selection import chi2\n", "from sklearn.feature_selection import f_classif" ] }, { "cell_type": "code", "execution_count": 13, "id": "55df9a59", 
"metadata": {}, "outputs": [], "source": [ "# Load iris; keep the Bunch around because later cells read IRIS.feature_names.\n", "IRIS = load_iris()\n", "x, y = IRIS.data, IRIS.target" ] }, { "cell_type": "code", "execution_count": 14, "id": "fcc8a808", "metadata": {}, "outputs": [], "source": [ "# Cast the measurements to integers (astype(int) drops the fractional part).\n", "# NOTE(review): presumably done so chi2 sees discrete values -- confirm; the\n", "# f_classif cells further down reuse this same integer-valued x.\n", "x = IRIS.data.astype(int)" ] }, { "cell_type": "code", "execution_count": 17, "id": "a71771bc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featurescore
0sepal length (cm)10.287129
1sepal width (cm)5.022670
2petal length (cm)133.068548
3petal width (cm)74.279070
\n", "
" ], "text/plain": [ " feature score\n", "0 sepal length (cm) 10.287129\n", "1 sepal width (cm) 5.022670\n", "2 petal length (cm) 133.068548\n", "3 petal width (cm) 74.279070" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chi2_selector = SelectKBest(chi2, k = 2)\n", "KBest = chi2_selector.fit_transform(x, y)\n", "chi2_scores = pd.DataFrame(list(zip(IRIS.feature_names, chi2_selector.scores_)),\n", " columns = [\"feature\", \"score\"])\n", "chi2_scores" ] }, { "cell_type": "code", "execution_count": 18, "id": "67703b87", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Original Features : 4\n", "Number of Reduced Features : 2\n" ] } ], "source": [ "print (\"Number of Original Features :\", x.shape[1])\n", "print (\"Number of Reduced Features :\", KBest.shape[1])" ] }, { "cell_type": "code", "execution_count": 19, "id": "106efdfc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['petal length (cm)', 'petal width (cm)'], dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featurescore
0sepal length (cm)81.197151
1sepal width (cm)33.715004
2petal length (cm)1160.011597
3petal width (cm)385.483002
\n", "" ], "text/plain": [ " feature score\n", "0 sepal length (cm) 81.197151\n", "1 sepal width (cm) 33.715004\n", "2 petal length (cm) 1160.011597\n", "3 petal width (cm) 385.483002" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chi2_selector = SelectKBest(f_classif, k = 3)\n", "KBest = chi2_selector.fit_transform(x, y)\n", "chi2_scores = pd.DataFrame(list(zip(IRIS.feature_names, chi2_selector.scores_)), columns = [\"feature\", \"score\"])\n", "chi2_scores" ] }, { "cell_type": "code", "execution_count": 23, "id": "a997334f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Original Features : 4\n", "Number of Reduced Features : 3\n" ] } ], "source": [ "print (\"Number of Original Features :\", x.shape[1])\n", "print (\"Number of Reduced Features :\", KBest.shape[1])" ] }, { "cell_type": "code", "execution_count": 25, "id": "3aba64ce", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['sepal length (cm)', 'petal length (cm)', 'petal width (cm)'],\n", " dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featurescore
0sepal length (cm)10.287129
1sepal width (cm)5.022670
2petal length (cm)133.068548
3petal width (cm)74.279070
\n", "" ], "text/plain": [ " feature score\n", "0 sepal length (cm) 10.287129\n", "1 sepal width (cm) 5.022670\n", "2 petal length (cm) 133.068548\n", "3 petal width (cm) 74.279070" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chi2_selector = SelectPercentile(chi2)\n", "KBest = chi2_selector.fit_transform(x, y)\n", "chi2_scores = pd.DataFrame(list(zip(IRIS.feature_names, chi2_selector.scores_)), columns = [\"feature\", \"score\"])\n", "chi2_scores" ] }, { "cell_type": "code", "execution_count": 28, "id": "10ecb36e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Original Features : 4\n", "Number of Reduced Features : 1\n" ] } ], "source": [ "print (\"Number of Original Features :\", x.shape[1])\n", "print (\"Number of Reduced Features :\", KBest.shape[1])" ] }, { "cell_type": "code", "execution_count": 29, "id": "1947368b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['petal length (cm)'], dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featurescore
0sepal length (cm)81.197151
1sepal width (cm)33.715004
2petal length (cm)1160.011597
3petal width (cm)385.483002
\n", "" ], "text/plain": [ " feature score\n", "0 sepal length (cm) 81.197151\n", "1 sepal width (cm) 33.715004\n", "2 petal length (cm) 1160.011597\n", "3 petal width (cm) 385.483002" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "chi2_selector = SelectPercentile(f_classif)\n", "KBest = chi2_selector.fit_transform(x, y)\n", "chi2_scores = pd.DataFrame(list(zip(IRIS.feature_names, chi2_selector.scores_)), columns = [\"feature\", \"score\"])\n", "chi2_scores" ] }, { "cell_type": "code", "execution_count": 31, "id": "40202512", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of Original Features : 4\n", "Number of Reduced Features : 1\n" ] } ], "source": [ "print (\"Number of Original Features :\", x.shape[1])\n", "print (\"Number of Reduced Features :\", KBest.shape[1])" ] }, { "cell_type": "code", "execution_count": 32, "id": "291d71e7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['petal length (cm)'], dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agesexbmichildrensmokerregionexpenses
019female27.90yessouthwest16884.92
118male33.81nosoutheast1725.55
228male33.03nosoutheast4449.46
333male22.70nonorthwest21984.47
432male28.90nonorthwest3866.86
\n", "" ], "text/plain": [ " age sex bmi children smoker region expenses\n", "0 19 female 27.9 0 yes southwest 16884.92\n", "1 18 male 33.8 1 no southeast 1725.55\n", "2 28 male 33.0 3 no southeast 4449.46\n", "3 33 male 22.7 0 no northwest 21984.47\n", "4 32 male 28.9 0 no northwest 3866.86" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Data = pd.read_csv(\"insurance.csv\")\n", "Data.head()" ] }, { "cell_type": "code", "execution_count": 37, "id": "9f8cbbee", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.02776601, 0.09673063, 0.07626668])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Mutual information between three numeric columns and the region label.\n", "# The target is selected with a single label (Data[\"region\"]) so that it is a\n", "# 1-D Series; a one-column DataFrame (Data[[\"region\"]]) makes scikit-learn\n", "# emit a DataConversionWarning before flattening it.\n", "features = Data[[\"age\", \"bmi\", \"expenses\"]]\n", "target = Data[\"region\"]\n", "# random_state is fixed so the estimate is reproducible between runs.\n", "feature_scores = mutual_info_classif(features, target, random_state = 75)\n", "feature_scores" ] }, { "cell_type": "code", "execution_count": 38, "id": "12b50809", "metadata": {}, "outputs": [], "source": [ "# Scores are in the order age, bmi, expenses: bmi has the highest mutual\n", "# information with region (~0.097), ahead of expenses (~0.076) and age (~0.028)." ] }, { "cell_type": "code", "execution_count": null, "id": "4a3003ec", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }