{ "cells": [ { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import statistics \n", "from collections import Counter\n", "from sklearn.impute import SimpleImputer\n", "import sklearn.preprocessing as sk\n", "from scipy.stats import zscore\n", "import jenkspy\n", "import seaborn as sns; sns.set()\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.tree import DecisionTreeClassifier,export_graphviz,plot_tree\n", "from sklearn import tree\n", "from sklearn.naive_bayes import MultinomialNB\n", "import statsmodels.tools.tools as stattools\n", "from sklearn.metrics import accuracy_score,confusion_matrix,classification_report\n", "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Random Forest on Diabetes Dataset" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | 6 | \n", "148 | \n", "72 | \n", "35 | \n", "0 | \n", "33.6 | \n", "0.627 | \n", "50 | \n", "1 | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "0 | \n", "
1 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "1 | \n", "
2 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "0 | \n", "
3 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "1 | \n", "
4 | \n", "5 | \n", "116 | \n", "74 | \n", "0 | \n", "0 | \n", "25.6 | \n", "0.201 | \n", "30 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
762 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "0 | \n", "
763 | \n", "2 | \n", "122 | \n", "70 | \n", "27 | \n", "0 | \n", "36.8 | \n", "0.340 | \n", "27 | \n", "0 | \n", "
764 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "0 | \n", "
765 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "1 | \n", "
766 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "0 | \n", "
767 rows × 9 columns
\n", "\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skinfold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (year) | \n", "Class variable (0 or 1) | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "1 | \n", "85 | \n", "66 | \n", "29 | \n", "0 | \n", "26.6 | \n", "0.351 | \n", "31 | \n", "0 | \n", "
1 | \n", "8 | \n", "183 | \n", "64 | \n", "0 | \n", "0 | \n", "23.3 | \n", "0.672 | \n", "32 | \n", "1 | \n", "
2 | \n", "1 | \n", "89 | \n", "66 | \n", "23 | \n", "94 | \n", "28.1 | \n", "0.167 | \n", "21 | \n", "0 | \n", "
3 | \n", "0 | \n", "137 | \n", "40 | \n", "35 | \n", "168 | \n", "43.1 | \n", "2.288 | \n", "33 | \n", "1 | \n", "
4 | \n", "5 | \n", "116 | \n", "74 | \n", "0 | \n", "0 | \n", "25.6 | \n", "0.201 | \n", "30 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
762 | \n", "10 | \n", "101 | \n", "76 | \n", "48 | \n", "180 | \n", "32.9 | \n", "0.171 | \n", "63 | \n", "0 | \n", "
763 | \n", "2 | \n", "122 | \n", "70 | \n", "27 | \n", "0 | \n", "36.8 | \n", "0.340 | \n", "27 | \n", "0 | \n", "
764 | \n", "5 | \n", "121 | \n", "72 | \n", "23 | \n", "112 | \n", "26.2 | \n", "0.245 | \n", "30 | \n", "0 | \n", "
765 | \n", "1 | \n", "126 | \n", "60 | \n", "0 | \n", "0 | \n", "30.1 | \n", "0.349 | \n", "47 | \n", "1 | \n", "
766 | \n", "1 | \n", "93 | \n", "70 | \n", "31 | \n", "0 | \n", "30.4 | \n", "0.315 | \n", "23 | \n", "0 | \n", "
767 rows × 9 columns
\n", "\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skinfold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (year) | \n", "Class variable (0 or 1) | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "-0.843726 | \n", "-1.204829 | \n", "-0.529560 | \n", "0.532023 | \n", "-0.693559 | \n", "-0.851771 | \n", "-0.364265 | \n", "-0.188940 | \n", "0 | \n", "
1 | \n", "1.234240 | \n", "2.016619 | \n", "-0.694899 | \n", "-1.286882 | \n", "-0.693559 | \n", "-1.331770 | \n", "0.604701 | \n", "-0.103795 | \n", "1 | \n", "
2 | \n", "-0.843726 | \n", "-1.073342 | \n", "-0.529560 | \n", "0.155698 | \n", "0.122357 | \n", "-0.633591 | \n", "-0.919684 | \n", "-1.040393 | \n", "0 | \n", "
3 | \n", "-1.140579 | \n", "0.504511 | \n", "-2.678971 | \n", "0.908349 | \n", "0.764674 | \n", "1.548219 | \n", "5.482732 | \n", "-0.018650 | \n", "1 | \n", "
4 | \n", "0.343683 | \n", "-0.185800 | \n", "0.131797 | \n", "-1.286882 | \n", "-0.693559 | \n", "-0.997225 | \n", "-0.817052 | \n", "-0.274086 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
762 | \n", "1.827945 | \n", "-0.678879 | \n", "0.297136 | \n", "1.723720 | \n", "0.868833 | \n", "0.064588 | \n", "-0.907610 | \n", "2.535709 | \n", "0 | \n", "
763 | \n", "-0.546874 | \n", "0.011432 | \n", "-0.198881 | \n", "0.406582 | \n", "-0.693559 | \n", "0.631859 | \n", "-0.397469 | \n", "-0.529522 | \n", "0 | \n", "
764 | \n", "0.343683 | \n", "-0.021440 | \n", "-0.033542 | \n", "0.155698 | \n", "0.278596 | \n", "-0.909953 | \n", "-0.684235 | \n", "-0.274086 | \n", "0 | \n", "
765 | \n", "-0.843726 | \n", "0.142920 | \n", "-1.025578 | \n", "-1.286882 | \n", "-0.693559 | \n", "-0.342683 | \n", "-0.370302 | \n", "1.173384 | \n", "1 | \n", "
766 | \n", "-0.843726 | \n", "-0.941854 | \n", "-0.198881 | \n", "0.657465 | \n", "-0.693559 | \n", "-0.299046 | \n", "-0.472934 | \n", "-0.870103 | \n", "0 | \n", "
767 rows × 9 columns
\n", "\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skinfold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (year) | \n", "Class variable (0 or 1) | \n", "
---|---|---|---|---|---|---|---|---|---|
289 | \n", "-1.140579 | \n", "-1.434933 | \n", "1.289172 | \n", "0.532023 | \n", "-0.346361 | \n", "0.646404 | \n", "-0.113722 | \n", "-1.040393 | \n", "0 | \n", "
328 | \n", "0.640535 | \n", "-0.547391 | \n", "-0.198881 | \n", "0.720186 | \n", "-0.103322 | \n", "-0.240865 | \n", "-1.055520 | \n", "0.321931 | \n", "0 | \n", "
394 | \n", "-0.546874 | \n", "0.175791 | \n", "-1.190917 | \n", "0.218419 | \n", "1.693429 | \n", "-0.691772 | \n", "3.405946 | \n", "-0.699812 | \n", "0 | \n", "
321 | \n", "-1.140579 | \n", "0.077176 | \n", "-0.198881 | \n", "-0.032464 | \n", "-0.693559 | \n", "-0.735408 | \n", "-0.657067 | \n", "0.236786 | \n", "1 | \n", "
326 | \n", "1.827945 | \n", "1.885132 | \n", "-0.198881 | \n", "-1.286882 | \n", "-0.693559 | \n", "0.384587 | \n", "-0.820071 | \n", "0.321931 | \n", "0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
579 | \n", "-1.140579 | \n", "0.964718 | \n", "1.454512 | \n", "1.598278 | \n", "-0.693559 | \n", "1.402765 | \n", "-0.303893 | \n", "-1.040393 | \n", "1 | \n", "
502 | \n", "0.937388 | \n", "-0.908982 | \n", "-0.694899 | \n", "0.281140 | \n", "-0.007843 | \n", "0.122770 | \n", "0.803927 | \n", "0.662512 | \n", "0 | \n", "
537 | \n", "-1.140579 | \n", "0.175791 | \n", "0.627815 | \n", "1.033790 | \n", "1.129232 | \n", "0.559132 | \n", "1.003154 | \n", "-0.870103 | \n", "0 | \n", "
196 | \n", "-0.250022 | \n", "-0.481647 | \n", "-0.860239 | \n", "-0.471511 | \n", "-0.276921 | \n", "-1.389951 | \n", "0.622812 | \n", "-0.870103 | \n", "1 | \n", "
175 | \n", "0.640535 | \n", "-1.204829 | \n", "0.462476 | \n", "-1.286882 | \n", "-0.693559 | \n", "-0.182683 | \n", "-0.270689 | \n", "0.747658 | \n", "0 | \n", "
613 rows × 9 columns
\n", "\n", " | Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skinfold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (year) | \n", "Class variable (0 or 1) | \n", "
---|---|---|---|---|---|---|---|---|---|
353 | \n", "-0.250022 | \n", "-1.040470 | \n", "0.462476 | \n", "-1.286882 | \n", "-0.693559 | \n", "1.490037 | \n", "0.263601 | \n", "-1.040393 | \n", "0 | \n", "
236 | \n", "-1.140579 | \n", "1.885132 | \n", "1.454512 | \n", "0.406582 | \n", "-0.693559 | \n", "1.693673 | \n", "0.646961 | \n", "-0.870103 | \n", "1 | \n", "
323 | \n", "-0.546874 | \n", "-0.317287 | \n", "0.214467 | \n", "0.720186 | \n", "-0.693559 | \n", "0.471860 | \n", "-0.977037 | \n", "-1.040393 | \n", "0 | \n", "
98 | \n", "-0.843726 | \n", "0.011432 | \n", "1.454512 | \n", "1.911883 | \n", "1.216032 | \n", "2.508215 | \n", "-0.442748 | \n", "-0.188940 | \n", "1 | \n", "
700 | \n", "0.640535 | \n", "0.110048 | \n", "0.462476 | \n", "0.657465 | \n", "-0.693559 | \n", "-0.706318 | \n", "0.281712 | \n", "1.343675 | \n", "1 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
153 | \n", "1.234240 | \n", "2.180979 | \n", "0.462476 | \n", "-1.286882 | \n", "-0.693559 | \n", "2.246398 | \n", "-1.010242 | \n", "0.832803 | \n", "1 | \n", "
392 | \n", "0.046831 | \n", "-0.185800 | \n", "-0.033542 | \n", "-0.534231 | \n", "0.061597 | \n", "-1.506314 | \n", "-0.026183 | \n", "0.321931 | \n", "0 | \n", "
308 | \n", "-0.546874 | \n", "0.077176 | \n", "-0.364221 | \n", "0.469303 | \n", "1.085832 | \n", "0.064588 | \n", "1.217473 | \n", "-0.274086 | \n", "1 | \n", "
70 | \n", "0.343683 | \n", "0.570255 | \n", "-0.694899 | \n", "0.908349 | \n", "0.521635 | \n", "-0.560864 | \n", "-0.183150 | \n", "-0.614667 | \n", "0 | \n", "
513 | \n", "-0.250022 | \n", "-0.744622 | \n", "-1.521596 | \n", "-0.095185 | \n", "0.052917 | \n", "-0.997225 | \n", "-0.958926 | \n", "-0.784957 | \n", "0 | \n", "
154 rows × 9 columns
\n", "\n", " | index | \n", "Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skinfold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (year) | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "289 | \n", "-1.140579 | \n", "-1.434933 | \n", "1.289172 | \n", "0.532023 | \n", "-0.346361 | \n", "0.646404 | \n", "-0.113722 | \n", "-1.040393 | \n", "
1 | \n", "328 | \n", "0.640535 | \n", "-0.547391 | \n", "-0.198881 | \n", "0.720186 | \n", "-0.103322 | \n", "-0.240865 | \n", "-1.055520 | \n", "0.321931 | \n", "
2 | \n", "394 | \n", "-0.546874 | \n", "0.175791 | \n", "-1.190917 | \n", "0.218419 | \n", "1.693429 | \n", "-0.691772 | \n", "3.405946 | \n", "-0.699812 | \n", "
3 | \n", "321 | \n", "-1.140579 | \n", "0.077176 | \n", "-0.198881 | \n", "-0.032464 | \n", "-0.693559 | \n", "-0.735408 | \n", "-0.657067 | \n", "0.236786 | \n", "
4 | \n", "326 | \n", "1.827945 | \n", "1.885132 | \n", "-0.198881 | \n", "-1.286882 | \n", "-0.693559 | \n", "0.384587 | \n", "-0.820071 | \n", "0.321931 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
608 | \n", "579 | \n", "-1.140579 | \n", "0.964718 | \n", "1.454512 | \n", "1.598278 | \n", "-0.693559 | \n", "1.402765 | \n", "-0.303893 | \n", "-1.040393 | \n", "
609 | \n", "502 | \n", "0.937388 | \n", "-0.908982 | \n", "-0.694899 | \n", "0.281140 | \n", "-0.007843 | \n", "0.122770 | \n", "0.803927 | \n", "0.662512 | \n", "
610 | \n", "537 | \n", "-1.140579 | \n", "0.175791 | \n", "0.627815 | \n", "1.033790 | \n", "1.129232 | \n", "0.559132 | \n", "1.003154 | \n", "-0.870103 | \n", "
611 | \n", "196 | \n", "-0.250022 | \n", "-0.481647 | \n", "-0.860239 | \n", "-0.471511 | \n", "-0.276921 | \n", "-1.389951 | \n", "0.622812 | \n", "-0.870103 | \n", "
612 | \n", "175 | \n", "0.640535 | \n", "-1.204829 | \n", "0.462476 | \n", "-1.286882 | \n", "-0.693559 | \n", "-0.182683 | \n", "-0.270689 | \n", "0.747658 | \n", "
613 rows × 9 columns
\n", "\n", " | index | \n", "Number of times pregnant | \n", "Plasma glucose concentration a 2 hours in an oral glucose tolerance test | \n", "Diastolic blood pressure (mm Hg) | \n", "Triceps skinfold thickness (mm) | \n", "2-Hour serum insulin (mu U/ml) | \n", "body mass index (weight in kg/(height in m)^2) | \n", "Diabetes pedigree function | \n", "Age (year) | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "353 | \n", "-0.250022 | \n", "-1.040470 | \n", "0.462476 | \n", "-1.286882 | \n", "-0.693559 | \n", "1.490037 | \n", "0.263601 | \n", "-1.040393 | \n", "
1 | \n", "236 | \n", "-1.140579 | \n", "1.885132 | \n", "1.454512 | \n", "0.406582 | \n", "-0.693559 | \n", "1.693673 | \n", "0.646961 | \n", "-0.870103 | \n", "
2 | \n", "323 | \n", "-0.546874 | \n", "-0.317287 | \n", "0.214467 | \n", "0.720186 | \n", "-0.693559 | \n", "0.471860 | \n", "-0.977037 | \n", "-1.040393 | \n", "
3 | \n", "98 | \n", "-0.843726 | \n", "0.011432 | \n", "1.454512 | \n", "1.911883 | \n", "1.216032 | \n", "2.508215 | \n", "-0.442748 | \n", "-0.188940 | \n", "
4 | \n", "700 | \n", "0.640535 | \n", "0.110048 | \n", "0.462476 | \n", "0.657465 | \n", "-0.693559 | \n", "-0.706318 | \n", "0.281712 | \n", "1.343675 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
149 | \n", "153 | \n", "1.234240 | \n", "2.180979 | \n", "0.462476 | \n", "-1.286882 | \n", "-0.693559 | \n", "2.246398 | \n", "-1.010242 | \n", "0.832803 | \n", "
150 | \n", "392 | \n", "0.046831 | \n", "-0.185800 | \n", "-0.033542 | \n", "-0.534231 | \n", "0.061597 | \n", "-1.506314 | \n", "-0.026183 | \n", "0.321931 | \n", "
151 | \n", "308 | \n", "-0.546874 | \n", "0.077176 | \n", "-0.364221 | \n", "0.469303 | \n", "1.085832 | \n", "0.064588 | \n", "1.217473 | \n", "-0.274086 | \n", "
152 | \n", "70 | \n", "0.343683 | \n", "0.570255 | \n", "-0.694899 | \n", "0.908349 | \n", "0.521635 | \n", "-0.560864 | \n", "-0.183150 | \n", "-0.614667 | \n", "
153 | \n", "513 | \n", "-0.250022 | \n", "-0.744622 | \n", "-1.521596 | \n", "-0.095185 | \n", "0.052917 | \n", "-0.997225 | \n", "-0.958926 | \n", "-0.784957 | \n", "
154 rows × 9 columns
\n", "