{ "cells": [ { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import statistics \n", "from collections import Counter\n", "from sklearn.impute import SimpleImputer\n", "import sklearn.preprocessing as sk\n", "from scipy.stats import zscore\n", "import jenkspy\n", "import seaborn as sns; sns.set()\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.tree import DecisionTreeClassifier,export_graphviz,plot_tree\n", "from sklearn import tree\n", "from sklearn.naive_bayes import MultinomialNB\n", "import statsmodels.tools.tools as stattools\n", "from sklearn.metrics import accuracy_score,confusion_matrix,classification_report\n", "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Random Forest on Diabetes Dataset" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
61487235033.60.627501
01856629026.60.351310
18183640023.30.672321
218966239428.10.167210
30137403516843.12.288331
45116740025.60.201300
..............................
76210101764818032.90.171630
76321227027036.80.340270
7645121722311226.20.245300
7651126600030.10.349471
7661937031030.40.315230
\n", "

767 rows × 9 columns

\n", "
" ], "text/plain": [ " 6 148 72 35 0 33.6 0.627 50 1\n", "0 1 85 66 29 0 26.6 0.351 31 0\n", "1 8 183 64 0 0 23.3 0.672 32 1\n", "2 1 89 66 23 94 28.1 0.167 21 0\n", "3 0 137 40 35 168 43.1 2.288 33 1\n", "4 5 116 74 0 0 25.6 0.201 30 0\n", ".. .. ... .. .. ... ... ... .. ..\n", "762 10 101 76 48 180 32.9 0.171 63 0\n", "763 2 122 70 27 0 36.8 0.340 27 0\n", "764 5 121 72 23 112 26.2 0.245 30 0\n", "765 1 126 60 0 0 30.1 0.349 47 1\n", "766 1 93 70 31 0 30.4 0.315 23 0\n", "\n", "[767 rows x 9 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diabetes=pd.read_csv('Diabetes.csv')\n", "diabetes" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1'], dtype='object')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_col=diabetes.columns\n", "dia_col" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skinfold thickness (mm)2-Hour serum insulin (mu U/ml)body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (year)Class variable (0 or 1)
01856629026.60.351310
18183640023.30.672321
218966239428.10.167210
30137403516843.12.288331
45116740025.60.201300
..............................
76210101764818032.90.171630
76321227027036.80.340270
7645121722311226.20.245300
7651126600030.10.349471
7661937031030.40.315230
\n", "

767 rows × 9 columns

\n", "
" ], "text/plain": [ " Number of times pregnant \\\n", "0 1 \n", "1 8 \n", "2 1 \n", "3 0 \n", "4 5 \n", ".. ... \n", "762 10 \n", "763 2 \n", "764 5 \n", "765 1 \n", "766 1 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 85 \n", "1 183 \n", "2 89 \n", "3 137 \n", "4 116 \n", ".. ... \n", "762 101 \n", "763 122 \n", "764 121 \n", "765 126 \n", "766 93 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skinfold thickness (mm) \\\n", "0 66 29 \n", "1 64 0 \n", "2 66 23 \n", "3 40 35 \n", "4 74 0 \n", ".. ... ... \n", "762 76 48 \n", "763 70 27 \n", "764 72 23 \n", "765 60 0 \n", "766 70 31 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 0 \n", "1 0 \n", "2 94 \n", "3 168 \n", "4 0 \n", ".. ... \n", "762 180 \n", "763 0 \n", "764 112 \n", "765 0 \n", "766 0 \n", "\n", " body mass index (weight in kg/(height in m)^2) \\\n", "0 26.6 \n", "1 23.3 \n", "2 28.1 \n", "3 43.1 \n", "4 25.6 \n", ".. ... \n", "762 32.9 \n", "763 36.8 \n", "764 26.2 \n", "765 30.1 \n", "766 30.4 \n", "\n", " Diabetes pedigree function Age (year) Class variable (0 or 1) \n", "0 0.351 31 0 \n", "1 0.672 32 1 \n", "2 0.167 21 0 \n", "3 2.288 33 1 \n", "4 0.201 30 0 \n", ".. ... ... ... 
\n", "762 0.171 63 0 \n", "763 0.340 27 0 \n", "764 0.245 30 0 \n", "765 0.349 47 1 \n", "766 0.315 23 0 \n", "\n", "[767 rows x 9 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "diabetes=diabetes.rename(columns={dia_col[0]: \"Number of times pregnant\",\n", " dia_col[1]: \"Plasma glucose concentration a 2 hours in an oral glucose tolerance test\" ,\n", " dia_col[2]: \"Diastolic blood pressure (mm Hg)\",\n", " dia_col[3]: \"Triceps skinfold thickness (mm)\",\n", " dia_col[4]: \"2-Hour serum insulin (mu U/ml)\",\n", " dia_col[5]: \"body mass index (weight in kg/(height in m)^2)\",\n", " dia_col[6]: \"Diabetes pedigree function\",\n", " dia_col[7]: \"Age (year)\",\n", " dia_col[8]: \"Class variable (0 or 1)\"})\n", "diabetes" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 767 entries, 0 to 766\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Number of times pregnant 767 non-null int64 \n", " 1 Plasma glucose concentration a 2 hours in an oral glucose tolerance test 767 non-null int64 \n", " 2 Diastolic blood pressure (mm Hg) 767 non-null int64 \n", " 3 Triceps skinfold thickness (mm) 767 non-null int64 \n", " 4 2-Hour serum insulin (mu U/ml) 767 non-null int64 \n", " 5 body mass index (weight in kg/(height in m)^2) 767 non-null float64\n", " 6 Diabetes pedigree function 767 non-null float64\n", " 7 Age (year) 767 non-null int64 \n", " 8 Class variable (0 or 1) 767 non-null int64 \n", "dtypes: float64(2), int64(7)\n", "memory usage: 54.1 KB\n" ] } ], "source": [ "diabetes.info()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def plot_series(dia_01,row,col,num_of_figs):\n", " columns=dia_01.columns\n", " fig, axs = plt.subplots(row,col,figsize=(15,15))\n", " counter=1\n", " for i in range(0,row):\n", " for j in range(0,col):\n", " if counter>num_of_figs:\n", " break\n", " axs[i,j].hist(dia_01.iloc[:,counter-1])\n", " axs[i,j].set_title(columns[counter-1])\n", " counter +=1\n", " plt.show()\n", "plot_series(diabetes,3,3,9)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sum of Zeros in Number of times pregnant column:\n", " 111 \n", "max of Number of times pregnant column:\n", " 17 \n", "sum of NAN in Number of times pregnant column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Plasma glucose concentration a 2 hours in an oral glucose tolerance test column:\n", " 5 \n", "max of Plasma glucose concentration a 2 hours in an oral glucose tolerance test column:\n", " 199 \n", "sum of NAN in Plasma glucose concentration a 2 hours in an oral glucose tolerance test column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Diastolic blood pressure (mm Hg) column:\n", " 35 \n", "max of Diastolic blood pressure (mm Hg) column:\n", " 122 \n", "sum of NAN in Diastolic blood pressure (mm Hg) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Triceps skinfold thickness (mm) column:\n", " 227 \n", "max of Triceps skinfold thickness (mm) column:\n", " 99 \n", "sum of NAN in Triceps skinfold thickness (mm) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in 2-Hour serum insulin (mu U/ml) column:\n", " 373 \n", "max of 2-Hour serum insulin (mu U/ml) column:\n", " 846 \n", "sum of NAN in 2-Hour serum insulin (mu U/ml) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in body mass index (weight in kg/(height in m)^2) column:\n", " 11 \n", "max of body mass index (weight in kg/(height in m)^2) 
column:\n", " 67.1 \n", "sum of NAN in body mass index (weight in kg/(height in m)^2) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Diabetes pedigree function column:\n", " 0 \n", "max of Diabetes pedigree function column:\n", " 2.42 \n", "sum of NAN in Diabetes pedigree function column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Age (year) column:\n", " 0 \n", "max of Age (year) column:\n", " 81 \n", "sum of NAN in Age (year) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n" ] } ], "source": [ "dia_col=diabetes.columns\n", "for c in dia_col:\n", " ser2 = diabetes[c]\n", " #pd.to_numeric(ser2).plot(kind='hist', stacked = True) \n", " if c==dia_col[8]:\n", " continue\n", " print('sum of Zeros in {} column:\\n'.format(c),ser2.eq(0).sum(),\n", " '\\nmax of {} column:\\n'.format(c),ser2.max(),\n", " '\\nsum of NAN in {} column:\\n'.format(c),ser2.isna().sum(),'\\n_-_-_-_-_-\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ " i think the zeros in column 2,3,6 are missing data." 
] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "m=(pd.to_numeric(diabetes[dia_col[1]]).mean()*diabetes[dia_col[1]].count())/(diabetes[dia_col[1]].count()-diabetes[dia_col[1]].eq(0).sum())\n", "diabetes[dia_col[1]]=diabetes[dia_col[1]].replace({0:m})\n", "m=(pd.to_numeric(diabetes[dia_col[2]]).mean()*diabetes[dia_col[2]].count())/(diabetes[dia_col[2]].count()-diabetes[dia_col[2]].eq(0).sum())\n", "diabetes[dia_col[2]]=diabetes[dia_col[2]].replace({0:m})\n", "m=(pd.to_numeric(diabetes[dia_col[5]]).mean()*diabetes[dia_col[5]].count())/(diabetes[dia_col[5]].count()-diabetes[dia_col[5]].eq(0).sum())\n", "diabetes[dia_col[5]]=diabetes[dia_col[5]].replace({0:m})\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sum of Zeros in Number of times pregnant column:\n", " 111 \n", "max of Number of times pregnant column:\n", " 17 \n", "sum of NAN in Number of times pregnant column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Plasma glucose concentration a 2 hours in an oral glucose tolerance test column:\n", " 0 \n", "max of Plasma glucose concentration a 2 hours in an oral glucose tolerance test column:\n", " 199.0 \n", "sum of NAN in Plasma glucose concentration a 2 hours in an oral glucose tolerance test column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Diastolic blood pressure (mm Hg) column:\n", " 0 \n", "max of Diastolic blood pressure (mm Hg) column:\n", " 122.0 \n", "sum of NAN in Diastolic blood pressure (mm Hg) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Triceps skinfold thickness (mm) column:\n", " 227 \n", "max of Triceps skinfold thickness (mm) column:\n", " 99 \n", "sum of NAN in Triceps skinfold thickness (mm) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in 2-Hour serum insulin (mu U/ml) column:\n", " 373 \n", "max of 2-Hour serum insulin (mu U/ml) column:\n", " 846 \n", "sum of NAN in 
2-Hour serum insulin (mu U/ml) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in body mass index (weight in kg/(height in m)^2) column:\n", " 0 \n", "max of body mass index (weight in kg/(height in m)^2) column:\n", " 67.1 \n", "sum of NAN in body mass index (weight in kg/(height in m)^2) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Diabetes pedigree function column:\n", " 0 \n", "max of Diabetes pedigree function column:\n", " 2.42 \n", "sum of NAN in Diabetes pedigree function column:\n", " 0 \n", "_-_-_-_-_-\n", "\n", "sum of Zeros in Age (year) column:\n", " 0 \n", "max of Age (year) column:\n", " 81 \n", "sum of NAN in Age (year) column:\n", " 0 \n", "_-_-_-_-_-\n", "\n" ] } ], "source": [ "dia_col=diabetes.columns\n", "for c in dia_col:\n", " ser2 = diabetes[c]\n", " #pd.to_numeric(ser2).plot(kind='hist', stacked = True) \n", " if c==dia_col[8]:\n", " continue\n", " print('sum of Zeros in {} column:\\n'.format(c),ser2.eq(0).sum(),\n", " '\\nmax of {} column:\\n'.format(c),ser2.max(),\n", " '\\nsum of NAN in {} column:\\n'.format(c),ser2.isna().sum(),'\\n_-_-_-_-_-\\n')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1.00e+00, 8.50e+01, 6.60e+01, ..., 2.66e+01, 3.51e-01, 3.10e+01],\n", " [8.00e+00, 1.83e+02, 6.40e+01, ..., 2.33e+01, 6.72e-01, 3.20e+01],\n", " [1.00e+00, 8.90e+01, 6.60e+01, ..., 2.81e+01, 1.67e-01, 2.10e+01],\n", " ...,\n", " [5.00e+00, 1.21e+02, 7.20e+01, ..., 2.62e+01, 2.45e-01, 3.00e+01],\n", " [1.00e+00, 1.26e+02, 6.00e+01, ..., 3.01e+01, 3.49e-01, 4.70e+01],\n", " [1.00e+00, 9.30e+01, 7.00e+01, ..., 3.04e+01, 3.15e-01, 2.30e+01]])" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_norm=diabetes.copy()\n", "dia_y=dia_norm[dia_col[8]]\n", "dia_norm=dia_norm.drop([dia_col[8]],axis=1)\n", "dia_col=dia_norm.columns\n", "dia_col\n", "dia_norm=dia_norm.to_numpy()\n", "dia_norm" ] }, { "cell_type": "code", 
"execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skinfold thickness (mm)2-Hour serum insulin (mu U/ml)body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (year)Class variable (0 or 1)
0-0.843726-1.204829-0.5295600.532023-0.693559-0.851771-0.364265-0.1889400
11.2342402.016619-0.694899-1.286882-0.693559-1.3317700.604701-0.1037951
2-0.843726-1.073342-0.5295600.1556980.122357-0.633591-0.919684-1.0403930
3-1.1405790.504511-2.6789710.9083490.7646741.5482195.482732-0.0186501
40.343683-0.1858000.131797-1.286882-0.693559-0.997225-0.817052-0.2740860
..............................
7621.827945-0.6788790.2971361.7237200.8688330.064588-0.9076102.5357090
763-0.5468740.011432-0.1988810.406582-0.6935590.631859-0.397469-0.5295220
7640.343683-0.021440-0.0335420.1556980.278596-0.909953-0.684235-0.2740860
765-0.8437260.142920-1.025578-1.286882-0.693559-0.342683-0.3703021.1733841
766-0.843726-0.941854-0.1988810.657465-0.693559-0.299046-0.472934-0.8701030
\n", "

767 rows × 9 columns

\n", "
" ], "text/plain": [ " Number of times pregnant \\\n", "0 -0.843726 \n", "1 1.234240 \n", "2 -0.843726 \n", "3 -1.140579 \n", "4 0.343683 \n", ".. ... \n", "762 1.827945 \n", "763 -0.546874 \n", "764 0.343683 \n", "765 -0.843726 \n", "766 -0.843726 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 -1.204829 \n", "1 2.016619 \n", "2 -1.073342 \n", "3 0.504511 \n", "4 -0.185800 \n", ".. ... \n", "762 -0.678879 \n", "763 0.011432 \n", "764 -0.021440 \n", "765 0.142920 \n", "766 -0.941854 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skinfold thickness (mm) \\\n", "0 -0.529560 0.532023 \n", "1 -0.694899 -1.286882 \n", "2 -0.529560 0.155698 \n", "3 -2.678971 0.908349 \n", "4 0.131797 -1.286882 \n", ".. ... ... \n", "762 0.297136 1.723720 \n", "763 -0.198881 0.406582 \n", "764 -0.033542 0.155698 \n", "765 -1.025578 -1.286882 \n", "766 -0.198881 0.657465 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 -0.693559 \n", "1 -0.693559 \n", "2 0.122357 \n", "3 0.764674 \n", "4 -0.693559 \n", ".. ... \n", "762 0.868833 \n", "763 -0.693559 \n", "764 0.278596 \n", "765 -0.693559 \n", "766 -0.693559 \n", "\n", " body mass index (weight in kg/(height in m)^2) \\\n", "0 -0.851771 \n", "1 -1.331770 \n", "2 -0.633591 \n", "3 1.548219 \n", "4 -0.997225 \n", ".. ... \n", "762 0.064588 \n", "763 0.631859 \n", "764 -0.909953 \n", "765 -0.342683 \n", "766 -0.299046 \n", "\n", " Diabetes pedigree function Age (year) Class variable (0 or 1) \n", "0 -0.364265 -0.188940 0 \n", "1 0.604701 -0.103795 1 \n", "2 -0.919684 -1.040393 0 \n", "3 5.482732 -0.018650 1 \n", "4 -0.817052 -0.274086 0 \n", ".. ... ... ... 
\n", "762 -0.907610 2.535709 0 \n", "763 -0.397469 -0.529522 0 \n", "764 -0.684235 -0.274086 0 \n", "765 -0.370302 1.173384 1 \n", "766 -0.472934 -0.870103 0 \n", "\n", "[767 rows x 9 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scaler = sk.StandardScaler()\n", "dia_norm=scaler.fit_transform(dia_norm)\n", "dia_norm=pd.DataFrame(dia_norm,columns=dia_col)\n", "dia_norm= pd.concat((dia_norm,dia_y), axis = 1)\n", "dia_norm" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skinfold thickness (mm)2-Hour serum insulin (mu U/ml)body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (year)Class variable (0 or 1)
289-1.140579-1.4349331.2891720.532023-0.3463610.646404-0.113722-1.0403930
3280.640535-0.547391-0.1988810.720186-0.103322-0.240865-1.0555200.3219310
394-0.5468740.175791-1.1909170.2184191.693429-0.6917723.405946-0.6998120
321-1.1405790.077176-0.198881-0.032464-0.693559-0.735408-0.6570670.2367861
3261.8279451.885132-0.198881-1.286882-0.6935590.384587-0.8200710.3219310
..............................
579-1.1405790.9647181.4545121.598278-0.6935591.402765-0.303893-1.0403931
5020.937388-0.908982-0.6948990.281140-0.0078430.1227700.8039270.6625120
537-1.1405790.1757910.6278151.0337901.1292320.5591321.003154-0.8701030
196-0.250022-0.481647-0.860239-0.471511-0.276921-1.3899510.622812-0.8701031
1750.640535-1.2048290.462476-1.286882-0.693559-0.182683-0.2706890.7476580
\n", "

613 rows × 9 columns

\n", "
" ], "text/plain": [ " Number of times pregnant \\\n", "289 -1.140579 \n", "328 0.640535 \n", "394 -0.546874 \n", "321 -1.140579 \n", "326 1.827945 \n", ".. ... \n", "579 -1.140579 \n", "502 0.937388 \n", "537 -1.140579 \n", "196 -0.250022 \n", "175 0.640535 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "289 -1.434933 \n", "328 -0.547391 \n", "394 0.175791 \n", "321 0.077176 \n", "326 1.885132 \n", ".. ... \n", "579 0.964718 \n", "502 -0.908982 \n", "537 0.175791 \n", "196 -0.481647 \n", "175 -1.204829 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skinfold thickness (mm) \\\n", "289 1.289172 0.532023 \n", "328 -0.198881 0.720186 \n", "394 -1.190917 0.218419 \n", "321 -0.198881 -0.032464 \n", "326 -0.198881 -1.286882 \n", ".. ... ... \n", "579 1.454512 1.598278 \n", "502 -0.694899 0.281140 \n", "537 0.627815 1.033790 \n", "196 -0.860239 -0.471511 \n", "175 0.462476 -1.286882 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "289 -0.346361 \n", "328 -0.103322 \n", "394 1.693429 \n", "321 -0.693559 \n", "326 -0.693559 \n", ".. ... \n", "579 -0.693559 \n", "502 -0.007843 \n", "537 1.129232 \n", "196 -0.276921 \n", "175 -0.693559 \n", "\n", " body mass index (weight in kg/(height in m)^2) \\\n", "289 0.646404 \n", "328 -0.240865 \n", "394 -0.691772 \n", "321 -0.735408 \n", "326 0.384587 \n", ".. ... \n", "579 1.402765 \n", "502 0.122770 \n", "537 0.559132 \n", "196 -1.389951 \n", "175 -0.182683 \n", "\n", " Diabetes pedigree function Age (year) Class variable (0 or 1) \n", "289 -0.113722 -1.040393 0 \n", "328 -1.055520 0.321931 0 \n", "394 3.405946 -0.699812 0 \n", "321 -0.657067 0.236786 1 \n", "326 -0.820071 0.321931 0 \n", ".. ... ... ... 
\n", "579 -0.303893 -1.040393 1 \n", "502 0.803927 0.662512 0 \n", "537 1.003154 -0.870103 0 \n", "196 0.622812 -0.870103 1 \n", "175 -0.270689 0.747658 0 \n", "\n", "[613 rows x 9 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_col=dia_norm.columns\n", "dia_col\n", "d_train,d_test=train_test_split(dia_norm,test_size=0.2,random_state=7)\n", "d_train" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Number of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skinfold thickness (mm)2-Hour serum insulin (mu U/ml)body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (year)Class variable (0 or 1)
353-0.250022-1.0404700.462476-1.286882-0.6935591.4900370.263601-1.0403930
236-1.1405791.8851321.4545120.406582-0.6935591.6936730.646961-0.8701031
323-0.546874-0.3172870.2144670.720186-0.6935590.471860-0.977037-1.0403930
98-0.8437260.0114321.4545121.9118831.2160322.508215-0.442748-0.1889401
7000.6405350.1100480.4624760.657465-0.693559-0.7063180.2817121.3436751
..............................
1531.2342402.1809790.462476-1.286882-0.6935592.246398-1.0102420.8328031
3920.046831-0.185800-0.033542-0.5342310.061597-1.506314-0.0261830.3219310
308-0.5468740.077176-0.3642210.4693031.0858320.0645881.217473-0.2740861
700.3436830.570255-0.6948990.9083490.521635-0.560864-0.183150-0.6146670
513-0.250022-0.744622-1.521596-0.0951850.052917-0.997225-0.958926-0.7849570
\n", "

154 rows × 9 columns

\n", "
" ], "text/plain": [ " Number of times pregnant \\\n", "353 -0.250022 \n", "236 -1.140579 \n", "323 -0.546874 \n", "98 -0.843726 \n", "700 0.640535 \n", ".. ... \n", "153 1.234240 \n", "392 0.046831 \n", "308 -0.546874 \n", "70 0.343683 \n", "513 -0.250022 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "353 -1.040470 \n", "236 1.885132 \n", "323 -0.317287 \n", "98 0.011432 \n", "700 0.110048 \n", ".. ... \n", "153 2.180979 \n", "392 -0.185800 \n", "308 0.077176 \n", "70 0.570255 \n", "513 -0.744622 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skinfold thickness (mm) \\\n", "353 0.462476 -1.286882 \n", "236 1.454512 0.406582 \n", "323 0.214467 0.720186 \n", "98 1.454512 1.911883 \n", "700 0.462476 0.657465 \n", ".. ... ... \n", "153 0.462476 -1.286882 \n", "392 -0.033542 -0.534231 \n", "308 -0.364221 0.469303 \n", "70 -0.694899 0.908349 \n", "513 -1.521596 -0.095185 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "353 -0.693559 \n", "236 -0.693559 \n", "323 -0.693559 \n", "98 1.216032 \n", "700 -0.693559 \n", ".. ... \n", "153 -0.693559 \n", "392 0.061597 \n", "308 1.085832 \n", "70 0.521635 \n", "513 0.052917 \n", "\n", " body mass index (weight in kg/(height in m)^2) \\\n", "353 1.490037 \n", "236 1.693673 \n", "323 0.471860 \n", "98 2.508215 \n", "700 -0.706318 \n", ".. ... \n", "153 2.246398 \n", "392 -1.506314 \n", "308 0.064588 \n", "70 -0.560864 \n", "513 -0.997225 \n", "\n", " Diabetes pedigree function Age (year) Class variable (0 or 1) \n", "353 0.263601 -1.040393 0 \n", "236 0.646961 -0.870103 1 \n", "323 -0.977037 -1.040393 0 \n", "98 -0.442748 -0.188940 1 \n", "700 0.281712 1.343675 1 \n", ".. ... ... ... 
\n", "153 -1.010242 0.832803 1 \n", "392 -0.026183 0.321931 0 \n", "308 1.217473 -0.274086 1 \n", "70 -0.183150 -0.614667 0 \n", "513 -0.958926 -0.784957 0 \n", "\n", "[154 rows x 9 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d_test" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 396\n", "1 217\n", "Name: Class variable (0 or 1), dtype: int64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_y_train=d_train[dia_col[8]]\n", "dia_y_train.value_counts()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexNumber of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skinfold thickness (mm)2-Hour serum insulin (mu U/ml)body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (year)
0289-1.140579-1.4349331.2891720.532023-0.3463610.646404-0.113722-1.040393
13280.640535-0.547391-0.1988810.720186-0.103322-0.240865-1.0555200.321931
2394-0.5468740.175791-1.1909170.2184191.693429-0.6917723.405946-0.699812
3321-1.1405790.077176-0.198881-0.032464-0.693559-0.735408-0.6570670.236786
43261.8279451.885132-0.198881-1.286882-0.6935590.384587-0.8200710.321931
..............................
608579-1.1405790.9647181.4545121.598278-0.6935591.402765-0.303893-1.040393
6095020.937388-0.908982-0.6948990.281140-0.0078430.1227700.8039270.662512
610537-1.1405790.1757910.6278151.0337901.1292320.5591321.003154-0.870103
611196-0.250022-0.481647-0.860239-0.471511-0.276921-1.3899510.622812-0.870103
6121750.640535-1.2048290.462476-1.286882-0.693559-0.182683-0.2706890.747658
\n", "

613 rows × 9 columns

\n", "
" ], "text/plain": [ " index Number of times pregnant \\\n", "0 289 -1.140579 \n", "1 328 0.640535 \n", "2 394 -0.546874 \n", "3 321 -1.140579 \n", "4 326 1.827945 \n", ".. ... ... \n", "608 579 -1.140579 \n", "609 502 0.937388 \n", "610 537 -1.140579 \n", "611 196 -0.250022 \n", "612 175 0.640535 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 -1.434933 \n", "1 -0.547391 \n", "2 0.175791 \n", "3 0.077176 \n", "4 1.885132 \n", ".. ... \n", "608 0.964718 \n", "609 -0.908982 \n", "610 0.175791 \n", "611 -0.481647 \n", "612 -1.204829 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skinfold thickness (mm) \\\n", "0 1.289172 0.532023 \n", "1 -0.198881 0.720186 \n", "2 -1.190917 0.218419 \n", "3 -0.198881 -0.032464 \n", "4 -0.198881 -1.286882 \n", ".. ... ... \n", "608 1.454512 1.598278 \n", "609 -0.694899 0.281140 \n", "610 0.627815 1.033790 \n", "611 -0.860239 -0.471511 \n", "612 0.462476 -1.286882 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 -0.346361 \n", "1 -0.103322 \n", "2 1.693429 \n", "3 -0.693559 \n", "4 -0.693559 \n", ".. ... \n", "608 -0.693559 \n", "609 -0.007843 \n", "610 1.129232 \n", "611 -0.276921 \n", "612 -0.693559 \n", "\n", " body mass index (weight in kg/(height in m)^2) \\\n", "0 0.646404 \n", "1 -0.240865 \n", "2 -0.691772 \n", "3 -0.735408 \n", "4 0.384587 \n", ".. ... \n", "608 1.402765 \n", "609 0.122770 \n", "610 0.559132 \n", "611 -1.389951 \n", "612 -0.182683 \n", "\n", " Diabetes pedigree function Age (year) \n", "0 -0.113722 -1.040393 \n", "1 -1.055520 0.321931 \n", "2 3.405946 -0.699812 \n", "3 -0.657067 0.236786 \n", "4 -0.820071 0.321931 \n", ".. ... ... 
\n", "608 -0.303893 -1.040393 \n", "609 0.803927 0.662512 \n", "610 1.003154 -0.870103 \n", "611 0.622812 -0.870103 \n", "612 -0.270689 0.747658 \n", "\n", "[613 rows x 9 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_x_train=d_train.copy()\n", "dia_x_train=dia_x_train.drop([dia_col[8]],axis=1)\n", "dia_x_train.reset_index()" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 104\n", "1 50\n", "Name: Class variable (0 or 1), dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_y_test=d_test[dia_col[8]]\n", "dia_y_test.value_counts()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexNumber of times pregnantPlasma glucose concentration a 2 hours in an oral glucose tolerance testDiastolic blood pressure (mm Hg)Triceps skinfold thickness (mm)2-Hour serum insulin (mu U/ml)body mass index (weight in kg/(height in m)^2)Diabetes pedigree functionAge (year)
0353-0.250022-1.0404700.462476-1.286882-0.6935591.4900370.263601-1.040393
1236-1.1405791.8851321.4545120.406582-0.6935591.6936730.646961-0.870103
2323-0.546874-0.3172870.2144670.720186-0.6935590.471860-0.977037-1.040393
398-0.8437260.0114321.4545121.9118831.2160322.508215-0.442748-0.188940
47000.6405350.1100480.4624760.657465-0.693559-0.7063180.2817121.343675
..............................
1491531.2342402.1809790.462476-1.286882-0.6935592.246398-1.0102420.832803
1503920.046831-0.185800-0.033542-0.5342310.061597-1.506314-0.0261830.321931
151308-0.5468740.077176-0.3642210.4693031.0858320.0645881.217473-0.274086
152700.3436830.570255-0.6948990.9083490.521635-0.560864-0.183150-0.614667
153513-0.250022-0.744622-1.521596-0.0951850.052917-0.997225-0.958926-0.784957
\n", "

154 rows × 9 columns

\n", "
" ], "text/plain": [ " index Number of times pregnant \\\n", "0 353 -0.250022 \n", "1 236 -1.140579 \n", "2 323 -0.546874 \n", "3 98 -0.843726 \n", "4 700 0.640535 \n", ".. ... ... \n", "149 153 1.234240 \n", "150 392 0.046831 \n", "151 308 -0.546874 \n", "152 70 0.343683 \n", "153 513 -0.250022 \n", "\n", " Plasma glucose concentration a 2 hours in an oral glucose tolerance test \\\n", "0 -1.040470 \n", "1 1.885132 \n", "2 -0.317287 \n", "3 0.011432 \n", "4 0.110048 \n", ".. ... \n", "149 2.180979 \n", "150 -0.185800 \n", "151 0.077176 \n", "152 0.570255 \n", "153 -0.744622 \n", "\n", " Diastolic blood pressure (mm Hg) Triceps skinfold thickness (mm) \\\n", "0 0.462476 -1.286882 \n", "1 1.454512 0.406582 \n", "2 0.214467 0.720186 \n", "3 1.454512 1.911883 \n", "4 0.462476 0.657465 \n", ".. ... ... \n", "149 0.462476 -1.286882 \n", "150 -0.033542 -0.534231 \n", "151 -0.364221 0.469303 \n", "152 -0.694899 0.908349 \n", "153 -1.521596 -0.095185 \n", "\n", " 2-Hour serum insulin (mu U/ml) \\\n", "0 -0.693559 \n", "1 -0.693559 \n", "2 -0.693559 \n", "3 1.216032 \n", "4 -0.693559 \n", ".. ... \n", "149 -0.693559 \n", "150 0.061597 \n", "151 1.085832 \n", "152 0.521635 \n", "153 0.052917 \n", "\n", " body mass index (weight in kg/(height in m)^2) \\\n", "0 1.490037 \n", "1 1.693673 \n", "2 0.471860 \n", "3 2.508215 \n", "4 -0.706318 \n", ".. ... \n", "149 2.246398 \n", "150 -1.506314 \n", "151 0.064588 \n", "152 -0.560864 \n", "153 -0.997225 \n", "\n", " Diabetes pedigree function Age (year) \n", "0 0.263601 -1.040393 \n", "1 0.646961 -0.870103 \n", "2 -0.977037 -1.040393 \n", "3 -0.442748 -0.188940 \n", "4 0.281712 1.343675 \n", ".. ... ... 
\n", "149 -1.010242 0.832803 \n", "150 -0.026183 0.321931 \n", "151 1.217473 -0.274086 \n", "152 -0.183150 -0.614667 \n", "153 -0.958926 -0.784957 \n", "\n", "[154 rows x 9 columns]" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_x_test=d_test.copy()\n", "dia_x_test=dia_x_test.drop([dia_col[8]],axis=1)\n", "dia_x_test.reset_index()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(criterion='entropy', max_depth=5, random_state=0)" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = RandomForestClassifier(criterion='entropy',n_estimators=100,max_depth = 5, random_state=0)\n", "model.fit(dia_x_train, dia_y_train)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "dia_y_predict=model.predict(dia_x_test)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.7987012987012987" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "accuracy_score(dia_y_test,dia_y_predict)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It seems that max_depth=5 or 10 gives the maximum accuracy.\n", "We must test several depths to find the most effective one: increasing or reducing the depth may increase or reduce the accuracy." 
] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(criterion='entropy', max_depth=5, random_state=0)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ],
 "source": [
  "# Train on the training split only. Fitting on dia_x_test and then scoring on\n",
  "# dia_x_test would measure training-set fit, not held-out performance.\n",
  "model = RandomForestClassifier(criterion='entropy',n_estimators=100,max_depth = 5, random_state=0)\n",
  "model.fit(dia_x_train, dia_y_train)" ] },
 { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [],
 "source": [
  "# Predictions for the held-out test split, used by the metrics below.\n",
  "y_pred=model.predict(dia_x_test)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
  "# confusion_matrix(...).ravel() yields tn, fp, fn, tp for a binary problem.\n",
  "tn, fp, fn, tp = confusion_matrix(dia_y_test, y_pred).ravel()\n",
  "print(\"true negative : {} \\nfalse positive : {} \\nfalse negative : {} \\ntrue positive : {}\".format(tn, fp, fn, tp))" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
 "source": [
  "# Per-class precision / recall / F1 on the held-out test split.\n",
  "print(classification_report(dia_y_test, y_pred))" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "precision is the ratio tp / (tp + fp)\n", "\n", "The recall is the ratio tp / (tp + fn)\n", "\n", "The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.\n", "\n", "The support is the number of occurrences of each class in y_true." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }