{ "cells": [ { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import statistics \n", "from collections import Counter\n", "from sklearn.impute import SimpleImputer\n", "import sklearn.preprocessing as sk\n", "from scipy.stats import zscore\n", "import jenkspy\n", "import seaborn as sns; sns.set()\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.tree import DecisionTreeClassifier,export_graphviz,plot_tree\n", "from sklearn import tree\n", "from sklearn.naive_bayes import MultinomialNB\n", "import statsmodels.tools.tools as stattools\n", "from sklearn.metrics import accuracy_score,confusion_matrix,classification_report\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.cluster import KMeans\n", "from mpl_toolkits.mplot3d.axes3d import Axes3D\n", "from sklearn.metrics import silhouette_samples, silhouette_score\n", "from scipy.cluster.hierarchy import dendrogram, linkage,fcluster\n", "from urllib.request import urlretrieve" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Hierarchical Clustering on IRIS Dataset" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "iris = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('C:\\\\Users\\\\mohammad\\\\AppData\\\\Local\\\\Temp\\\\tmpol0l4cy2',\n", " )" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urlretrieve(iris)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
5.13.51.40.2Iris-setosa
04.93.01.40.2Iris-setosa
14.73.21.30.2Iris-setosa
24.63.11.50.2Iris-setosa
35.03.61.40.2Iris-setosa
45.43.91.70.4Iris-setosa
..................
1446.73.05.22.3Iris-virginica
1456.32.55.01.9Iris-virginica
1466.53.05.22.0Iris-virginica
1476.23.45.42.3Iris-virginica
1485.93.05.11.8Iris-virginica
\n", "

149 rows × 5 columns

\n", "
" ], "text/plain": [ " 5.1 3.5 1.4 0.2 Iris-setosa\n", "0 4.9 3.0 1.4 0.2 Iris-setosa\n", "1 4.7 3.2 1.3 0.2 Iris-setosa\n", "2 4.6 3.1 1.5 0.2 Iris-setosa\n", "3 5.0 3.6 1.4 0.2 Iris-setosa\n", "4 5.4 3.9 1.7 0.4 Iris-setosa\n", ".. ... ... ... ... ...\n", "144 6.7 3.0 5.2 2.3 Iris-virginica\n", "145 6.3 2.5 5.0 1.9 Iris-virginica\n", "146 6.5 3.0 5.2 2.0 Iris-virginica\n", "147 6.2 3.4 5.4 2.3 Iris-virginica\n", "148 5.9 3.0 5.1 1.8 Iris-virginica\n", "\n", "[149 rows x 5 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# iris.data ships without a header row: header=None stops pandas from consuming\n", "# the first sample (5.1, 3.5, 1.4, 0.2, Iris-setosa) as column labels, so all 150 rows are kept.\n", "# The placeholder labels passed via names= are the ones the following rename/drop cells expect.\n", "d_iris = pd.read_csv(iris, sep=',', header=None, names=['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'])\n", "d_iris" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_widthIris-setosa
04.93.01.40.2Iris-setosa
14.73.21.30.2Iris-setosa
24.63.11.50.2Iris-setosa
35.03.61.40.2Iris-setosa
45.43.91.70.4Iris-setosa
..................
1446.73.05.22.3Iris-virginica
1456.32.55.01.9Iris-virginica
1466.53.05.22.0Iris-virginica
1476.23.45.42.3Iris-virginica
1485.93.05.11.8Iris-virginica
\n", "

149 rows × 5 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width Iris-setosa\n", "0 4.9 3.0 1.4 0.2 Iris-setosa\n", "1 4.7 3.2 1.3 0.2 Iris-setosa\n", "2 4.6 3.1 1.5 0.2 Iris-setosa\n", "3 5.0 3.6 1.4 0.2 Iris-setosa\n", "4 5.4 3.9 1.7 0.4 Iris-setosa\n", ".. ... ... ... ... ...\n", "144 6.7 3.0 5.2 2.3 Iris-virginica\n", "145 6.3 2.5 5.0 1.9 Iris-virginica\n", "146 6.5 3.0 5.2 2.0 Iris-virginica\n", "147 6.2 3.4 5.4 2.3 Iris-virginica\n", "148 5.9 3.0 5.1 1.8 Iris-virginica\n", "\n", "[149 rows x 5 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Give the four measurement columns descriptive names.\n", "feature_names = {\n", "    \"5.1\": \"sepal_length\",\n", "    \"3.5\": \"sepal_width\",\n", "    \"1.4\": \"petal_length\",\n", "    \"0.2\": \"petal_width\",\n", "}\n", "d_iris = d_iris.rename(columns=feature_names)\n", "d_iris" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal_lengthsepal_widthpetal_lengthpetal_width
04.93.01.40.2
14.73.21.30.2
24.63.11.50.2
35.03.61.40.2
45.43.91.70.4
...............
1446.73.05.22.3
1456.32.55.01.9
1466.53.05.22.0
1476.23.45.42.3
1485.93.05.11.8
\n", "

149 rows × 4 columns

\n", "
" ], "text/plain": [ " sepal_length sepal_width petal_length petal_width\n", "0 4.9 3.0 1.4 0.2\n", "1 4.7 3.2 1.3 0.2\n", "2 4.6 3.1 1.5 0.2\n", "3 5.0 3.6 1.4 0.2\n", "4 5.4 3.9 1.7 0.4\n", ".. ... ... ... ...\n", "144 6.7 3.0 5.2 2.3\n", "145 6.3 2.5 5.0 1.9\n", "146 6.5 3.0 5.2 2.0\n", "147 6.2 3.4 5.4 2.3\n", "148 5.9 3.0 5.1 1.8\n", "\n", "[149 rows x 4 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Clustering works on the numeric measurements only, so the label column is removed.\n", "d_iris = d_iris.drop(columns=[\"Iris-setosa\"])\n", "d_iris" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# SciPy's linkage defaults (single linkage, Euclidean distance) are spelled out explicitly.\n", "Z = linkage(d_iris, method='single', metric='euclidean')\n", "fig = plt.figure(figsize=(25, 10))\n", "dn = dendrogram(Z)\n", "plt.title('Single-linkage dendrogram of the iris measurements')\n", "plt.xlabel('observation index')\n", "plt.ylabel('merge distance');" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The dendrogram shows the order and the distance at which observations and clusters are merged. Cutting the tree at a chosen height determines how many clusters are obtained, from a single cluster at the top down to one cluster per observation at the bottom." ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Reuse the linkage matrix Z computed for the dendrogram instead of rebuilding it.\n", "A=fcluster(Z, 6, criterion=\"distance\")\n", "A" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,\n", " 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same linkage matrix, cut so that at most two flat clusters remain.\n", "B=fcluster(Z, t=2, criterion='maxclust')\n", "B" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 21, "metadata": 
{}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Plot the observations in petal space, coloured by the two-cluster labels B.\n", "# (Plotting the label vectors B and A against each other yields at most two marker positions.)\n", "plt.title('Two-cluster cut (maxclust=2) of the iris data')\n", "plt.xlabel('petal_length')\n", "plt.ylabel('petal_width')\n", "plt.scatter(d_iris['petal_length'], d_iris['petal_width'], c=B, cmap='viridis', marker = '^')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }