{ "cells": [ { "cell_type": "code", "execution_count": 33, "id": "ad9dfc56", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)
05.13.51.40.2
14.93.01.40.2
24.73.21.30.2
34.63.11.50.2
45.03.61.40.2
...............
1456.73.05.22.3
1466.32.55.01.9
1476.53.05.22.0
1486.23.45.42.3
1495.93.05.11.8
\n", "

150 rows × 4 columns

\n", "
" ], "text/plain": [ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n", "0 5.1 3.5 1.4 0.2\n", "1 4.9 3.0 1.4 0.2\n", "2 4.7 3.2 1.3 0.2\n", "3 4.6 3.1 1.5 0.2\n", "4 5.0 3.6 1.4 0.2\n", ".. ... ... ... ...\n", "145 6.7 3.0 5.2 2.3\n", "146 6.3 2.5 5.0 1.9\n", "147 6.5 3.0 5.2 2.0\n", "148 6.2 3.4 5.4 2.3\n", "149 5.9 3.0 5.1 1.8\n", "\n", "[150 rows x 4 columns]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "from sklearn.datasets import load_iris\n", "import pandas as pd\n", "\n", "dataset = load_iris()\n", "data = dataset[\"data\"]\n", "target = dataset[\"target\"]\n", "df = pd.DataFrame(data, columns=dataset.feature_names)\n", "df" ] }, { "cell_type": "code", "execution_count": 39, "id": "3d6c8091", "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "scaler = StandardScaler()\n", "data_scaled = scaler.fit_transform(data)\n", "pca = PCA(n_components=1)\n", "data_pca = pca.fit_transform(data_scaled)" ] }, { "cell_type": "code", "execution_count": 46, "id": "2b5ce1fb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'PCA')" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.subplot(2, 2, 1)\n", "plt.scatter(data[:, 0], data[:, 1] , s = 5, c=target)\n", "plt.title(\"Sepal data\")\n", "plt.xlabel(\"Lenght\")\n", "plt.ylabel(\"Width\")\n", "plt.subplot(2, 2, 2)\n", "plt.scatter(data[:, 2], data[:, 3] , s = 5, c=target)\n", "plt.title(\"Petal data\")\n", "plt.xlabel(\"Lenght\")\n", "plt.ylabel(\"Width\")\n", "\n", "plt.subplot(2, 2, 3)\n", "plt.scatter(data_pca, data_pca, s=5)\n", "plt.title(\"PCA\")" ] }, { "cell_type": "code", "execution_count": 41, "id": "8927eaee", "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)\n", "X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(data_pca, target, test_size=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 42, "id": "e7fe4afc", "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.model_selection import cross_val_score" ] }, { "cell_type": "markdown", "id": "05475e3b", "metadata": {}, "source": [ "Now we Try to use PCA" ] }, { "cell_type": "code", "execution_count": 43, "id": "a61716a1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Without PCA: \n", "Fold 1\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 10\n", " 1 1.00 1.00 1.00 9\n", " 2 1.00 1.00 1.00 11\n", "\n", " accuracy 1.00 30\n", " macro avg 1.00 1.00 1.00 30\n", "weighted avg 1.00 1.00 1.00 30\n", "\n", "[[10 0 0]\n", " [ 0 9 0]\n", " [ 0 0 11]]\n", "Fold 2\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 13\n", " 1 1.00 1.00 1.00 10\n", " 2 1.00 1.00 1.00 7\n", "\n", " accuracy 1.00 30\n", " macro avg 1.00 1.00 1.00 30\n", "weighted avg 1.00 1.00 1.00 30\n", "\n", "[[13 0 0]\n", " [ 0 10 
0]\n", " [ 0 0 7]]\n", "Fold 3\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 12\n", " 1 1.00 0.90 0.95 10\n", " 2 0.89 1.00 0.94 8\n", "\n", " accuracy 0.97 30\n", " macro avg 0.96 0.97 0.96 30\n", "weighted avg 0.97 0.97 0.97 30\n", "\n", "[[12 0 0]\n", " [ 0 9 1]\n", " [ 0 0 8]]\n", "Fold 4\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 8\n", " 1 1.00 0.80 0.89 10\n", " 2 0.86 1.00 0.92 12\n", "\n", " accuracy 0.93 30\n", " macro avg 0.95 0.93 0.94 30\n", "weighted avg 0.94 0.93 0.93 30\n", "\n", "[[ 8 0 0]\n", " [ 0 8 2]\n", " [ 0 0 12]]\n", "Fold 5\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 7\n", " 1 0.92 1.00 0.96 11\n", " 2 1.00 0.92 0.96 12\n", "\n", " accuracy 0.97 30\n", " macro avg 0.97 0.97 0.97 30\n", "weighted avg 0.97 0.97 0.97 30\n", "\n", "[[ 7 0 0]\n", " [ 0 11 0]\n", " [ 0 1 11]]\n", "AVG Accuracy: 0.9733333333333334 +- 0.024944382578492935\n", "With PCA: \n", "Fold 1\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 10\n", " 1 0.88 0.78 0.82 9\n", " 2 0.83 0.91 0.87 11\n", "\n", " accuracy 0.90 30\n", " macro avg 0.90 0.90 0.90 30\n", "weighted avg 0.90 0.90 0.90 30\n", "\n", "[[10 0 0]\n", " [ 0 7 2]\n", " [ 0 1 10]]\n", "Fold 2\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 13\n", " 1 0.89 0.80 0.84 10\n", " 2 0.75 0.86 0.80 7\n", "\n", " accuracy 0.90 30\n", " macro avg 0.88 0.89 0.88 30\n", "weighted avg 0.90 0.90 0.90 30\n", "\n", "[[13 0 0]\n", " [ 0 8 2]\n", " [ 0 1 6]]\n", "Fold 3\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 12\n", " 1 0.82 0.90 0.86 10\n", " 2 0.86 0.75 0.80 8\n", "\n", " accuracy 0.90 30\n", " macro avg 0.89 0.88 0.89 30\n", "weighted avg 0.90 0.90 0.90 30\n", "\n", "[[12 0 0]\n", " [ 0 9 1]\n", " [ 0 2 6]]\n", "Fold 4\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 8\n", " 1 0.78 0.70 0.74 10\n", " 2 0.77 0.83 0.80 12\n", "\n", " accuracy 0.83 30\n", " macro avg 
0.85 0.84 0.85 30\n", "weighted avg 0.83 0.83 0.83 30\n", "\n", "[[ 8 0 0]\n", " [ 0 7 3]\n", " [ 0 2 10]]\n", "Fold 5\n", " precision recall f1-score support\n", "\n", " 0 1.00 1.00 1.00 7\n", " 1 0.79 1.00 0.88 11\n", " 2 1.00 0.75 0.86 12\n", "\n", " accuracy 0.90 30\n", " macro avg 0.93 0.92 0.91 30\n", "weighted avg 0.92 0.90 0.90 30\n", "\n", "[[ 7 0 0]\n", " [ 0 11 0]\n", " [ 0 3 9]]\n", "AVG Accuracy: 0.8866666666666667 +- 0.02666666666666666\n", "Score difference: -0.08666666666666667\n" ] } ], "source": [ "from sklearn.model_selection import KFold\n", "from sklearn import metrics\n", "from sklearn.pipeline import make_pipeline\n", "\n", "\n", "def KFoldValidation(X, y, make_model=None, n_splits=5, random_state=42):\n", "    \"\"\"Run shuffled K-fold cross-validation and print a report for every fold.\n", "\n", "    Parameters\n", "    ----------\n", "    X : array of features, indexed by row.\n", "    y : array of class labels aligned with ``X``.\n", "    make_model : optional zero-argument callable returning a fresh, unfitted\n", "        estimator. Defaults to ``KNeighborsClassifier``. Pass a pipeline\n", "        factory so that preprocessing (scaling, PCA) is fitted on the\n", "        training part of each fold only.\n", "    n_splits, random_state : forwarded to ``KFold`` (shuffle is always on).\n", "\n", "    Returns\n", "    -------\n", "    The mean accuracy over the folds. The classification report and confusion\n", "    matrix of every fold, and the mean +- std accuracy, are printed.\n", "    \"\"\"\n", "    if make_model is None:\n", "        make_model = KNeighborsClassifier\n", "    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)\n", "    accs = []\n", "    for fold, (train, test) in enumerate(kfold.split(X, y), start=1):\n", "        print(\"Fold\", fold)\n", "        # A new estimator per fold: nothing learned on one split leaks into the next.\n", "        clf = make_model()\n", "        clf.fit(X[train], y[train])\n", "        y_pred = clf.predict(X[test])\n", "        y_true = y[test]\n", "\n", "        print(metrics.classification_report(y_true, y_pred))\n", "        print(metrics.confusion_matrix(y_true, y_pred))\n", "        accs.append(metrics.accuracy_score(y_true, y_pred))\n", "\n", "    mean_acc = np.mean(accs)\n", "    print(\"AVG Accuracy:\", mean_acc, \"+-\", np.std(accs))\n", "    return mean_acc\n", "\n", "\n", "def make_pca_knn():\n", "    \"\"\"Build an unfitted scale -> 1-component PCA -> k-NN pipeline.\"\"\"\n", "    return make_pipeline(StandardScaler(), PCA(n_components=1), KNeighborsClassifier())\n", "\n", "\n", "print(\"Without PCA: \")\n", "score = KFoldValidation(data, target)\n", "print(\"With PCA: \")\n", "# Pass the raw features: data_pca came from a scaler and PCA fitted on the whole\n", "# dataset, so every held-out fold had already influenced the projection.\n", "score_pca = KFoldValidation(data, target, make_model=make_pca_knn)\n", "\n", "print(f\"Score difference: {score_pca - score}\")" ] }, { "cell_type": "markdown", "id": "e429db79", "metadata": {}, "source": [ "As we can see with this dataset, the classifier trained on the principal components gives worse results than the same classifier trained on the \"normal\" training set" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python",
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" } }, "nbformat": 4, "nbformat_minor": 5 }