{ "cells": [ { "cell_type": "code", "execution_count": 13, "id": "israeli-vampire", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 14, "id": "french-withdrawal", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "106129\n" ] } ], "source": [ "txt_file = open('../tempest.txt', \"r\")\n", "text = txt_file.read()\n", "text = [c.lower() for c in text]\n", "N = len(text)\n", "\n", "print(N)" ] }, { "cell_type": "code", "execution_count": 15, "id": "fresh-excerpt", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "39\n" ] } ], "source": [ "characters, occurrences = np.unique(text, return_counts=True)\n", "n = len(characters)\n", "print(n)\n", "probs = occurrences/N" ] }, { "cell_type": "code", "execution_count": 20, "id": "interim-argument", "metadata": {}, "outputs": [], "source": [ "def entropy(p):\n", " idx = np.where(p>0)\n", " h=-np.sum(np.multiply(p[idx],np.log2(p[idx])))\n", " return h\n", "\n", "def entropy_emp(p,n):\n", " idx = np.where(p>0)\n", " h=-np.sum(np.multiply(p[idx],np.log2(p[idx])))\n", " eh=np.sqrt(np.sum(np.multiply(np.square(np.log2(np.e*p)), np.multiply(p,1-p))/n)) \n", " return h,eh\n", " \n", " \n", "def KLdivergence(p,q):\n", " idx = np.where(p>0)\n", " return np.sum(np.nan_to_num(np.multiply(p[idx],np.log2(p[idx]))))-\\\n", " np.sum(np.nan_to_num(np.multiply(p[idx],np.log2(q[idx]))))\n", "\n", "\n", "def mutual_information(pxy):\n", " px=np.sum(pxy,axis=0)\n", " py=np.sum(pxy,axis=1)\n", " hx=entropy(px)\n", " hy=entropy(py)\n", " hxy=entropy(pxy.reshape(-1))\n", " MI=hx+hy-hxy\n", " hxcy=hx-MI\n", " hycx=hy-MI\n", " return MI,hx,hy\n", "\n" ] }, { "cell_type": "code", "execution_count": 21, "id": "sunrise-shore", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.229589052568906 0.00952034755131061\n" ] } ], "source": [ "Hall, EHall = entropy_emp(probs,N)\n", "print(Hall,EHall)" ] }, { "cell_type": "code", "execution_count": 22, "id": "entertaining-guest", "metadata": {}, "outputs": [], "source": [ "L=100\n", "auto_mi = np.zeros(L)\n", "\n", "\n", "for l in range(1,L+1):\n", " text1=text[:-l]\n", " text2=text[l:]\n", " text3=[text1[i]+text2[i] for i in range(N-l)]\n", " \n", " character_pairs, occurrences_pairs_nonzero = np.unique(text3, return_counts=True)\n", " \n", " occurrences_pairs=np.zeros((n,n))\n", " \n", " for i in range(n):\n", " ci=characters[i]\n", " for j in range(n):\n", " cj=characters[j]\n", " c=ci+cj\n", " idx = np.where(character_pairs==c)[0]\n", " if len(idx)>0:\n", " occurrences_pairs[i,j]=np.squeeze(occurrences_pairs_nonzero[idx])\n", "\n", " occurrences_pairs=occurrences_pairs/(N-l)\n", " \n", " auto_mi[l-1],hx,hy = mutual_information(occurrences_pairs)\n", " \n" ] }, { "cell_type": "code", "execution_count": 28, "id": "jewish-laugh", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.8218745848897342\n", "0.3370541214802376\n", "0.18309971372248057\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.plot(auto_mi)\n", "plt.xticks(size=15)\n", "plt.yticks(size=15)\n", "plt.xlabel(\"l\",size=15)\n", "plt.ylabel(\"MI\",size=15)\n", "plt.xlim(0,20)\n", "plt.grid()\n", "\n", "print(auto_mi[0])\n", "print(auto_mi[1])\n", "print(auto_mi[2])" ] }, { "cell_type": "code", "execution_count": 34, "id": "worthy-strengthening", "metadata": {}, "outputs": [], "source": [ "text_subsampled = [text[i] for i in range(1,N,10)]" ] }, { "cell_type": "code", "execution_count": 47, "id": "engaged-manhattan", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.229589052568906 4.22939412884961 0.00952034755131061 0.030112454202498615\n" ] } ], "source": [ "characters_sub, occurrences_sub = np.unique(text_subsampled, return_counts=True)\n", "Nsub = len(text_subsampled)\n", "probs = occurrences_sub/Nsub\n", "\n", "\n", "Hsub, EHsub = entropy_emp(probs,Nsub)\n", "print(Hall,Hsub,EHall,EHsub)" ] }, { "cell_type": "code", "execution_count": null, "id": "different-somerset", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }