{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "israeli-vampire", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 6, "id": "french-withdrawal", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "106129\n" ] } ], "source": [ "txt_file = open('../tempest.txt', \"r\")\n", "text = txt_file.read()\n", "text = [c.lower() for c in text]\n", "N = len(text)\n", "\n", "print(N)" ] }, { "cell_type": "code", "execution_count": 7, "id": "fresh-excerpt", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "39\n" ] } ], "source": [ "characters, occurrences = np.unique(text, return_counts=True)\n", "n = len(characters)\n", "print(n)\n", "probs = occurrences/N" ] }, { "cell_type": "code", "execution_count": 22, "id": "interim-argument", "metadata": {}, "outputs": [], "source": [ "def entropy(p):\n", " idx = np.where(p>0)\n", " h=-np.sum(np.multiply(p[idx],np.log2(p[idx])))\n", " return h\n", "\n", "def entropy_emp(p,n):\n", " idx = np.where(p>0)\n", " h=-np.sum(np.multiply(p[idx],np.log2(p[idx])))\n", " eh=np.sqrt(np.sum(np.multiply(np.square(np.log2(np.e*p)), np.multiply(p,1-p))/n)) \n", " return h,eh\n", " \n", " \n", "def KLdivergence(p,q):\n", " idx = np.where(p>0)\n", " return np.sum(np.nan_to_num(np.multiply(p[idx],np.log2(p[idx]))))-\\\n", " np.sum(np.nan_to_num(np.multiply(p[idx],np.log2(q[idx]))))\n", "\n", "\n", "def mutual_information(pxy):\n", " px=np.sum(pxy,axis=0)\n", " py=np.sum(pxy,axis=1)\n", " hx=entropy(px)\n", " hy=entropy(py)\n", " hxy=entropy(pxy.reshape(-1))\n", " MI=hx+hy-hxy\n", " hxcy=hx-MI\n", " hycx=hy-MI\n", " return MI,hx,hy\n", "\n", "\n", "\n", "def huffman(symbols,probs):\n", " codewords = {}\n", " \n", " N=np.shape(probs)[0]\n", "\n", " pup=np.copy(probs)\n", " idxup=np.arange(N)\n", " \n", " \n", " for n in range(N): \n", " codewords[symbols[n]] = []\n", " \n", " \n", " for n in range(N-1):\n", " idx1,idx2=np.argsort(pup)[:2]\n", " \n", " #print(idx1,idx2)\n", " \n", " idxm=min(idx1,idx2)\n", " idxp=max(idx1,idx2)\n", " \n", " #print(idxm,idxp)\n", " \n", " idxl0=np.where(idxup==idx1)[0]\n", " \n", " for i in idxl0:\n", " codewords[symbols[i]].insert(0,1)\n", " \n", " idxl1=np.where(idxup==idx2)[0]\n", "\n", " for i in idxl1:\n", " codewords[symbols[i]].insert(0,0)\n", "\n", " idxl2=np.where(idxup==idxp)[0]\n", " \n", " idxup[idxl2]=idxm\n", " \n", " idxl3=np.where(idxup>idxp)[0]\n", " idxup[idxl3]=idxup[idxl3]-1\n", " \n", " \n", " #print(idxup)\n", " \n", " pup[idxm]=pup[idx1]+pup[idx2]\n", " pup = np.delete(pup, idxp)\n", " \n", " #print(pup)\n", " \n", " \n", " return codewords\n", " \n", " \n", " \n", " \n", "#test\n", " \n" ] }, { "cell_type": "code", "execution_count": 23, "id": "sunrise-shore", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.229589052568906 0.00952034755131061\n" ] } ], "source": [ "Hall, EHall = entropy_emp(probs,N)\n", "print(Hall,EHall)" ] }, { "cell_type": "code", "execution_count": 37, "id": "different-somerset", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'\\n': [0, 0, 1, 1, 0],\n", " ' ': [1, 0],\n", " '!': [0, 1, 0, 0, 0, 0, 0, 1, 0],\n", " '&': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1],\n", " \"'\": [0, 0, 0, 1, 0, 0, 1, 1],\n", " ',': [0, 1, 1, 1, 0, 0],\n", " '-': [0, 1, 0, 0, 0, 0, 0, 0, 0],\n", " '.': [0, 1, 0, 0, 0, 0, 1],\n", " ':': [0, 1, 0, 0, 0, 0, 0, 0, 1],\n", " ';': [1, 1, 0, 1, 1, 0, 1, 0],\n", " '?': [1, 1, 0, 1, 1, 0, 1, 1, 0],\n", " '[': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],\n", " ']': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0],\n", " 'a': [1, 1, 0, 0],\n", " 'b': [1, 1, 0, 1, 1, 1],\n", " 'c': [0, 1, 1, 1, 0, 1],\n", " 'd': [1, 1, 0, 1, 0],\n", " 'e': [0, 0, 1, 0],\n", " 'f': [0, 1, 1, 1, 1, 0],\n", " 'g': [0, 0, 0, 1, 0, 0, 0],\n", " 'h': [0, 0, 0, 1, 1],\n", " 'i': [1, 1, 1, 0],\n", " 'j': [1, 1, 0, 1, 1, 0, 1, 1, 1, 1],\n", " 'k': [1, 1, 0, 1, 1, 0, 0],\n", " 'l': [0, 1, 0, 0, 1],\n", " 'm': [0, 0, 1, 1, 1, 0],\n", " 'n': [1, 1, 1, 1],\n", " 'o': [0, 1, 1, 0],\n", " 'p': [0, 1, 1, 1, 1, 1],\n", " 'q': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],\n", " 'r': [0, 0, 0, 0, 1],\n", " 's': [0, 0, 0, 0, 0],\n", " 't': [0, 1, 0, 1],\n", " 'u': [0, 0, 0, 1, 0, 1],\n", " 'v': [0, 0, 0, 1, 0, 0, 1, 0],\n", " 'w': [0, 0, 1, 1, 1, 1],\n", " 'x': [1, 1, 0, 1, 1, 0, 1, 1, 1, 0],\n", " 'y': [0, 1, 0, 0, 0, 1],\n", " 'z': [0, 1, 0, 0, 0, 0, 0, 1, 1, 1]}\n" ] } ], "source": [ "import pprint\n", "\n", "code=huffman(characters,probs)\n", "\n", "\n", "pprint.pprint(code)" ] }, { "cell_type": "code", "execution_count": 32, "id": "latter-hometown", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[5, 2, 9, 13, 8, 6, 9, 7, 9, 8, 9, 12, 13, 4, 6, 6, 5, 4, 6, 7, 5, 4, 10, 7, 5, 6, 4, 4, 6, 11, 5, 5, 4, 6, 8, 6, 10, 6, 10]\n", "4.262689745498403\n" ] } ], "source": [ "ll = [len(c) for c in code.values()]\n", "\n", "print(ll)\n", "\n", "ave_len = np.sum(np.multiply(np.array(ll),np.array(probs)))\n", "\n", "print(ave_len)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "decent-fountain", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }