{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "israeli-vampire",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "french-withdrawal",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "106129\n"
     ]
    }
   ],
   "source": [
    "txt_file = open('../tempest.txt', \"r\")\n",
    "text = txt_file.read()\n",
    "text = [c.lower() for c in text]\n",
    "N = len(text)\n",
    "\n",
    "print(N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fresh-excerpt",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39\n"
     ]
    }
   ],
   "source": [
    "characters, occurrences = np.unique(text, return_counts=True)\n",
    "n = len(characters)\n",
    "print(n)\n",
    "probs = occurrences/N"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "interim-argument",
   "metadata": {},
   "outputs": [],
   "source": [
    "def entropy(p):\n",
    "    idx = np.where(p>0)\n",
    "    h=-np.sum(np.multiply(p[idx],np.log2(p[idx])))\n",
    "    return h\n",
    "\n",
    "def entropy_emp(p,n):\n",
    "    idx = np.where(p>0)\n",
    "    h=-np.sum(np.multiply(p[idx],np.log2(p[idx])))\n",
    "    eh=np.sqrt(np.sum(np.multiply(np.square(np.log2(np.e*p)), np.multiply(p,1-p))/n)) \n",
    "    return h,eh\n",
    "    \n",
    "    \n",
    "def KLdivergence(p,q):\n",
    "    idx = np.where(p>0)\n",
    "    return np.sum(np.nan_to_num(np.multiply(p[idx],np.log2(p[idx]))))-\\\n",
    "            np.sum(np.nan_to_num(np.multiply(p[idx],np.log2(q[idx]))))\n",
    "\n",
    "\n",
    "def mutual_information(pxy):\n",
    "    px=np.sum(pxy,axis=0)\n",
    "    py=np.sum(pxy,axis=1)\n",
    "    hx=entropy(px)\n",
    "    hy=entropy(py)\n",
    "    hxy=entropy(pxy.reshape(-1))\n",
    "    MI=hx+hy-hxy\n",
    "    hxcy=hx-MI\n",
    "    hycx=hy-MI\n",
    "    return MI,hx,hy\n",
    "\n",
    "\n",
    "\n",
    "def huffman(symbols,probs):\n",
    "    codewords = {}\n",
    "    \n",
    "    N=np.shape(probs)[0]\n",
    "\n",
    "    pup=np.copy(probs)\n",
    "    idxup=np.arange(N)\n",
    "    \n",
    "    \n",
    "    for n in range(N):    \n",
    "        codewords[symbols[n]] = []\n",
    "        \n",
    "        \n",
    "    for n in range(N-1):\n",
    "        idx1,idx2=np.argsort(pup)[:2]\n",
    "        \n",
    "        #print(idx1,idx2)\n",
    "        \n",
    "        idxm=min(idx1,idx2)\n",
    "        idxp=max(idx1,idx2)\n",
    "        \n",
    "        #print(idxm,idxp)\n",
    "        \n",
    "        idxl0=np.where(idxup==idx1)[0]\n",
    "        \n",
    "        for i in idxl0:\n",
    "            codewords[symbols[i]].insert(0,1)\n",
    "        \n",
    "        idxl1=np.where(idxup==idx2)[0]\n",
    "\n",
    "        for i in idxl1:\n",
    "            codewords[symbols[i]].insert(0,0)\n",
    "\n",
    "        idxl2=np.where(idxup==idxp)[0]\n",
    "        \n",
    "        idxup[idxl2]=idxm\n",
    "        \n",
    "        idxl3=np.where(idxup>idxp)[0]\n",
    "        idxup[idxl3]=idxup[idxl3]-1\n",
    "        \n",
    "        \n",
    "        #print(idxup)\n",
    "        \n",
    "        pup[idxm]=pup[idx1]+pup[idx2]\n",
    "        pup = np.delete(pup, idxp)\n",
    "        \n",
    "        #print(pup)\n",
    "        \n",
    "    \n",
    "    return codewords\n",
    "        \n",
    "        \n",
    "        \n",
    "    \n",
    "#test\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "sunrise-shore",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.229589052568906 0.00952034755131061\n"
     ]
    }
   ],
   "source": [
    "Hall, EHall = entropy_emp(probs,N)\n",
    "print(Hall,EHall)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "different-somerset",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'\\n': [0, 0, 1, 1, 0],\n",
      " ' ': [1, 0],\n",
      " '!': [0, 1, 0, 0, 0, 0, 0, 1, 0],\n",
      " '&': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1],\n",
      " \"'\": [0, 0, 0, 1, 0, 0, 1, 1],\n",
      " ',': [0, 1, 1, 1, 0, 0],\n",
      " '-': [0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
      " '.': [0, 1, 0, 0, 0, 0, 1],\n",
      " ':': [0, 1, 0, 0, 0, 0, 0, 0, 1],\n",
      " ';': [1, 1, 0, 1, 1, 0, 1, 0],\n",
      " '?': [1, 1, 0, 1, 1, 0, 1, 1, 0],\n",
      " '[': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1],\n",
      " ']': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0],\n",
      " 'a': [1, 1, 0, 0],\n",
      " 'b': [1, 1, 0, 1, 1, 1],\n",
      " 'c': [0, 1, 1, 1, 0, 1],\n",
      " 'd': [1, 1, 0, 1, 0],\n",
      " 'e': [0, 0, 1, 0],\n",
      " 'f': [0, 1, 1, 1, 1, 0],\n",
      " 'g': [0, 0, 0, 1, 0, 0, 0],\n",
      " 'h': [0, 0, 0, 1, 1],\n",
      " 'i': [1, 1, 1, 0],\n",
      " 'j': [1, 1, 0, 1, 1, 0, 1, 1, 1, 1],\n",
      " 'k': [1, 1, 0, 1, 1, 0, 0],\n",
      " 'l': [0, 1, 0, 0, 1],\n",
      " 'm': [0, 0, 1, 1, 1, 0],\n",
      " 'n': [1, 1, 1, 1],\n",
      " 'o': [0, 1, 1, 0],\n",
      " 'p': [0, 1, 1, 1, 1, 1],\n",
      " 'q': [0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0],\n",
      " 'r': [0, 0, 0, 0, 1],\n",
      " 's': [0, 0, 0, 0, 0],\n",
      " 't': [0, 1, 0, 1],\n",
      " 'u': [0, 0, 0, 1, 0, 1],\n",
      " 'v': [0, 0, 0, 1, 0, 0, 1, 0],\n",
      " 'w': [0, 0, 1, 1, 1, 1],\n",
      " 'x': [1, 1, 0, 1, 1, 0, 1, 1, 1, 0],\n",
      " 'y': [0, 1, 0, 0, 0, 1],\n",
      " 'z': [0, 1, 0, 0, 0, 0, 0, 1, 1, 1]}\n"
     ]
    }
   ],
   "source": [
    "import pprint\n",
    "\n",
    "code=huffman(characters,probs)\n",
    "\n",
    "\n",
    "pprint.pprint(code)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "latter-hometown",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[5, 2, 9, 13, 8, 6, 9, 7, 9, 8, 9, 12, 13, 4, 6, 6, 5, 4, 6, 7, 5, 4, 10, 7, 5, 6, 4, 4, 6, 11, 5, 5, 4, 6, 8, 6, 10, 6, 10]\n",
      "4.262689745498403\n"
     ]
    }
   ],
   "source": [
    "ll = [len(c) for c in code.values()]\n",
    "\n",
    "print(ll)\n",
    "\n",
    "ave_len = np.sum(np.multiply(np.array(ll),np.array(probs)))\n",
    "\n",
    "print(ave_len)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "decent-fountain",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}