Created using Colaboratory

2023-10-22 18:51:09 +01:00
parent 9acd27c8b3
commit 6ac37b5548
1 changed files with 521 additions and 0 deletions
@@ -0,0 +1,521 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyNEAhORON7DFN1dZMhDK/PO",
+      "include_colab_link": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap19/19_4_Temporal_Difference_Methods.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# **Notebook 19.4: Temporal difference methods**\n",
+        "\n",
+        "This notebook investigates temporal difference methods for tabular reinforcement learning as described in section 19.3.3 of the book.\n",
+        "\n",
+        "Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
+        "\n",
+        "Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions."
+      ],
+      "metadata": {
+        "id": "t9vk9Elugvmi"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import numpy as np\n",
+        "import matplotlib.pyplot as plt\n",
+        "from PIL import Image"
+      ],
+      "metadata": {
+        "id": "OLComQyvCIJ7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Get local copies of components of images\n",
+        "# (-nc / --no-clobber skips files that are already present, so re-running this cell\n",
+        "#  does not pile up duplicate copies such as Empty.png.1)\n",
+        "!wget -nc https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Empty.png\n",
+        "!wget -nc https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Hole.png\n",
+        "!wget -nc https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Fish.png\n",
+        "!wget -nc https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Penguin.png"
+      ],
+      "metadata": {
+        "id": "ZsvrUszPLyEG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Ugly class that takes care of drawing pictures like in the book.\n",
+        "# You can totally ignore this code!\n",
+        "class DrawMDP:\n",
+        "  # Constructor initializes parameters\n",
+        "  #   n_row, n_col -- size of the grid world in cells\n",
+        "  # Loads the four tile images (Empty.png, Hole.png, Fish.png, Penguin.png) from the\n",
+        "  # current working directory, so the download cell must have been run first.\n",
+        "  # draw() pastes each of them into an 83x83 slot of a 4-channel image.\n",
+        "  # Also creates a new matplotlib figure and axes and, further down, builds\n",
+        "  # self.colormap as one row of [r,g,b] values in 0..1 per hex colour listed below.\n",
+        "  def __init__(self, n_row, n_col):\n",
+        "    self.empty_image = np.asarray(Image.open('Empty.png'))\n",
+        "    self.hole_image = np.asarray(Image.open('Hole.png'))\n",
+        "    self.fish_image = np.asarray(Image.open('Fish.png'))\n",
+        "    self.penguin_image = np.asarray(Image.open('Penguin.png'))\n",
+        "    # Every drawing method of this class plots onto self.ax\n",
+        "    self.fig,self.ax = plt.subplots()\n",
+        "    self.n_row = n_row\n",
+        "    self.n_col = n_col\n",
+        "\n",
+        "    my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 
'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
+        "    my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
+        "    r = np.floor(my_colormap_vals_dec/(256*256))\n",
+        "    g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
+        "    b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
+        "    self.colormap = np.vstack((r,g,b)).transpose()/255.0\n",
+        "\n",
+        "\n",
+        "  # Write bold text inside the grid cell at (row, col); every cell is 83 pixels square.\n",
+        "  # position selects the anchor inside the cell:\n",
+        "  #   'bc' bottom centre, 'tc' top centre, 'lc' left centre (rotated 90),\n",
+        "  #   'rc' right centre (rotated -90), 'tl' top left, 'tr' top right.\n",
+        "  # Any other value of position draws nothing.\n",
+        "  def draw_text(self, text, row, col, position, color):\n",
+        "    cell_left = 83 * col\n",
+        "    cell_top = 83 * row\n",
+        "    font = dict(color=color, fontweight='bold')\n",
+        "    if position == 'bc':\n",
+        "      self.ax.text(cell_left + 41, 83 * (row + 1) - 5, text, horizontalalignment='center', **font)\n",
+        "    elif position == 'tc':\n",
+        "      self.ax.text(cell_left + 41, cell_top + 10, text, horizontalalignment='center', **font)\n",
+        "    elif position == 'lc':\n",
+        "      self.ax.text(cell_left + 2, cell_top + 41, text, verticalalignment='center', rotation=90, **font)\n",
+        "    elif position == 'rc':\n",
+        "      self.ax.text(83 * (col + 1) - 5, cell_top + 41, text, horizontalalignment='right', verticalalignment='center', rotation=-90, **font)\n",
+        "    elif position == 'tl':\n",
+        "      self.ax.text(cell_left + 5, cell_top + 5, text, verticalalignment='top', horizontalalignment='left', **font)\n",
+        "    elif position == 'tr':\n",
+        "      self.ax.text(83 * (col + 1) - 5, cell_top + 5, text, verticalalignment='top', horizontalalignment='right', **font)\n",
+        "\n",
+        "  # Draws a set of states as a trajectory: one line segment per consecutive pair of\n",
+        "  # states in path, joining the centres of the two cells.  Segment k is shifted by k\n",
+        "  # pixels in x and y, and its colour is taken from self.colormap, starting at the\n",
+        "  # first row and moving towards the last as k grows.\n",
+        "  # color1 and color2 are accepted but not used.\n",
+        "  def draw_path(self, path, color1, color2):\n",
+        "    n_segments = len(path) - 1\n",
+        "    for step, (state_from, state_to) in enumerate(zip(path[:-1], path[1:])):\n",
+        "      # Convert the flat state indices to (row, column) grid positions\n",
+        "      row_from = np.floor(state_from/self.n_col)\n",
+        "      row_to = np.floor(state_to/self.n_col)\n",
+        "      col_from = state_from - row_from * self.n_col\n",
+        "      col_to = state_to - row_to * self.n_col\n",
+        "\n",
+        "      xs = [col_from * 83+41 + step, col_to * 83+41 + step]\n",
+        "      ys = [row_from * 83+41 + step, row_to * 83+41 + step]\n",
+        "      shade = int(np.floor(255 * step/float(n_segments)))\n",
+        "      self.ax.plot(xs, ys, color=tuple(self.colormap[shade, :3]))\n",
+        "\n",
+        "\n",
+        "  # Draw deterministic policy: one cyan triangle at the centre of the cell of state i,\n",
+        "  # pointing in the direction of action (0 = up, 1 = right, 2 = down, 3 = left)\n",
+        "  def draw_deterministic_policy(self,i, action):\n",
+        "    grid_row = np.floor(i/self.n_col)\n",
+        "    grid_col = i - grid_row * self.n_col\n",
+        "    cx = 83 * grid_col + 41\n",
+        "    cy = 83 * grid_row + 41\n",
+        "    # Half extents of the triangle: the base is 10 pixels wide, tip to base is 15 pixels\n",
+        "    half_base = 10/2\n",
+        "    half_height = 15/2\n",
+        "    # Each triangle is listed as [tip, base corner, base corner]\n",
+        "    if action ==0:\n",
+        "      # Upward (image y grows downward, so the tip has the smaller y)\n",
+        "      triangle = np.array([[cx, cy-half_height],\n",
+        "                           [cx - half_base, cy+half_height],\n",
+        "                           [cx + half_base, cy+half_height]])\n",
+        "    elif action ==1:\n",
+        "      # Right\n",
+        "      triangle = np.array([[cx + half_height, cy],\n",
+        "                           [cx - half_height, cy-half_base],\n",
+        "                           [cx - half_height, cy+half_base]])\n",
+        "    elif action ==2:\n",
+        "      # Downward\n",
+        "      triangle = np.array([[cx, cy+half_height],\n",
+        "                           [cx - half_base, cy-half_height],\n",
+        "                           [cx + half_base, cy-half_height]])\n",
+        "    elif action ==3:\n",
+        "      # Left\n",
+        "      triangle = np.array([[cx - half_height, cy],\n",
+        "                           [cx + half_height, cy-half_base],\n",
+        "                           [cx + half_height, cy+half_base]])\n",
+        "    self.ax.fill(triangle[:,0], triangle[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
+        "\n",
+        "  # Draw stochastic policy: four cyan triangles in the cell of state i, one per action\n",
+        "  # (0 = up, 1 = right, 2 = down, 3 = left).  Each triangle sits 20 pixels from the cell\n",
+        "  # centre in its own direction and is scaled by that action's probability.\n",
+        "  def draw_stochastic_policy(self,i, action_probs):\n",
+        "    grid_row = np.floor(i/self.n_col)\n",
+        "    grid_col = i - grid_row * self.n_col\n",
+        "    mid_x = 83 * grid_col + 41\n",
+        "    mid_y = 83 * grid_row + 41\n",
+        "    offset = 20\n",
+        "\n",
+        "    # Fill one triangle given its tip and its two base corners\n",
+        "    def fill_triangle(tip, corner_a, corner_b):\n",
+        "      vertices = np.array([tip, corner_a, corner_b])\n",
+        "      self.ax.fill(vertices[:,0], vertices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
+        "\n",
+        "    # Upward arrow, above the cell centre\n",
+        "    half_w = 15 * action_probs[0] / 2\n",
+        "    half_h = 20 * action_probs[0] / 2\n",
+        "    cx, cy = mid_x, mid_y - offset\n",
+        "    fill_triangle([cx, cy-half_h], [cx - half_w, cy+half_h], [cx + half_w, cy+half_h])\n",
+        "\n",
+        "    # Right arrow, to the right of the cell centre\n",
+        "    half_w = 15 * action_probs[1] / 2\n",
+        "    half_h = 20 * action_probs[1] / 2\n",
+        "    cx, cy = mid_x + offset, mid_y\n",
+        "    fill_triangle([cx + half_h, cy], [cx - half_h, cy-half_w], [cx - half_h, cy+half_w])\n",
+        "\n",
+        "    # Downward arrow, below the cell centre\n",
+        "    half_w = 15 * action_probs[2] / 2\n",
+        "    half_h = 20 * action_probs[2] / 2\n",
+        "    cx, cy = mid_x, mid_y + offset\n",
+        "    fill_triangle([cx, cy+half_h], [cx - half_w, cy-half_h], [cx + half_w, cy-half_h])\n",
+        "\n",
+        "    # Left arrow, to the left of the cell centre\n",
+        "    half_w = 15 * action_probs[3] / 2\n",
+        "    half_h = 20 * action_probs[3] / 2\n",
+        "    cx, cy = mid_x - offset, mid_y\n",
+        "    fill_triangle([cx - half_h, cy], [cx + half_h, cy-half_w], [cx + half_h, cy+half_w])\n",
+        "\n",
+        "\n",
+        "  # Render the grid world plus any requested overlays on this object's axes and show it.\n",
+        "  #   layout              -- flat numpy array with one code per cell in row-major order:\n",
+        "  #                          0 = empty, 1 = hole, anything else = fish\n",
+        "  #   state               -- if given, the penguin is drawn in this cell\n",
+        "  #   draw_state_index    -- if True, write each cell's index in its top-left corner\n",
+        "  #   rewards             -- per-cell rewards, written in red in the top-right corner\n",
+        "  #   policy              -- one action per cell (deterministic), or an array indexed\n",
+        "  #                          [action, cell] of action probabilities (stochastic)\n",
+        "  #   state_values        -- per-cell values, written at the bottom centre of each cell\n",
+        "  #   state_action_values -- array indexed [action, cell]; actions 0-3 are written at the\n",
+        "  #                          top, right, bottom and left of each cell\n",
+        "  #   path1               -- sequence of states drawn as a trajectory\n",
+        "  #   path2               -- accepted but not used\n",
+        "  def draw(self, layout, state=None, draw_state_index= False, rewards=None, policy=None, state_values=None, state_action_values=None,path1=None, path2 = None):\n",
+        "    # Construct the image one 83x83 tile at a time; the penguin replaces whatever\n",
+        "    # tile the layout asks for in the current state's cell\n",
+        "    image_out = np.zeros((self.n_row * 83, self.n_col * 83, 4),dtype='uint8')\n",
+        "    for c_row in range (self.n_row):\n",
+        "      for c_col in range(self.n_col):\n",
+        "        c_cell = c_row * self.n_col + c_col\n",
+        "        if state is not None and state == c_cell:\n",
+        "          tile = self.penguin_image\n",
+        "        elif layout[c_cell]==0:\n",
+        "          tile = self.empty_image\n",
+        "        elif layout[c_cell]==1:\n",
+        "          tile = self.hole_image\n",
+        "        else:\n",
+        "          tile = self.fish_image\n",
+        "        image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = tile\n",
+        "\n",
+        "    # Draw the image on this object's own axes.  plt.imshow would draw on whichever\n",
+        "    # axes pyplot currently considers active, which is only self.ax if no other figure\n",
+        "    # has been created since the constructor ran.\n",
+        "    self.ax.imshow(image_out)\n",
+        "    self.ax.get_xaxis().set_visible(False)\n",
+        "    self.ax.get_yaxis().set_visible(False)\n",
+        "    for side in ('top', 'right', 'bottom', 'left'):\n",
+        "      self.ax.spines[side].set_visible(False)\n",
+        "\n",
+        "    # Write text in the cell with flat index c_cell\n",
+        "    def label_cell(c_cell, text, position, color):\n",
+        "      row = np.floor(c_cell/self.n_col)\n",
+        "      self.draw_text(text, row, c_cell-row*self.n_col, position, color)\n",
+        "\n",
+        "    if draw_state_index:\n",
+        "      for c_cell in range(layout.size):\n",
+        "        label_cell(c_cell, '%d'%(c_cell), 'tl', 'k')\n",
+        "\n",
+        "    # Draw the policy as triangles\n",
+        "    if policy is not None:\n",
+        "      # One entry per state means the policy is deterministic ...\n",
+        "      if len(policy) == len(layout):\n",
+        "        for i in range(len(layout)):\n",
+        "          self.draw_deterministic_policy(i, policy[i])\n",
+        "      # ... otherwise column i holds the action probabilities for state i\n",
+        "      else:\n",
+        "        for i in range(len(layout)):\n",
+        "          self.draw_stochastic_policy(i,policy[:,i])\n",
+        "\n",
+        "    if path1 is not None:\n",
+        "      self.draw_path(path1, np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 1.0]))\n",
+        "\n",
+        "    if rewards is not None:\n",
+        "      for c_cell in range(layout.size):\n",
+        "        label_cell(c_cell, '%d'%(rewards[c_cell]), 'tr', 'r')\n",
+        "\n",
+        "    if state_values is not None:\n",
+        "      for c_cell in range(layout.size):\n",
+        "        label_cell(c_cell, '%2.2f'%(state_values[c_cell]), 'bc', 'black')\n",
+        "\n",
+        "    if state_action_values is not None:\n",
+        "      # Each action's value is written on the side of the cell that the action points to\n",
+        "      for c_cell in range(layout.size):\n",
+        "        for action, position in enumerate(('tc', 'rc', 'bc', 'lc')):\n",
+        "          label_cell(c_cell, '%2.2f'%(state_action_values[action, c_cell]), position, 'black')\n",
+        "\n",
+        "    plt.show()"
+      ],
+      "metadata": {
+        "id": "Gq1HfJsHN3SB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# We're going to work on the problem depicted in figure 19.10a:\n",
+        "# a 4x4 grid, flattened row by row into 16 states\n",
+        "n_rows, n_cols = 4, 4\n",
+        "hole_states = [9, 10, 14]\n",
+        "fish_state = 15\n",
+        "\n",
+        "# Layout codes understood by DrawMDP.draw: 0 = empty, 1 = hole, anything else = fish\n",
+        "layout = np.zeros(n_rows * n_cols)\n",
+        "layout[hole_states] = 1\n",
+        "layout[fish_state] = 2\n",
+        "\n",
+        "# Reward attached to each state: -2 for the holes, +3 for the fish, 0 elsewhere\n",
+        "reward_structure = np.zeros(n_rows * n_cols)\n",
+        "reward_structure[hole_states] = -2\n",
+        "reward_structure[fish_state] = 3\n",
+        "\n",
+        "initial_state = 0\n",
+        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
+        "mdp_drawer.draw(layout, state = initial_state, rewards=reward_structure, draw_state_index = True)"
+      ],
+      "metadata": {
+        "id": "eBQ7lTpJQBSe"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "For clarity, the black numbers are the state indices and the red numbers are the rewards for being in each state.  Note that the states are indexed from 0 rather than 1 as in the book to make the code neater."
+      ],
+      "metadata": {
+        "id": "6Vku6v_se2IG"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Now let's define the state transition function $Pr(s_{t+1}|s_{t},a)$ in full where $a$ is the action.  Here $a=0$ means try to go upward, $a=1$ right, $a=2$ down, and $a=3$ left.  However, the ice is slippery, so we don't always go in the direction we want to.\n",
+        "\n",
+        "Note that as for the states, we've indexed the actions from zero (unlike in the book) so they map to the indices of arrays better"
+      ],
+      "metadata": {
+        "id": "Fhc6DzZNOjiC"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "transition_probabilities_given_action0 = np.array(\\\n",
+        "[[0.00 , 0.33, 0.00, 0.00,  0.50, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.50 , 0.00, 0.33, 0.00,  0.00, 0.50, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.33, 0.00, 0.50,  0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.33, 0.00,  0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.50 , 0.00, 0.00, 0.00,  0.00, 0.17, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.34, 0.00, 0.00,  0.25, 0.00, 0.17, 0.00,   0.00, 0.50, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.34, 0.00,  0.00, 0.17, 0.00, 0.25,   0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.50,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.17, 0.00, 0.00,   0.75, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.16, 0.00, 0.00,   0.25, 0.00, 0.17, 0.00,   0.00, 0.50, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.16, 0.00,   0.00, 0.17, 0.00, 0.25,   0.00, 0.00, 0.50, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.75 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.16, 0.00, 0.00,   0.25, 0.00, 0.25, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.16, 0.00,   0.00, 0.25, 0.00, 0.25 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.25, 0.00 ],\n",
+        "])\n",
+        "\n",
+        "transition_probabilities_given_action1 = np.array(\\\n",
+        "[[0.00 , 0.25, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.75 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.50, 0.00, 0.50,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.50, 0.00,  0.00, 0.00, 0.00, 0.33,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.25 , 0.00, 0.00, 0.00,  0.00, 0.17, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.25, 0.00, 0.00,  0.50, 0.00, 0.17, 0.00,   0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.25, 0.00,  0.00, 0.50, 0.00, 0.33,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.50,  0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.33,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.17, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.16, 0.00, 0.00,   0.50, 0.00, 0.17, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.16, 0.00,   0.00, 0.50, 0.00, 0.33,   0.00, 0.00, 0.25, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.34,   0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.50 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.16, 0.00, 0.00,   0.75, 0.00, 0.25, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.16, 0.00,   0.00, 0.50, 0.00, 0.50 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.34,   0.00, 0.00, 0.50, 0.00 ],\n",
+        "])\n",
+        "\n",
+        "transition_probabilities_given_action2 = np.array(\\\n",
+        "[[0.00 , 0.25, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.25 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.25, 0.00, 0.25,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.25, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.75 , 0.00, 0.00, 0.00,  0.00, 0.17, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.50, 0.00, 0.00,  0.25, 0.00, 0.17, 0.00,   0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.50, 0.00,  0.00, 0.16, 0.00, 0.25,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.75,  0.00, 0.00, 0.16, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.50, 0.00, 0.00, 0.00,   0.00, 0.17, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.50, 0.00, 0.00,   0.25, 0.00, 0.17, 0.00,   0.00, 0.33, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.50, 0.00,   0.00, 0.16, 0.00, 0.25,   0.00, 0.00, 0.33, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.16, 0.00,   0.00, 0.00, 0.00, 0.50 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00,   0.00, 0.33, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.50, 0.00, 0.00,   0.50, 0.00, 0.33, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.50, 0.00,   0.00, 0.34, 0.00, 0.50 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.34, 0.00 ],\n",
+        "])\n",
+        "\n",
+        "transition_probabilities_given_action3 = np.array(\\\n",
+        "[[0.00 , 0.25, 0.00, 0.00,  0.33, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.50 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.50, 0.00, 0.75,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.50, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.50 , 0.00, 0.00, 0.00,  0.00, 0.50, 0.00, 0.00,   0.33, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.25, 0.00, 0.00,  0.33, 0.00, 0.50, 0.00,   0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.50,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.25,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.34, 0.00, 0.00, 0.00,   0.00, 0.50, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.16, 0.00, 0.00,   0.33, 0.00, 0.50, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.16, 0.00,   0.00, 0.17, 0.00, 0.50,   0.00, 0.00, 0.25, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.25 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.34, 0.00, 0.00, 0.00,   0.00, 0.50, 0.00, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.16, 0.00, 0.00,   0.50, 0.00, 0.50, 0.00 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.16, 0.00,   0.00, 0.25, 0.00, 0.75 ],\n",
+        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.25, 0.00 ],\n",
+        "])\n",
+        "\n",
+        "# Stack the four per-action matrices along a new last axis to get one three\n",
+        "# dimensional array indexed [new state, current state, action], so that\n",
+        "# Pr(s_{t+1}=2|s_{t}=1, a_{t}=3) is stored at position [2,1,3]\n",
+        "transition_probabilities_given_action = np.stack((transition_probabilities_given_action0,\n",
+        "                                                  transition_probabilities_given_action1,\n",
+        "                                                  transition_probabilities_given_action2,\n",
+        "                                                  transition_probabilities_given_action3), axis=2)"
+      ],
+      "metadata": {
+        "id": "l7rT78BbOgTi"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Exercise: perform one update of the table of state-action values and return the result.\n",
+        "#   state_action_values -- array indexed [action, state]; the cells below create it with\n",
+        "#                          shape (n_action, n_state)\n",
+        "#   reward              -- reward returned by markov_decision_process_step for this step\n",
+        "#   state, new_state    -- state before and after the step\n",
+        "#   action              -- index of the action that was taken\n",
+        "#   gamma               -- passed in by the cells below as 0.9 (presumably the discount factor)\n",
+        "#   alpha               -- defaults to 0.1 (presumably the learning rate)\n",
+        "# The cells below read the updated entry at [action, state] of the returned array.\n",
+        "# As provided, this is a placeholder that returns an unchanged copy of the input.\n",
+        "def q_learning_step(state_action_values, reward, state, new_state, action, gamma, alpha = 0.1):\n",
+        "  # TODO -- write this function\n",
+        "  # Replace this line\n",
+        "  state_action_values_after = np.copy(state_action_values)\n",
+        "\n",
+        "  return state_action_values_after"
+      ],
+      "metadata": {
+        "id": "5pO6-9ACWhiV"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# This takes a single step from an MDP which just has a completely random policy\n",
+        "#   state                                 -- index of the current state\n",
+        "#   transition_probabilities_given_action -- array indexed [new state, state, action]\n",
+        "#   reward_structure                      -- reward attached to each state\n",
+        "# Returns (new_state, reward, action).\n",
+        "def markov_decision_process_step(state, transition_probabilities_given_action, reward_structure):\n",
+        "  n_state = transition_probabilities_given_action.shape[0]\n",
+        "  n_action = transition_probabilities_given_action.shape[2]\n",
+        "  # Pick action uniformly at random.  The number of actions is read from the transition\n",
+        "  # array instead of being fixed at 4, so MDPs with a different number of actions work too\n",
+        "  action = np.random.randint(n_action)\n",
+        "  # Update the state by sampling from Pr(new state | state, action)\n",
+        "  new_state = np.random.choice(a=np.arange(0,n_state),p = transition_probabilities_given_action[:,state,action])\n",
+        "  # Return the reward -- here the reward is for leaving the state\n",
+        "  reward = reward_structure[state]\n",
+        "\n",
+        "  return new_state, reward, action"
+      ],
+      "metadata": {
+        "id": "akjrncMF-FkU"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Initialize the state-action values to random numbers\n",
+        "np.random.seed(0)\n",
+        "n_state = transition_probabilities_given_action.shape[0]\n",
+        "n_action = transition_probabilities_given_action.shape[2]\n",
+        "state_action_values = np.random.normal(size=(n_action, n_state))\n",
+        "gamma = 0.9\n",
+        "\n",
+        "# Draw the greedy policy implied by the initial values\n",
+        "policy = np.argmax(state_action_values, axis=0).astype(int)\n",
+        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
+        "mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values, rewards = reward_structure)\n",
+        "\n",
+        "# Now let's simulate a single Q-learning step\n",
+        "initial_state = 9\n",
+        "print(\"Initial state = \", initial_state)\n",
+        "new_state, reward, action = markov_decision_process_step(initial_state, transition_probabilities_given_action, reward_structure)\n",
+        "print(\"Action = \", action)\n",
+        "print(\"New state = \", new_state)\n",
+        "print(\"Reward = \", reward)\n",
+        "\n",
+        "state_action_values_after = q_learning_step(state_action_values, reward, initial_state, new_state, action, gamma)\n",
+        "print(\"Your value:\",state_action_values_after[action, initial_state])\n",
+        "print(\"True value:  0.27650262412468796\")\n",
+        "\n",
+        "# Redraw after the update.  The greedy policy is recomputed from the updated values\n",
+        "# so that the arrows and the numbers in the picture agree with each other.\n",
+        "policy = np.argmax(state_action_values_after, axis=0).astype(int)\n",
+        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
+        "mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values_after, rewards = reward_structure)\n"
+      ],
+      "metadata": {
+        "id": "Fu5_VjvbSwfJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "Now let's run this for a while and watch the policy improve"
+      ],
+      "metadata": {
+        "id": "Ogh0qucmb68J"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Initialize the state-action values to random numbers\n",
+        "np.random.seed(0)\n",
+        "n_state = transition_probabilities_given_action.shape[0]\n",
+        "n_action = transition_probabilities_given_action.shape[2]\n",
+        "state_action_values = np.random.normal(size=(n_action, n_state))\n",
+        "# Hard code termination state of finding fish (the last state)\n",
+        "terminal_state = n_state-1\n",
+        "state_action_values[:,terminal_state] = 3.0\n",
+        "gamma = 0.9\n",
+        "n_iter = 10000\n",
+        "\n",
+        "# Draw the initial setup\n",
+        "policy = np.argmax(state_action_values, axis=0).astype(int)\n",
+        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
+        "mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values, rewards = reward_structure)\n",
+        "\n",
+        "# Start in a random state other than the termination state\n",
+        "state= np.random.randint(n_state-1)\n",
+        "\n",
+        "# Run for a number of iterations\n",
+        "for c_iter in range(n_iter):\n",
+        "  new_state, reward, action = markov_decision_process_step(state, transition_probabilities_given_action, reward_structure)\n",
+        "  state_action_values_after = q_learning_step(state_action_values, reward, state, new_state, action, gamma)\n",
+        "  # If in termination state, reset state randomly\n",
+        "  if new_state==terminal_state:\n",
+        "    state= np.random.randint(n_state-1)\n",
+        "  else:\n",
+        "    state = new_state\n",
+        "  # Carry the updated values into the next iteration\n",
+        "  state_action_values = np.copy(state_action_values_after)\n",
+        "\n",
+        "# Update the policy.  The steps above follow a random policy, so the greedy policy is\n",
+        "# only needed for the final picture and is computed once here.\n",
+        "policy = np.argmax(state_action_values, axis=0).astype(int)\n",
+        "\n",
+        "# Draw the final situation\n",
+        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
+        "mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values, rewards = reward_structure)"
+      ],
+      "metadata": {
+        "id": "qQFhwVqPcCFH"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}