{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap19/19_3_Monte_Carlo_Methods.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "t9vk9Elugvmi"
      },
      "source": [
        "# **Notebook 19.3: Monte-Carlo methods**\n",
        "\n",
        "This notebook investigates Monte Carlo methods for  tabular reinforcement learning as described in section 19.3.2 of the book\n",
        "\n",
        "NOTE!  There is a mistake in Figure 19.11 in the first printing of the book, so check the errata to avoid becoming confused.  Apologies!\n",
        "\n",
        "Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
        "\n",
        "Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions.\n",
        "\n",
        "Thanks to [Akshil Patel](https://www.akshilpatel.com) and [Jessica Nicholson](https://jessicanicholson1.github.io) for their help in preparing this notebook."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OLComQyvCIJ7"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from PIL import Image\n",
        "\n",
        "from IPython.display import clear_output\n",
        "from time import sleep"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ZsvrUszPLyEG"
      },
      "outputs": [],
      "source": [
        "# Get local copies of components of images\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Empty.png\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Hole.png\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Fish.png\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Penguin.png"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Gq1HfJsHN3SB"
      },
      "outputs": [],
      "source": [
        "# Ugly class that takes care of drawing pictures like in the book.\n",
        "# You can totally ignore this code!\n",
        "class DrawMDP:\n",
        "  # Constructor initializes parameters\n",
        "  def __init__(self, n_row, n_col):\n",
        "    self.empty_image = np.asarray(Image.open('Empty.png'))\n",
        "    self.hole_image = np.asarray(Image.open('Hole.png'))\n",
        "    self.fish_image = np.asarray(Image.open('Fish.png'))\n",
        "    self.penguin_image = np.asarray(Image.open('Penguin.png'))\n",
        "    self.fig,self.ax = plt.subplots()\n",
        "    self.n_row = n_row\n",
        "    self.n_col = n_col\n",
        "\n",
        "    my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
        "    my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
        "    r = np.floor(my_colormap_vals_dec/(256*256))\n",
        "    g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
        "    b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
        "    self.colormap = np.vstack((r,g,b)).transpose()/255.0\n",
        "\n",
        "\n",
        "  def draw_text(self, text, row, col, position, color):\n",
        "    if position == 'bc':\n",
        "      self.ax.text( 83*col+41,83 * (row+1) -5, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n",
        "    if position == 'tc':\n",
        "      self.ax.text( 83*col+41,83 * (row) +10, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n",
        "    if position == 'lc':\n",
        "      self.ax.text( 83*col+2,83 * (row) +41, text, verticalalignment=\"center\", color=color, fontweight='bold', rotation=90)\n",
        "    if position == 'rc':\n",
        "      self.ax.text( 83*(col+1)-5,83 * (row) +41, text, horizontalalignment=\"right\", verticalalignment=\"center\", color=color, fontweight='bold', rotation=-90)\n",
        "    if position == 'tl':\n",
        "      self.ax.text( 83*col+5,83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"left\", color=color, fontweight='bold')\n",
        "    if position == 'tr':\n",
        "      self.ax.text( 83*(col+1)-5, 83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"right\", color=color, fontweight='bold')\n",
        "\n",
        "  # Draws a set of states\n",
        "  def draw_path(self, path, color1, color2):\n",
        "    for i in range(len(path)-1):\n",
        "      row_start = np.floor(path[i]/self.n_col)\n",
        "      row_end = np.floor(path[i+1]/self.n_col)\n",
        "      col_start = path[i] - row_start * self.n_col\n",
        "      col_end = path[i+1] - row_end * self.n_col\n",
        "\n",
        "      color_index = int(np.floor(255 * i/(len(path)-1.)))\n",
        "      self.ax.plot([col_start * 83+41 + i, col_end * 83+41 + i ],[row_start * 83+41 +  i, row_end * 83+41 + i ], color=(self.colormap[color_index,0],self.colormap[color_index,1],self.colormap[color_index,2]))\n",
        "\n",
        "\n",
        "  # Draw deterministic policy\n",
        "  def draw_deterministic_policy(self,i, action):\n",
        "      row = np.floor(i/self.n_col)\n",
        "      col = i - row * self.n_col\n",
        "      center_x = 83 * col + 41\n",
        "      center_y = 83 * row + 41\n",
        "      arrow_base_width = 10\n",
        "      arrow_height = 15\n",
        "      # Draw arrow pointing upward\n",
        "      if action ==0:\n",
        "          triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
        "                              [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
        "                              [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
        "      # Draw arrow pointing right\n",
        "      if action ==1:\n",
        "          triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
        "                              [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
        "                              [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      # Draw arrow pointing downward\n",
        "      if action ==2:\n",
        "          triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
        "                              [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
        "                              [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
        "      # Draw arrow pointing left\n",
        "      if action ==3:\n",
        "          triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
        "                              [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
        "                              [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "  # Draw stochastic policy\n",
        "  def draw_stochastic_policy(self,i, action_probs):\n",
        "      row = np.floor(i/self.n_col)\n",
        "      col = i - row * self.n_col\n",
        "      offset = 20\n",
        "      # Draw arrow pointing upward\n",
        "      center_x = 83 * col + 41\n",
        "      center_y = 83 * row + 41 - offset\n",
        "      arrow_base_width = 15 * action_probs[0]\n",
        "      arrow_height = 20 * action_probs[0]\n",
        "      triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
        "                          [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
        "                          [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "      # Draw arrow pointing right\n",
        "      center_x = 83 * col + 41 + offset\n",
        "      center_y = 83 * row + 41\n",
        "      arrow_base_width = 15 * action_probs[1]\n",
        "      arrow_height = 20 * action_probs[1]\n",
        "      triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
        "                          [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
        "                          [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "      # Draw arrow pointing downward\n",
        "      center_x = 83 * col + 41\n",
        "      center_y = 83 * row + 41 +offset\n",
        "      arrow_base_width = 15 * action_probs[2]\n",
        "      arrow_height = 20 * action_probs[2]\n",
        "      triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
        "                          [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
        "                          [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "      # Draw arrow pointing left\n",
        "      center_x = 83 * col + 41 -offset\n",
        "      center_y = 83 * row + 41\n",
        "      arrow_base_width = 15 * action_probs[3]\n",
        "      arrow_height = 20 * action_probs[3]\n",
        "      triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
        "                          [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
        "                          [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "\n",
        "  def draw(self, layout, state=None, draw_state_index= False, rewards=None, policy=None, state_values=None, state_action_values=None,path1=None, path2 = None):\n",
        "    # Construct the image\n",
        "    image_out = np.zeros((self.n_row * 83, self.n_col * 83, 4),dtype='uint8')\n",
        "    for c_row in range (self.n_row):\n",
        "      for c_col in range(self.n_col):\n",
        "        if layout[c_row * self.n_col + c_col]==0:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.empty_image\n",
        "        elif layout[c_row * self.n_col + c_col]==1:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.hole_image\n",
        "        else:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.fish_image\n",
        "        if state is not None and state == c_row * self.n_col + c_col:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.penguin_image\n",
        "\n",
        "    # Draw the image\n",
        "    plt.imshow(image_out)\n",
        "    self.ax.get_xaxis().set_visible(False)\n",
        "    self.ax.get_yaxis().set_visible(False)\n",
        "    self.ax.spines['top'].set_visible(False)\n",
        "    self.ax.spines['right'].set_visible(False)\n",
        "    self.ax.spines['bottom'].set_visible(False)\n",
        "    self.ax.spines['left'].set_visible(False)\n",
        "\n",
        "    if draw_state_index:\n",
        "      for c_cell in range(layout.size):\n",
        "          self.draw_text(\"%d\"%(c_cell), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tl','k')\n",
        "\n",
        "    # Draw the policy as triangles\n",
        "    if policy is not None:\n",
        "        # If the policy is deterministic\n",
        "        if len(policy) == len(layout):\n",
        "          for i in range(len(layout)):\n",
        "            self.draw_deterministic_policy(i, policy[i])\n",
        "        # Else it is stochastic\n",
        "        else:\n",
        "          for i in range(len(layout)):\n",
        "            self.draw_stochastic_policy(i,policy[:,i])\n",
        "\n",
        "\n",
        "    if path1 is not None:\n",
        "      self.draw_path(path1, np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 1.0]))\n",
        "\n",
        "    if rewards is not None:\n",
        "        for c_cell in range(layout.size):\n",
        "          self.draw_text(\"%d\"%(rewards[c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tr','r')\n",
        "\n",
        "    if state_values is not None:\n",
        "        for c_cell in range(layout.size):\n",
        "          self.draw_text(\"%2.2f\"%(state_values[c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'bc','black')\n",
        "\n",
        "    if state_action_values is not None:\n",
        "        for c_cell in range(layout.size):\n",
        "          self.draw_text(\"%2.2f\"%(state_action_values[0, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tc','black')\n",
        "          self.draw_text(\"%2.2f\"%(state_action_values[1, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'rc','black')\n",
        "          self.draw_text(\"%2.2f\"%(state_action_values[2, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'bc','black')\n",
        "          self.draw_text(\"%2.2f\"%(state_action_values[3, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'lc','black')\n",
        "\n",
        "\n",
        "\n",
        "    plt.show()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "eBQ7lTpJQBSe"
      },
      "outputs": [],
      "source": [
        "# We're going to work on the problem depicted in figure 19.10a\n",
        "n_rows = 4; n_cols = 4\n",
        "layout = np.zeros(n_rows * n_cols)\n",
        "reward_structure = np.zeros(n_rows * n_cols)\n",
        "layout[9] = 1 ; reward_structure[9] = -2    # Hole\n",
        "layout[10] = 1; reward_structure[10] = -2   # Hole\n",
        "layout[14] = 1; reward_structure[14] = -2   # Hole\n",
        "layout[15] = 2; reward_structure[15] = 3    # Fish\n",
        "initial_state = 0\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = initial_state, rewards=reward_structure, draw_state_index = True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "6Vku6v_se2IG"
      },
      "source": [
        "For clarity, the black numbers are the state number and the red numbers are the reward for being in that state.  Note that the states are indexed from 0 rather than 1 as in the book to make the code neater."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Fhc6DzZNOjiC"
      },
      "source": [
        "Now let's define the state transition function $Pr(s_{t+1}|s_{t},a)$ in full where $a$ is the actions.  Here $a=0$ means try to go upward, $a=1$, right, $a=2$ down and $a=3$ right.  However, the ice is slippery, so we don't always go the direction we want to.\n",
        "\n",
        "Note that as for the states, we've indexed the actions from zero (unlike in the book) so they map to the indices of arrays better"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l7rT78BbOgTi"
      },
      "outputs": [],
      "source": [
        "transition_probabilities_given_action0 = np.array(\\\n",
        "[[0.90, 0.05, 0.00, 0.00,  0.85, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.05, 0.85, 0.05, 0.00,  0.00, 0.85, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.05, 0.85, 0.05,  0.00, 0.00, 0.85, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.05, 0.90,  0.00, 0.00, 0.00, 0.85,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.05, 0.00, 0.00, 0.00,  0.05, 0.05, 0.00, 0.00,  0.85, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.05, 0.00, 0.00,  0.05, 0.00, 0.05, 0.00,  0.00, 0.85, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.05,  0.00, 0.00, 0.85, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.05, 0.05,  0.00, 0.00, 0.00, 0.85,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.05, 0.05, 0.00, 0.00,  0.85, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.05, 0.00, 0.00,  0.05, 0.00, 0.05, 0.00,  0.00, 0.85, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.05,  0.00, 0.00, 0.85, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.05, 0.05,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.10, 0.05, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.05, 0.00, 0.00,  0.05, 0.05, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.05, 0.00,  0.00, 0.05, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.05, 0.00]])\n",
        "\n",
        "\n",
        "transition_probabilities_given_action1 = np.array(\\\n",
        "[[0.10, 0.05, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.85, 0.05, 0.05, 0.00,  0.00, 0.05, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.85, 0.05, 0.05,  0.00, 0.00, 0.05, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.85, 0.90,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.05, 0.00, 0.00, 0.00,  0.05, 0.05, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.05, 0.00, 0.00,  0.85, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.05, 0.00,  0.00, 0.85, 0.00, 0.05,  0.00, 0.00, 0.05, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.85, 0.85,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.05, 0.05, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.05, 0.00, 0.00,  0.85, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.05, 0.00,  0.00, 0.85, 0.00, 0.05,  0.00, 0.00, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.85, 0.85,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.10, 0.05, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.05, 0.00, 0.00,  0.85, 0.05, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.05, 0.00,  0.00, 0.85, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.85, 0.00]])\n",
        "\n",
        "\n",
        "transition_probabilities_given_action2 = np.array(\\\n",
        "[[0.10, 0.05, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.05, 0.05, 0.05, 0.00,  0.00, 0.05, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.05, 0.05, 0.05,  0.00, 0.00, 0.05, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.05, 0.10,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.85, 0.00, 0.00, 0.00,  0.05, 0.05, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.85, 0.00, 0.00,  0.05, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.85, 0.00,  0.00, 0.05, 0.00, 0.05,  0.00, 0.00, 0.05, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.85,  0.00, 0.00, 0.05, 0.05,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.85, 0.00, 0.00, 0.00,  0.05, 0.05, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.85, 0.00, 0.00,  0.05, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.85, 0.00,  0.00, 0.05, 0.00, 0.05,  0.00, 0.00, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.85,  0.00, 0.00, 0.05, 0.05,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.85, 0.00, 0.00, 0.00,  0.90, 0.05, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.85, 0.00, 0.00,  0.05, 0.85, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.85, 0.00,  0.00, 0.05, 0.85, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.85,  0.00, 0.00, 0.05, 0.00]])\n",
        "\n",
        "transition_probabilities_given_action3 = np.array(\\\n",
        "[[0.90, 0.85, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.05, 0.05, 0.85, 0.00,  0.00, 0.05, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.05, 0.05, 0.85,  0.00, 0.00, 0.05, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.05, 0.10,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.05, 0.00, 0.00, 0.00,  0.85, 0.85, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.05, 0.00, 0.00,  0.05, 0.00, 0.85, 0.00,  0.00, 0.05, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.85,  0.00, 0.00, 0.05, 0.00,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.05, 0.05,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.85, 0.85, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.05, 0.00, 0.00,  0.05, 0.00, 0.85, 0.00,  0.00, 0.05, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.05, 0.00,  0.00, 0.05, 0.00, 0.85,  0.00, 0.00, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.05, 0.05,  0.00, 0.00, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.05, 0.00, 0.00, 0.00,  0.90, 0.85, 0.00, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.05, 0.00, 0.00,  0.05, 0.05, 0.85, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.05, 0.00,  0.00, 0.05, 0.05, 0.00],\n",
        " [0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.05,  0.00, 0.00, 0.05, 0.00]])\n",
        "\n",
        "\n",
        "\n",
        "# Store all of these in a three dimension array\n",
        "# Pr(s_{t+1}=2|s_{t}=1, a_{t}=3] is stored at position [2,1,3]\n",
        "transition_probabilities_given_action = np.concatenate((np.expand_dims(transition_probabilities_given_action0,2),\n",
        "                                                        np.expand_dims(transition_probabilities_given_action1,2),\n",
        "                                                        np.expand_dims(transition_probabilities_given_action2,2),\n",
        "                                                        np.expand_dims(transition_probabilities_given_action3,2)),axis=2)\n",
        "\n",
        "print('Grid Size:', len(transition_probabilities_given_action[0]))\n",
        "print()\n",
        "print('Transition Probabilities for when next state = 2:')\n",
        "print(transition_probabilities_given_action[2])\n",
        "print()\n",
        "print('Transitions Probabilities for when next state = 2 and current state = 1')\n",
        "print(transition_probabilities_given_action[2][1])\n",
        "print()\n",
        "print('Transitions Probabilities for  when next state = 2 and current state = 1 and action = 3 (Left):')\n",
        "print(transition_probabilities_given_action[2][1][3])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "BHWjp6Qq4tBF"
      },
      "source": [
        "## Implementation Details\n",
        "\n",
        "We provide the following methods:\n",
        "\n",
        "- **`markov_decision_process_step_stochastic`** - this function selects an action based on the stochastic policy for the current state, updates the state based on the transition probabilities associated with the chosen action, and returns the new state, the reward obtained for the new state, the chosen action, and whether the episode terminates.\n",
        "\n",
        "- **`get_one_episode`** - this function simulates an episode of agent-environment interaction. It returns the states, rewards, and actions seen in that episode, which we can then use to update the agent.\n",
        "\n",
        "- **`calculate_returns`** - this function calls on the **`calculate_return`** function that computes the discounted sum of rewards from a specific step, in a sequence of rewards.\n",
        "\n",
        "You have to implement the following methods:\n",
        "\n",
        "- **`deterministic_policy_to_epsilon_greedy`** - given a deterministic policy, where one action is chosen per state, this function computes the $\\epsilon$-greedy version of that policy, where each of the four actions has some nonzero probability of being selected per state. In each state, the probability of selecting each of the actions should sum to 1.\n",
        "\n",
        "- **`update_policy_mc`** - this function updates the action-value function using the Monte Carlo method.  We use the rollout trajectories collected using `get_one_episode` to calculate the returns. Then update the action values towards the Monte Carlo estimate of the return for each state."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "akjrncMF-FkU"
      },
      "outputs": [],
      "source": [
        "# This takes a single step from an MDP\n",
        "def markov_decision_process_step_stochastic(state, transition_probabilities_given_action, reward_structure, terminal_states, stochastic_policy):\n",
        "  # Pick action\n",
        "  action = np.random.choice(a=np.arange(0,4,1),p=stochastic_policy[:,state])\n",
        "\n",
        "  # Update the state\n",
        "  new_state = np.random.choice(a=np.arange(0,transition_probabilities_given_action.shape[0]),p = transition_probabilities_given_action[:,state,action])\n",
        "  # Return the reward\n",
        "  reward = reward_structure[new_state]\n",
        "  is_terminal = new_state in [terminal_states]\n",
        "\n",
        "  return new_state, reward, action, is_terminal"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "bFYvF9nAloIA"
      },
      "outputs": [],
      "source": [
        "# Run one episode and return actions, rewards, returns\n",
        "def get_one_episode(initial_state, transition_probabilities_given_action, reward_structure, terminal_states, stochastic_policy):\n",
        "\n",
        "    states  = []\n",
        "    rewards = []\n",
        "    actions = []\n",
        "\n",
        "    states.append(initial_state)\n",
        "    state = initial_state\n",
        "\n",
        "    is_terminal = False\n",
        "    # While we haven't reached a terminal state\n",
        "    while not is_terminal:\n",
        "        # Keep stepping through MDP\n",
        "        state, reward, action, is_terminal = markov_decision_process_step_stochastic(state,\n",
        "                                                                                     transition_probabilities_given_action,\n",
        "                                                                                     reward_structure,\n",
        "                                                                                     terminal_states,\n",
        "                                                                                     stochastic_policy)\n",
        "        states.append(state)\n",
        "        rewards.append(reward)\n",
        "        actions.append(action)\n",
        "\n",
        "    states  = np.array(states, dtype=\"uint8\")\n",
        "    rewards = np.array(rewards)\n",
        "    actions = np.array(actions, dtype=\"uint8\")\n",
        "\n",
        "    # If the episode was terminated early, we need to compute the return differently using r_{t+1} + gamma*V(s_{t+1})\n",
        "    return states, rewards, actions"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qJhOrIId4tBF"
      },
      "outputs": [],
      "source": [
        "def visualize_one_episode(states, actions):\n",
        "    # Define actions for visualization\n",
        "    acts = ['up', 'right', 'down', 'left']\n",
        "\n",
        "    # Iterate over the states and actions\n",
        "    for i in range(len(states)):\n",
        "\n",
        "        if i == 0:\n",
        "            print('Starting State:', states[i])\n",
        "\n",
        "        elif i == len(states)-1:\n",
        "            print('Episode Done:', states[i])\n",
        "\n",
        "        else:\n",
        "            print('State', states[i-1])\n",
        "            a = actions[i]\n",
        "            print('Action:', acts[a])\n",
        "            print('Next State:', states[i])\n",
        "\n",
        "        # Visualize the current state using the MDP drawer\n",
        "        mdp_drawer.draw(layout, state=states[i], rewards=reward_structure, draw_state_index=True)\n",
        "        clear_output(True)\n",
        "\n",
        "        # Pause for a short duration to allow observation\n",
        "        sleep(1.5)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_AKwdtQQHzIK"
      },
      "outputs": [],
      "source": [
        "# Convert deterministic policy (1x16) to an epsilon greedy stochastic policy (4x16)\n",
        "def deterministic_policy_to_epsilon_greedy(policy, epsilon=0.2):\n",
        "  # TODO -- write this function\n",
        "  # You should wind up with a 4x16 matrix, with epsilon/3 in every position except the real policy\n",
        "  # The columns should sum to one\n",
        "  # Replace this line:\n",
        "  stochastic_policy = np.ones((4,16)) * 0.25\n",
        "\n",
        "\n",
        "  return stochastic_policy"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OhVXw2Favo-w"
      },
      "source": [
        "Let's try generating an episode"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5zQ1Oh9Zvnwt"
      },
      "outputs": [],
      "source": [
        "# Set seed so random numbers always the same\n",
        "np.random.seed(6)\n",
        "# Print in compact form\n",
        "np.set_printoptions(precision=3)\n",
        "\n",
        "# Let's start with by setting the policy randomly\n",
        "policy = np.random.choice(size= n_rows * n_cols, a=np.arange(0,4,1))\n",
        "\n",
        "# Convert deterministic policy to stochastic\n",
        "stochastic_policy = deterministic_policy_to_epsilon_greedy(policy)\n",
        "\n",
        "print(\"Initial Penguin Policy:\")\n",
        "print(policy)\n",
        "print()\n",
        "print('Stochastic Penguin Policy:')\n",
        "print(stochastic_policy)\n",
        "print()\n",
        "\n",
        "initial_state = 5\n",
        "terminal_states=[15]\n",
        "states, rewards, actions = get_one_episode(initial_state,transition_probabilities_given_action, reward_structure, terminal_states, stochastic_policy)\n",
        "\n",
        "print('Initial Penguin Position:')\n",
        "mdp_drawer.draw(layout, state = initial_state, rewards=reward_structure, draw_state_index = True)\n",
        "\n",
        "print('Total steps to termination:', len(states))\n",
        "print('Final Reward:', np.sum(rewards))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "KJH-UGKk4tBF"
      },
      "outputs": [],
      "source": [
        "#this visualizes the complete episode\n",
        "visualize_one_episode(states, actions)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nl6rtNffwhcU"
      },
      "source": [
        "We'll need to calculate the returns (discounted cumulative reward) for each state action pair"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "FxrItqGPLTq7"
      },
      "outputs": [],
      "source": [
        "def calculate_returns(rewards, gamma):\n",
        "  returns = np.zeros(len(rewards))\n",
        "  for c_return in range(len(returns)):\n",
        "    returns[c_return] = calculate_return(rewards[c_return:], gamma)\n",
        "  return returns\n",
        "\n",
        "def calculate_return(rewards, gamma):\n",
        "  return_val = 0.0\n",
        "  for i in range(len(rewards)):\n",
        "      return_val += rewards[i] * np.power(gamma, i)\n",
        "  return return_val"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DX1KfHRhzUOU"
      },
      "source": [
        "This routine does the main work of the on-policy Monte Carlo method.  We repeatedly rollout episods, calculate the returns.  Then we figure out the average return for each state action pair, and choose the next policy as the action that has greatest state action value at each state."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hCghcKlOJXSM"
      },
      "outputs": [],
      "source": [
        "def update_policy_mc(initial_state, transition_probabilities_given_action, reward_structure, terminal_states, stochastic_policy, gamma, n_rollouts=1):\n",
        "  # Create two matrices to store total returns for each action/state pair and the\n",
        "  # number of times we observed that action/state pair\n",
        "  n_state = transition_probabilities_given_action.shape[0]\n",
        "  n_action = transition_probabilities_given_action.shape[2]\n",
        "  # Contains the total returns seen for taking this action at this state\n",
        "  state_action_returns_total = np.zeros((n_action, n_state))\n",
        "  # Contains the number of times we have taken this action in this state\n",
        "  state_action_count = np.zeros((n_action,n_state))\n",
        "\n",
        "  # For each rollout\n",
        "  for c_rollout in range(n_rollouts):\n",
        "      # TODO -- Complete this function\n",
        "      # 1. Draw a random state from 0 to 14\n",
        "      # 2. Get one episode starting at that state\n",
        "      # 3. Compute the returns\n",
        "      # 4. For each position in the trajectory, update state_action_returns_total and state_action_count\n",
        "      # Replace these two lines\n",
        "      state_action_returns_total[0,1] = state_action_returns_total[0,1]\n",
        "      state_action_count[0,1] = state_action_count[0,1]\n",
        "\n",
        "\n",
        "  # Normalize -- add small number to denominator to avoid divide by zero\n",
        "  state_action_values = state_action_returns_total/( state_action_count+0.00001)\n",
        "  policy = np.argmax(state_action_values, axis=0).astype(int)\n",
        "  return policy, state_action_values\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "8jWhDlkaKj7Q"
      },
      "outputs": [],
      "source": [
        "# Set seed so random numbers always the same\n",
        "np.random.seed(0)\n",
        "# Print in compact form\n",
        "np.set_printoptions(precision=3)\n",
        "\n",
        "# Let's start with by setting the policy randomly\n",
        "policy = np.random.choice(size= n_rows * n_cols, a=np.arange(0,4,1))\n",
        "gamma = 0.9\n",
        "print(\"Initial policy:\")\n",
        "print(policy)\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, policy = policy, rewards = reward_structure)\n",
        "\n",
        "terminal_states = [15]\n",
        "# Track all the policies so we can visualize them later\n",
        "all_policies = []\n",
        "n_policy_update = 2000\n",
        "for c_policy_update in range(n_policy_update):\n",
        "    # Convert policy to stochastic\n",
        "    stochastic_policy = deterministic_policy_to_epsilon_greedy(policy)\n",
        "    # Update policy by Monte Carlo method\n",
        "    policy, state_action_values = update_policy_mc(initial_state, transition_probabilities_given_action, reward_structure, terminal_states, stochastic_policy, gamma, n_rollouts=100)\n",
        "    all_policies.append(policy)\n",
        "\n",
        "    # Print out 10 snapshots of progress\n",
        "    if (c_policy_update % (n_policy_update//10) == 0) or c_policy_update == n_policy_update - 1:\n",
        "        print(\"Updated policy\")\n",
        "        print(policy)\n",
        "        mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "        mdp_drawer.draw(layout, policy = policy, rewards = reward_structure, state_action_values=state_action_values)\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "j7Ny47kTEMzH"
      },
      "source": [
        "You can see a definite improvement to the policy"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.12"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}