udlbook/Notebooks/Chap19/19_1_Markov_Decision_Processes.ipynb

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyPg3umHnqmIXX6jGe809Nxf",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap19/19_1_Markov_Decision_Processes.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# **Notebook 19.1: Markov Decision Processes**\n",
        "\n",
        "This notebook investigates Markov decision processes as described in section 19.1 of the book.\n",
        "\n",
        "Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
        "\n",
        "Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions."
      ],
      "metadata": {
        "id": "t9vk9Elugvmi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from PIL import Image"
      ],
      "metadata": {
        "id": "OLComQyvCIJ7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Get local copies of components of images\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Empty.png\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Hole.png\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Fish.png\n",
        "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Penguin.png"
      ],
      "metadata": {
        "id": "ZsvrUszPLyEG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Ugly class that takes care of drawing pictures like in the book.\n",
        "# You can totally ignore this code!\n",
        "class DrawMDP:\n",
        "  # Constructor initializes parameters\n",
        "  def __init__(self, n_row, n_col):\n",
        "    self.empty_image = np.asarray(Image.open('Empty.png'))\n",
        "    self.hole_image = np.asarray(Image.open('Hole.png'))\n",
        "    self.fish_image = np.asarray(Image.open('Fish.png'))\n",
        "    self.penguin_image = np.asarray(Image.open('Penguin.png'))\n",
        "    self.fig,self.ax = plt.subplots()\n",
        "    self.n_row = n_row\n",
        "    self.n_col = n_col\n",
        "\n",
        "    my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 
'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
        "    my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
        "    r = np.floor(my_colormap_vals_dec/(256*256))\n",
        "    g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
        "    b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
        "    self.colormap = np.vstack((r,g,b)).transpose()/255.0\n",
        "\n",
        "\n",
        "  def draw_text(self, text, row, col, position, color):\n",
        "    if position == 'bc':\n",
        "      self.ax.text( 83*col+41,83 * (row+1) -10, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n",
        "    if position == 'tl':\n",
        "      self.ax.text( 83*col+5,83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"left\", color=color, fontweight='bold')\n",
        "\n",
        "  # Draws a set of states\n",
        "  def draw_path(self, path, color1, color2):\n",
        "    for i in range(len(path)-1):\n",
        "      row_start = np.floor(path[i]/self.n_col)\n",
        "      row_end = np.floor(path[i+1]/self.n_col)\n",
        "      col_start = path[i] - row_start * self.n_col\n",
        "      col_end = path[i+1] - row_end * self.n_col\n",
        "\n",
        "      color_index = int(np.floor(255 * i/(len(path)-1.)))\n",
        "      self.ax.plot([col_start * 83+41 + i, col_end * 83+41 + i ],[row_start * 83+41 +  i, row_end * 83+41 + i ], color=(self.colormap[color_index,0],self.colormap[color_index,1],self.colormap[color_index,2]))\n",
        "\n",
        "\n",
        "  # Draw deterministic policy\n",
        "  def draw_deterministic_policy(self,i, action):\n",
        "      # Draws one filled triangular arrow in the middle of cell i.\n",
        "      #   i      -- index of the state (cell); its row and column are derived using self.n_col\n",
        "      #   action -- 0 = up, 1 = right, 2 = down, 3 = left\n",
        "      # Raises ValueError for any other action, instead of failing later at the\n",
        "      # fill call because triangle_indices was never assigned.\n",
        "      row = np.floor(i/self.n_col)\n",
        "      col = i - row * self.n_col\n",
        "      # Cells are 83 pixels square, so +41 moves to the middle of the cell\n",
        "      center_x = 83 * col + 41\n",
        "      center_y = 83 * row + 41\n",
        "      arrow_base_width = 10\n",
        "      arrow_height = 15\n",
        "      # Draw arrow pointing upward (the tip has the smaller y coordinate)\n",
        "      if action ==0:\n",
        "          triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
        "                              [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
        "                              [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
        "      # Draw arrow pointing right\n",
        "      elif action ==1:\n",
        "          triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
        "                              [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
        "                              [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      # Draw arrow pointing downward\n",
        "      elif action ==2:\n",
        "          triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
        "                              [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
        "                              [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
        "      # Draw arrow pointing left\n",
        "      elif action ==3:\n",
        "          triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
        "                              [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
        "                              [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      else:\n",
        "          raise ValueError('action must be 0 (up), 1 (right), 2 (down) or 3 (left), but got %r'%(action,))\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "  # Draw stochastic policy\n",
        "  def draw_stochastic_policy(self,i, action_probs):\n",
        "      row = np.floor(i/self.n_col)\n",
        "      col = i - row * self.n_col\n",
        "      offset = 20\n",
        "      # Draw arrow pointing upward\n",
        "      center_x = 83 * col + 41\n",
        "      center_y = 83 * row + 41 - offset\n",
        "      arrow_base_width = 15 * action_probs[0]\n",
        "      arrow_height = 20 * action_probs[0]\n",
        "      triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
        "                          [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
        "                          [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "      # Draw arrow pointing right\n",
        "      center_x = 83 * col + 41 + offset\n",
        "      center_y = 83 * row + 41\n",
        "      arrow_base_width = 15 * action_probs[1]\n",
        "      arrow_height = 20 * action_probs[1]\n",
        "      triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
        "                          [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
        "                          [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "      # Draw arrow pointing downward\n",
        "      center_x = 83 * col + 41\n",
        "      center_y = 83 * row + 41 +offset\n",
        "      arrow_base_width = 15 * action_probs[2]\n",
        "      arrow_height = 20 * action_probs[2]\n",
        "      triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
        "                          [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
        "                          [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "      # Draw arrow pointing left\n",
        "      center_x = 83 * col + 41 -offset\n",
        "      center_y = 83 * row + 41\n",
        "      arrow_base_width = 15 * action_probs[3]\n",
        "      arrow_height = 20 * action_probs[3]\n",
        "      triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
        "                          [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
        "                          [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
        "      self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
        "\n",
        "\n",
        "\n",
        "\n",
        "  def draw(self, layout, state, draw_state_index= False, rewards=None, policy=None, state_values=None, action_values=None,path1=None, path2 = None):\n",
        "    # Renders the grid world on this drawer's own axes and shows the figure.\n",
        "    #   layout           -- one entry per cell: 0 = empty, 1 = hole, anything else = fish\n",
        "    #   state            -- index of the cell that contains the penguin\n",
        "    #   draw_state_index -- if True, write each cell's index in its top left corner\n",
        "    #   policy           -- optional; one action per cell (deterministic), otherwise it is\n",
        "    #                       indexed as policy[:,i] to get the action probabilities for cell i\n",
        "    #   path1            -- optional sequence of states to draw as a path\n",
        "    # rewards, state_values, action_values and path2 are accepted but not used here.\n",
        "\n",
        "    # Construct the image (each cell is an 83 x 83 pixel tile with 4 channels)\n",
        "    image_out = np.zeros((self.n_row * 83, self.n_col * 83, 4),dtype='uint8')\n",
        "    for c_row in range (self.n_row):\n",
        "      for c_col in range(self.n_col):\n",
        "        if layout[c_row * self.n_col + c_col]==0:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.empty_image\n",
        "        elif layout[c_row * self.n_col + c_col]==1:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.hole_image\n",
        "        else:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.fish_image\n",
        "        # The penguin tile replaces whatever the layout put in this cell\n",
        "        if state == c_row * self.n_col + c_col:\n",
        "          image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.penguin_image\n",
        "\n",
        "    # Draw the image on our own axes (not on whichever axes pyplot considers current),\n",
        "    # so that it always lands in the same place as the text, arrows and path below\n",
        "    self.ax.imshow(image_out)\n",
        "    self.ax.get_xaxis().set_visible(False)\n",
        "    self.ax.get_yaxis().set_visible(False)\n",
        "    self.ax.spines['top'].set_visible(False)\n",
        "    self.ax.spines['right'].set_visible(False)\n",
        "    self.ax.spines['bottom'].set_visible(False)\n",
        "    self.ax.spines['left'].set_visible(False)\n",
        "\n",
        "    if draw_state_index:\n",
        "      for c_cell in range(layout.size):\n",
        "          self.draw_text(\"%d\"%(c_cell), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tl','k')\n",
        "\n",
        "    # Draw the policy as triangles\n",
        "    if policy is not None:\n",
        "        # If the policy is deterministic\n",
        "        if len(policy) == len(layout):\n",
        "          for i in range(len(layout)):\n",
        "            self.draw_deterministic_policy(i, policy[i])\n",
        "        # Else it is stochastic\n",
        "        else:\n",
        "          for i in range(len(layout)):\n",
        "            self.draw_stochastic_policy(i,policy[:,i])\n",
        "\n",
        "\n",
        "    if path1 is not None:\n",
        "      self.draw_path(path1, np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 1.0]))\n",
        "\n",
        "\n",
        "    plt.show()"
      ],
      "metadata": {
        "id": "Gq1HfJsHN3SB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Let's draw the initial situation with the penguin in the top left (state 0)\n",
        "n_rows = 4; n_cols = 4\n",
        "layout = np.zeros(n_rows * n_cols)\n",
        "initial_state = 0\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = initial_state, draw_state_index = True)"
      ],
      "metadata": {
        "id": "eBQ7lTpJQBSe"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Note that the states are indexed from 0 rather than 1 as in the book to make\n",
        "the code neater."
      ],
      "metadata": {
        "id": "P7P40UyMunKb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Define the state transition probabilities\n",
        "# Entry [i,j] is Pr(s_{t+1}=i | s_{t}=j), so each column sums to one\n",
        "transition_probabilities = np.array( \\\n",
        "[[0.00 , 0.33, 0.00, 0.00,  0.33, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.50 , 0.00, 0.33, 0.00,  0.00, 0.25, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.33, 0.00, 0.50,  0.00, 0.00, 0.25, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.33, 0.00,  0.00, 0.00, 0.00, 0.33,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.50 , 0.00, 0.00, 0.00,  0.00, 0.25, 0.00, 0.00,   0.33, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.34, 0.00, 0.00,  0.33, 0.00, 0.25, 0.00,   0.00, 0.25, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.34, 0.00,  0.00, 0.25, 0.00, 0.33,   0.00, 0.00, 0.25, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.50,  0.00, 0.00, 0.25, 0.00,   0.00, 0.00, 0.00, 0.33,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.34, 0.00, 0.00, 0.00,   0.00, 0.25, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.25, 0.00, 0.00,   0.33, 0.00, 0.25, 0.00,   0.00, 0.33, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.25, 0.00,   0.00, 0.25, 0.00, 0.33,   0.00, 0.00, 0.33, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.34,   0.00, 0.00, 0.25, 0.00,   0.00, 0.00, 0.00, 0.50 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.34, 0.00, 0.00, 0.00,   0.00, 0.33, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.25, 0.00, 0.00,   0.50, 0.00, 0.33, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.25, 0.00,   0.00, 0.34, 0.00, 0.50 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.34,   0.00, 0.00, 0.34, 0.00 ],\n",
        "])\n",
        "initial_state = 0"
      ],
      "metadata": {
        "id": "wgFcIi4YQJWI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Define a step from the Markov process"
      ],
      "metadata": {
        "id": "axllRDDuDDLS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def markov_process_step(state, transition_probabilities):\n",
        "  # TODO -- update the state according to the appropriate transition probabilities\n",
        "  # One way to do this is to use np.random.choice\n",
        "  # Replace this line:\n",
        "  new_state = 0\n",
        "\n",
        "\n",
        "  return new_state"
      ],
      "metadata": {
        "id": "FrSZrS67sdbN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Run the Markov process for 10 steps and visualise the results"
      ],
      "metadata": {
        "id": "uTj7rN6LDFXd"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "np.random.seed(0)\n",
        "T = 10\n",
        "states = np.zeros(T, dtype='uint8')\n",
        "states[0] = 0\n",
        "for t in range(T-1):\n",
        "  states[t+1] = markov_process_step(states[t], transition_probabilities)\n",
        "\n",
        "\n",
        "\n",
        "print(\"Your States:\", states)\n",
        "print(\"True States: [ 0  4  8  9 10  9 10  9 13 14]\")\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = states[0], path1=states, draw_state_index = True)"
      ],
      "metadata": {
        "id": "lRIdjagCwP62"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Define one step of a Markov reward process."
      ],
      "metadata": {
        "id": "QLyjyBjjDMin"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def markov_reward_process_step(state, transition_probabilities, reward_structure):\n",
        "\n",
        "    # TODO -- write this function\n",
        "    # Update the state.  Return a reward of +1 if the Penguin lands on the fish\n",
        "    # or zero otherwise.\n",
        "    # Replace this line\n",
        "    new_state = 0; reward = 0\n",
        "\n",
        "\n",
        "    return new_state, reward"
      ],
      "metadata": {
        "id": "YPHSJRKx-pgO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Run the Markov reward process for 10 steps and visualise the results"
      ],
      "metadata": {
        "id": "AIz8QEiRFoCm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Set up the reward structure so it matches figure 19.2\n",
        "reward_structure = np.zeros((16,1))\n",
        "reward_structure[3] = 1; reward_structure[8] = 1; reward_structure[10] = 1\n",
        "\n",
        "# Initialize random numbers\n",
        "np.random.seed(0)\n",
        "T = 10\n",
        "# Set up the states, so the fish are in the same positions as figure 19.2\n",
        "states = np.zeros(T, dtype='uint8')\n",
        "rewards = np.zeros(T, dtype='uint8')\n",
        "\n",
        "states[0] = 0\n",
        "for t in range(T-1):\n",
        "  states[t+1],rewards[t+1] = markov_reward_process_step(states[t], transition_probabilities, reward_structure)\n",
        "\n",
        "print(\"Your States:\", states)\n",
        "print(\"Your Rewards:\", rewards)\n",
        "print(\"True Rewards: [0 0 1 0 1 0 1 0 0 0]\")\n",
        "\n",
        "\n",
        "# Draw the figure\n",
        "layout = np.zeros(n_rows * n_cols)\n",
        "layout[3] = 2; layout[8] = 2 ; layout[10] = 2\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = states[0], path1=states, draw_state_index = True)"
      ],
      "metadata": {
        "id": "0p1gCpGoFn4M"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now let's calculate the return -- the sum of discounted future rewards"
      ],
      "metadata": {
        "id": "lyz47NWrITfj"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def calculate_return(rewards, gamma):\n",
        "  # TODO -- you write this function\n",
        "  # It should compute one return for the start of the sequence (i.e. G_1)\n",
        "  # Replace this line\n",
        "  return_val = 0.0\n",
        "\n",
        "\n",
        "  return return_val"
      ],
      "metadata": {
        "id": "4fEuBRPnFm_N"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "gamma = 0.9\n",
        "for t in range(len(states)):\n",
        "  print(\"Return at time %d = %3.3f\"%(t, calculate_return(rewards[t:],gamma)))\n",
        "\n",
        "# Reality check!\n",
        "print(\"True return at time 0: 1.998\")"
      ],
      "metadata": {
        "id": "o19lQgM3JrOz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now let's define the state transition function $Pr(s_{t+1}|s_{t},a)$ in full where $a$ is the action.  Here $a=0$ means try to go upward, $a=1$ right, $a=2$ down, and $a=3$ left.  However, the ice is slippery, so we don't always go in the direction we want to.\n",
        "\n",
        "Note that, as for the states, we've indexed the actions from zero (unlike in the book) so that they map to the indices of arrays better."
      ],
      "metadata": {
        "id": "Fhc6DzZNOjiC"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "transition_probabilities_given_action1 = np.array(\\\n",
        "[[0.00 , 0.33, 0.00, 0.00,  0.50, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.50 , 0.00, 0.33, 0.00,  0.00, 0.50, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.33, 0.00, 0.50,  0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.33, 0.00,  0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.50 , 0.00, 0.00, 0.00,  0.00, 0.17, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.34, 0.00, 0.00,  0.25, 0.00, 0.17, 0.00,   0.00, 0.50, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.34, 0.00,  0.00, 0.17, 0.00, 0.25,   0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.50,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.17, 0.00, 0.00,   0.75, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.16, 0.00, 0.00,   0.25, 0.00, 0.17, 0.00,   0.00, 0.50, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.16, 0.00,   0.00, 0.17, 0.00, 0.25,   0.00, 0.00, 0.50, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.75 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.16, 0.00, 0.00,   0.25, 0.00, 0.25, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.16, 0.00,   0.00, 0.25, 0.00, 0.25 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.25, 0.00 ],\n",
        "])\n",
        "\n",
        "transition_probabilities_given_action2 = np.array(\\\n",
        "[[0.00 , 0.25, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.75 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.50, 0.00, 0.50,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.50, 0.00,  0.00, 0.00, 0.00, 0.33,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.25 , 0.00, 0.00, 0.00,  0.00, 0.17, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.25, 0.00, 0.00,  0.50, 0.00, 0.17, 0.00,   0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.25, 0.00,  0.00, 0.50, 0.00, 0.33,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.50,  0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.33,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.17, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.16, 0.00, 0.00,   0.50, 0.00, 0.17, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.16, 0.00,   0.00, 0.50, 0.00, 0.33,   0.00, 0.00, 0.25, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.34,   0.00, 0.00, 0.50, 0.00,   0.00, 0.00, 0.00, 0.50 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.16, 0.00, 0.00,   0.75, 0.00, 0.25, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.16, 0.00,   0.00, 0.50, 0.00, 0.50 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.34,   0.00, 0.00, 0.50, 0.00 ],\n",
        "])\n",
        "\n",
        "transition_probabilities_given_action3 = np.array(\\\n",
        "[[0.00 , 0.25, 0.00, 0.00,  0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.25 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.25, 0.00, 0.25,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.25, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.75 , 0.00, 0.00, 0.00,  0.00, 0.17, 0.00, 0.00,   0.25, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.50, 0.00, 0.00,  0.25, 0.00, 0.17, 0.00,   0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.50, 0.00,  0.00, 0.16, 0.00, 0.25,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.75,  0.00, 0.00, 0.16, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.50, 0.00, 0.00, 0.00,   0.00, 0.17, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.50, 0.00, 0.00,   0.25, 0.00, 0.17, 0.00,   0.00, 0.33, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.50, 0.00,   0.00, 0.16, 0.00, 0.25,   0.00, 0.00, 0.33, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.16, 0.00,   0.00, 0.00, 0.00, 0.50 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00,   0.00, 0.33, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.50, 0.00, 0.00,   0.50, 0.00, 0.33, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.50, 0.00,   0.00, 0.34, 0.00, 0.50 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.50,   0.00, 0.00, 0.34, 0.00 ],\n",
        "])\n",
        "\n",
        "transition_probabilities_given_action4 = np.array(\\\n",
        "[[0.00 , 0.25, 0.00, 0.00,  0.33, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.50 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.50, 0.00, 0.75,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.50, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.50 , 0.00, 0.00, 0.00,  0.00, 0.50, 0.00, 0.00,   0.33, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.25, 0.00, 0.00,  0.33, 0.00, 0.50, 0.00,   0.00, 0.17, 0.00, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.25, 0.00,  0.00, 0.17, 0.00, 0.50,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.25,  0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.34, 0.00, 0.00, 0.00,   0.00, 0.50, 0.00, 0.00,   0.50, 0.00, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.16, 0.00, 0.00,   0.33, 0.00, 0.50, 0.00,   0.00, 0.25, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.16, 0.00,   0.00, 0.17, 0.00, 0.50,   0.00, 0.00, 0.25, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.17, 0.00,   0.00, 0.00, 0.00, 0.25 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.34, 0.00, 0.00, 0.00,   0.00, 0.50, 0.00, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.16, 0.00, 0.00,   0.50, 0.00, 0.50, 0.00 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.16, 0.00,   0.00, 0.25, 0.00, 0.75 ],\n",
        " [0.00 , 0.00, 0.00, 0.00,  0.00, 0.00, 0.00, 0.00,   0.00, 0.00, 0.00, 0.25,   0.00, 0.00, 0.25, 0.00 ],\n",
        "])\n",
        "\n",
        "# Store all of these in a three dimension array\n",
        "# Pr(s_{t+1}=2|s_{t}=1, a_{t}=3) is stored at position [2,1,3]\n",
        "transition_probabilities_given_action = np.concatenate((np.expand_dims(transition_probabilities_given_action1,2),\n",
        "                                                        np.expand_dims(transition_probabilities_given_action2,2),\n",
        "                                                        np.expand_dims(transition_probabilities_given_action3,2),\n",
        "                                                        np.expand_dims(transition_probabilities_given_action4,2)),axis=2)"
      ],
      "metadata": {
        "id": "l7rT78BbOgTi"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Now we need a policy.  Let's start with the deterministic policy in figure 19.5a:\n",
        "policy = [2,2,1,1, 2,1,1,1, 1,1,0,2, 1,0,1,1]\n",
        "\n",
        "# Let's draw the policy first\n",
        "layout = np.zeros(n_rows * n_cols)\n",
        "layout[15] = 2\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = states[0], policy = policy, draw_state_index = True)"
      ],
      "metadata": {
        "id": "8jWhDlkaKj7Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def markov_decision_process_step_deterministic(state, transition_probabilities_given_action, reward_structure, policy):\n",
        "  # TODO -- complete this function.\n",
        "  # For each state, there is a corresponding action.\n",
        "  # Draw the next state based on the current state and that action\n",
        "  # and calculate the reward\n",
        "  # Replace this line:\n",
        "  new_state = 0; reward = 0;\n",
        "\n",
        "  return new_state, reward\n"
      ],
      "metadata": {
        "id": "dueNbS2SUVUK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Set up the reward structure so it matches figure 19.2\n",
        "reward_structure = np.zeros((16,1))\n",
        "reward_structure[15] = 1\n",
        "\n",
        "# Initialize random number seed\n",
        "np.random.seed(3)\n",
        "T = 10\n",
        "# Allocate arrays to record the state and reward at each of the T time steps\n",
        "states = np.zeros(T, dtype='uint8')\n",
        "rewards = np.zeros(T, dtype='uint8')\n",
        "\n",
        "states[0] = 0\n",
        "for t in range(T-1):\n",
        "  states[t+1],rewards[t+1] = markov_decision_process_step_deterministic(states[t], transition_probabilities_given_action, reward_structure, policy)\n",
        "\n",
        "print(\"Your States:\", states)\n",
        "print(\"True States: [ 0  4  8  9 13 14 15 11  7  3]\")\n",
        "print(\"Your Rewards:\", rewards)\n",
        "print(\"True Rewards: [0 0 0 0 0 0 1 0 0 0]\")\n",
        "\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = states[0], path1=states, policy = policy, draw_state_index = True)"
      ],
      "metadata": {
        "id": "4Du5aUfd2Lci"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "You can see that the penguin usually follows the policy (it heads in the direction of the cyan arrows when it can).  But sometimes, the penguin \"slips\" to a different neighboring state.\n",
        "\n",
        "Now let's investigate a stochastic policy"
      ],
      "metadata": {
        "id": "bLEd8xug33b-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "np.random.seed(0)\n",
        "# Let's now choose a random policy.  We'll generate a set of random numbers and pass\n",
        "# them through a softmax function\n",
        "stochastic_policy = np.random.normal(size=(4,n_rows*n_cols))\n",
        "stochastic_policy = np.exp(stochastic_policy) / (np.ones((4,1))@ np.expand_dims(np.sum(np.exp(stochastic_policy), axis=0),0))\n",
        "np.set_printoptions(precision=2)\n",
        "print(stochastic_policy)\n",
        "\n",
        "# Let's draw the policy first\n",
        "layout = np.zeros(n_rows * n_cols)\n",
        "layout[15] = 2\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = states[0], path1=states, policy = stochastic_policy, draw_state_index = True)"
      ],
      "metadata": {
        "id": "o7T0b3tyilDc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def markov_decision_process_step_stochastic(state, transition_probabilities_given_action, reward_structure, stochastic_policy):\n",
        "  # TODO -- complete this function.\n",
        "  # For each state, there is a corresponding distribution over actions\n",
        "  # Draw a sample from that distribution to get the action\n",
        "  # Draw the next state based on the current state and that action\n",
        "  # and calculate the reward\n",
        "  # Replace this line:\n",
        "  new_state = 0; reward = 0;action = 0\n",
        "\n",
        "\n",
        "\n",
        "  return new_state, reward, action"
      ],
      "metadata": {
        "id": "T68mTZSe6A3w"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Set up the reward structure so it matches figure 19.2\n",
        "reward_structure = np.zeros((16,1))\n",
        "reward_structure[15] = 1\n",
        "\n",
        "# Initialize random number seed\n",
        "np.random.seed(0)\n",
        "T = 10\n",
        "# Allocate arrays to record the states, rewards, and actions along the trajectory\n",
        "states = np.zeros(T, dtype='uint8')\n",
        "rewards = np.zeros(T, dtype='uint8')\n",
        "actions = np.zeros(T-1, dtype='uint8')\n",
        "\n",
        "states[0] = 0\n",
        "for t in range(T-1):\n",
        "  states[t+1],rewards[t+1],actions[t] = markov_decision_process_step_stochastic(states[t], transition_probabilities_given_action, reward_structure, stochastic_policy)\n",
        "\n",
        "print(\"Actions\", actions)\n",
        "print(\"Your States:\", states)\n",
        "print(\"Your Rewards:\", rewards)\n",
        "\n",
        "mdp_drawer = DrawMDP(n_rows, n_cols)\n",
        "mdp_drawer.draw(layout, state = states[0], path1=states, policy = stochastic_policy, draw_state_index = True)"
      ],
      "metadata": {
        "id": "hMRVYX2HtqMg"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}