diff --git a/Notebooks/Chap19/19_1_Markov_Decision_Processes.ipynb b/Notebooks/Chap19/19_1_Markov_Decision_Processes.ipynb index c5ac07f..c5cd6d6 100644 --- a/Notebooks/Chap19/19_1_Markov_Decision_Processes.ipynb +++ b/Notebooks/Chap19/19_1_Markov_Decision_Processes.ipynb @@ -4,7 +4,7 @@ "metadata": { "colab": { "provenance": [], - "authorship_tag": "ABX9TyMForqbtn4usiIlRAenjCfh", + "authorship_tag": "ABX9TyPg3umHnqmIXX6jGe809Nxf", "include_colab_link": true }, "kernelspec": { @@ -46,13 +46,691 @@ "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", - "\n" + "from PIL import Image" ], "metadata": { "id": "OLComQyvCIJ7" }, "execution_count": null, "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Get local copies of components of images\n", + "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Empty.png\n", + "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Hole.png\n", + "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Fish.png\n", + "!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Penguin.png" + ], + "metadata": { + "id": "ZsvrUszPLyEG" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Ugly class that takes care of drawing pictures like in the book.\n", + "# You can totally ignore this code!\n", + "class DrawMDP:\n", + " # Constructor initializes parameters\n", + " def __init__(self, n_row, n_col):\n", + " self.empty_image = np.asarray(Image.open('Empty.png'))\n", + " self.hole_image = np.asarray(Image.open('Hole.png'))\n", + " self.fish_image = np.asarray(Image.open('Fish.png'))\n", + " self.penguin_image = np.asarray(Image.open('Penguin.png'))\n", + " self.fig,self.ax = plt.subplots()\n", + " self.n_row = n_row\n", + " self.n_col = n_col\n", + "\n", + " my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', 
'330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 
'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n", + " my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n", + " r = np.floor(my_colormap_vals_dec/(256*256))\n", + " g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n", + " b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n", + " self.colormap = np.vstack((r,g,b)).transpose()/255.0\n", + "\n", + "\n", + " def draw_text(self, text, row, col, position, color):\n", + " if position == 'bc':\n", + " self.ax.text( 83*col+41,83 * (row+1) -10, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n", + " if position == 'tl':\n", + " self.ax.text( 83*col+5,83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"left\", color=color, fontweight='bold')\n", + "\n", + " # Draws a set of states\n", + " def draw_path(self, path, color1, color2):\n", + " for i in range(len(path)-1):\n", + " row_start = np.floor(path[i]/self.n_col)\n", + " row_end = np.floor(path[i+1]/self.n_col)\n", + " col_start = path[i] - row_start * self.n_col\n", + " col_end = path[i+1] - row_end * self.n_col\n", + "\n", + " color_index = int(np.floor(255 * i/(len(path)-1.)))\n", + " self.ax.plot([col_start * 83+41 + i, col_end * 83+41 + i ],[row_start * 83+41 + i, row_end * 83+41 + i ], color=(self.colormap[color_index,0],self.colormap[color_index,1],self.colormap[color_index,2]))\n", + "\n", + "\n", + " # Draw deterministic policy\n", + " def draw_deterministic_policy(self,i, action):\n", + " row = np.floor(i/self.n_col)\n", + " col = i - row * 
self.n_col\n", + " center_x = 83 * col + 41\n", + " center_y = 83 * row + 41\n", + " arrow_base_width = 10\n", + " arrow_height = 15\n", + " # Draw arrow pointing upward\n", + " if action ==0:\n", + " triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n", + " [center_x - arrow_base_width/2, center_y+arrow_height/2],\n", + " [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n", + " # Draw arrow pointing right\n", + " if action ==1:\n", + " triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n", + " [center_x - arrow_height/2, center_y-arrow_base_width/2],\n", + " [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n", + " # Draw arrow pointing downward\n", + " if action ==2:\n", + " triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n", + " [center_x - arrow_base_width/2, center_y-arrow_height/2],\n", + " [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n", + " # Draw arrow pointing left\n", + " if action ==3:\n", + " triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n", + " [center_x + arrow_height/2, center_y-arrow_base_width/2],\n", + " [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n", + " self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n", + "\n", + " # Draw stochastic policy\n", + " def draw_stochastic_policy(self,i, action_probs):\n", + " row = np.floor(i/self.n_col)\n", + " col = i - row * self.n_col\n", + " offset = 20\n", + " # Draw arrow pointing upward\n", + " center_x = 83 * col + 41\n", + " center_y = 83 * row + 41 - offset\n", + " arrow_base_width = 15 * action_probs[0]\n", + " arrow_height = 20 * action_probs[0]\n", + " triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n", + " [center_x - arrow_base_width/2, center_y+arrow_height/2],\n", + " [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n", + " self.ax.fill(triangle_indices[:,0], 
triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n", + "\n", + " # Draw arrow pointing right\n", + " center_x = 83 * col + 41 + offset\n", + " center_y = 83 * row + 41\n", + " arrow_base_width = 15 * action_probs[1]\n", + " arrow_height = 20 * action_probs[1]\n", + " triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n", + " [center_x - arrow_height/2, center_y-arrow_base_width/2],\n", + " [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n", + " self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n", + "\n", + " # Draw arrow pointing downward\n", + " center_x = 83 * col + 41\n", + " center_y = 83 * row + 41 +offset\n", + " arrow_base_width = 15 * action_probs[2]\n", + " arrow_height = 20 * action_probs[2]\n", + " triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n", + " [center_x - arrow_base_width/2, center_y-arrow_height/2],\n", + " [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n", + " self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n", + "\n", + " # Draw arrow pointing left\n", + " center_x = 83 * col + 41 -offset\n", + " center_y = 83 * row + 41\n", + " arrow_base_width = 15 * action_probs[3]\n", + " arrow_height = 20 * action_probs[3]\n", + " triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n", + " [center_x + arrow_height/2, center_y-arrow_base_width/2],\n", + " [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n", + " self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n", + "\n", + "\n", + "\n", + "\n", + " def draw(self, layout, state, draw_state_index= False, rewards=None, policy=None, state_values=None, action_values=None,path1=None, path2 = None):\n", + " # Construct the image\n", + " image_out = np.zeros((self.n_row * 83, self.n_col * 83, 4),dtype='uint8')\n", + " 
for c_row in range (self.n_row):\n", + " for c_col in range(self.n_col):\n", + " if layout[c_row * self.n_col + c_col]==0:\n", + " image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.empty_image\n", + " elif layout[c_row * self.n_col + c_col]==1:\n", + " image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.hole_image\n", + " else:\n", + " image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.fish_image\n", + " if state == c_row * self.n_col + c_col:\n", + " image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.penguin_image\n", + "\n", + " # Draw the image\n", + " plt.imshow(image_out)\n", + " self.ax.get_xaxis().set_visible(False)\n", + " self.ax.get_yaxis().set_visible(False)\n", + " self.ax.spines['top'].set_visible(False)\n", + " self.ax.spines['right'].set_visible(False)\n", + " self.ax.spines['bottom'].set_visible(False)\n", + " self.ax.spines['left'].set_visible(False)\n", + "\n", + " if draw_state_index:\n", + " for c_cell in range(layout.size):\n", + " self.draw_text(\"%d\"%(c_cell), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tl','k')\n", + "\n", + " # Draw the policy as triangles\n", + " if policy is not None:\n", + " # If the policy is deterministic\n", + " if len(policy) == len(layout):\n", + " for i in range(len(layout)):\n", + " self.draw_deterministic_policy(i, policy[i])\n", + " # Else it is stochastic\n", + " else:\n", + " for i in range(len(layout)):\n", + " self.draw_stochastic_policy(i,policy[:,i])\n", + "\n", + "\n", + " if path1 is not None:\n", + " # self.draw_path(path1, np.array([0.81, 0.51, 0.38]), np.array([1.0, 0.2, 0.5]))\n", + " self.draw_path(path1, np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 1.0]))\n", + "\n", + "\n", + " plt.show()" + ], + "metadata": { + "id": "Gq1HfJsHN3SB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Let's draw the initial situation with the penguin in the top left\n", + "n_rows = 4; 
n_cols = 4\n", + "layout = np.zeros(n_rows * n_cols)\n", + "initial_state = 0\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = initial_state, draw_state_index = True)" + ], + "metadata": { + "id": "eBQ7lTpJQBSe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Note that the states are indexed from 0 rather than 1 as in the book to make\n", + "the code neater." + ], + "metadata": { + "id": "P7P40UyMunKb" + } + }, + { + "cell_type": "code", + "source": [ + "# Define the state probabilities\n", + "transition_probabilities = np.array( \\\n", + "[[0.00 , 0.33, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.50 , 0.00, 0.33, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.33, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.50 , 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.34, 0.00, 0.00, 0.33, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.34, 0.00, 0.00, 0.25, 0.00, 0.33, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00, 0.33, 0.00, 0.00, 0.33, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 
0.00, 0.00, 0.33, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.34, 0.00, 0.50 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.34, 0.00 ],\n", + "])\n", + "initial_state = 0" + ], + "metadata": { + "id": "wgFcIi4YQJWI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Define a step from the Markov process" + ], + "metadata": { + "id": "axllRDDuDDLS" + } + }, + { + "cell_type": "code", + "source": [ + "def markov_process_step(state, transition_probabilities):\n", + " # TODO -- update the state according to the appropriate transition probabilities\n", + " # One way to do this is to use np.random.choice\n", + " # Replace this line:\n", + " new_state = 0\n", + "\n", + "\n", + " return new_state" + ], + "metadata": { + "id": "FrSZrS67sdbN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Run the Markov process for 10 steps and visualise the results" + ], + "metadata": { + "id": "uTj7rN6LDFXd" + } + }, + { + "cell_type": "code", + "source": [ + "np.random.seed(0)\n", + "T = 10\n", + "states = np.zeros(T, dtype='uint8')\n", + "states[0] = 0\n", + "for t in range(T-1):\n", + " states[t+1] = markov_process_step(states[t], transition_probabilities)\n", + "\n", + "\n", + "\n", + "print(\"Your States:\", states)\n", + "print(\"True States: [ 0 4 8 9 10 9 10 9 13 14]\")\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = states[0], path1=states, draw_state_index = True)" + ], + "metadata": { + "id": "lRIdjagCwP62" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Define one step of a Markov reward process." 
+ ], + "metadata": { + "id": "QLyjyBjjDMin" + } + }, + { + "cell_type": "code", + "source": [ + "def markov_reward_process_step(state, transition_probabilities, reward_structure):\n", + "\n", + " # TODO -- write this function\n", + " # Update the state. Return a reward of +1 if the Penguin lands on the fish\n", + " # or zero otherwise.\n", + " # Replace this line\n", + " new_state = 0; reward = 0\n", + "\n", + "\n", + " return new_state, reward" + ], + "metadata": { + "id": "YPHSJRKx-pgO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Run the Markov reward process for 10 steps and visualise the results" + ], + "metadata": { + "id": "AIz8QEiRFoCm" + } + }, + { + "cell_type": "code", + "source": [ + "# Set up the reward structure so it matches figure 19.2\n", + "reward_structure = np.zeros((16,1))\n", + "reward_structure[3] = 1; reward_structure[8] = 1; reward_structure[10] = 1\n", + "\n", + "# Initialize random numbers\n", + "np.random.seed(0)\n", + "T = 10\n", + "# Set up the states, so the fish are in the same positions as figure 19.2\n", + "states = np.zeros(T, dtype='uint8')\n", + "rewards = np.zeros(T, dtype='uint8')\n", + "\n", + "states[0] = 0\n", + "for t in range(T-1):\n", + " states[t+1],rewards[t+1] = markov_reward_process_step(states[t], transition_probabilities, reward_structure)\n", + "\n", + "print(\"Your States:\", states)\n", + "print(\"Your Rewards:\", rewards)\n", + "print(\"True Rewards: [0 0 1 0 1 0 1 0 0 0]\")\n", + "\n", + "\n", + "# Draw the figure\n", + "layout = np.zeros(n_rows * n_cols)\n", + "layout[3] = 2; layout[8] = 2 ; layout[10] = 2\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = states[0], path1=states, draw_state_index = True)" + ], + "metadata": { + "id": "0p1gCpGoFn4M" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's calculate the return -- the sum of discounted future rewards" + 
], + "metadata": { + "id": "lyz47NWrITfj" + } + }, + { + "cell_type": "code", + "source": [ + "def calculate_return(rewards, gamma):\n", + " # TODO -- you write this function\n", + " # It should compute one return for the start of the sequence (i.e. G_1)\n", + " # Replace this line\n", + " return_val = 0.0\n", + "\n", + "\n", + " return return_val" + ], + "metadata": { + "id": "4fEuBRPnFm_N" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "gamma = 0.9\n", + "for t in range(len(states)):\n", + " print(\"Return at time %d = %3.3f\"%(t, calculate_return(rewards[t:],gamma)))\n", + "\n", + "# Reality check!\n", + "print(\"True return at time 0: 1.998\")" + ], + "metadata": { + "id": "o19lQgM3JrOz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's define the state transition function $Pr(s_{t+1}|s_{t},a)$ in full where $a$ is the action. Here $a=0$ means try to go upward, $a=1$ right, $a=2$ down and $a=3$ left. 
However, the ice is slippery, so we don't always go the direction we want to.\n", + "\n", + "Note that as for the states, we've indexed the actions from zero (unlike in the book, so they map to the indices of arrays better)" + ], + "metadata": { + "id": "Fhc6DzZNOjiC" + } + }, + { + "cell_type": "code", + "source": [ + "transition_probabilities_given_action1 = np.array(\\\n", + "[[0.00 , 0.33, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.50 , 0.00, 0.33, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.33, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.50 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.34, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.34, 0.00, 0.00, 0.17, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.75, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.17, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.75 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.25, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.25 ],\n", 
+ " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00 ],\n", + "])\n", + "\n", + "transition_probabilities_given_action2 = np.array(\\\n", + "[[0.00 , 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.75 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.50, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.25 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.25, 0.00, 0.00, 0.50, 0.00, 0.17, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00, 0.00, 0.25, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.75, 0.00, 0.25, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.50 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.50, 0.00 ],\n", + "])\n", + "\n", + "transition_probabilities_given_action3 = np.array(\\\n", + 
"[[0.00 , 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.25 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.25, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.75 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.50, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.75, 0.00, 0.00, 0.16, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.34, 0.00, 0.50 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.34, 0.00 ],\n", + "])\n", + "\n", + "transition_probabilities_given_action4 = np.array(\\\n", + "[[0.00 , 0.25, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.50 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 
0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.50, 0.00, 0.75, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.50 , 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.25, 0.00, 0.00, 0.33, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.33, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.17, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.25 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.50, 0.00 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.75 ],\n", + " [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00 ],\n", + "])\n", + "\n", + "# Store all of these in a three dimension array\n", + "# Pr(s_{t+1}=2|s_{t}=1, a_{t}=3] is stored at position [2,1,3]\n", + "transition_probabilities_given_action = np.concatenate((np.expand_dims(transition_probabilities_given_action1,2),\n", + " np.expand_dims(transition_probabilities_given_action2,2),\n", + " np.expand_dims(transition_probabilities_given_action3,2),\n", + " 
np.expand_dims(transition_probabilities_given_action4,2)),axis=2)" + ], + "metadata": { + "id": "l7rT78BbOgTi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Now we need a policy. Let's start with the deterministic policy in figure 19.5a:\n", + "policy = [2,2,1,1, 2,1,1,1, 1,1,0,2, 1,0,1,1]\n", + "\n", + "# Let's draw the policy first\n", + "layout = np.zeros(n_rows * n_cols)\n", + "layout[15] = 2\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = states[0], policy = policy, draw_state_index = True)" + ], + "metadata": { + "id": "8jWhDlkaKj7Q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def markov_decision_process_step_deterministic(state, transition_probabilities_given_action, reward_structure, policy):\n", + " # TODO -- complete this function.\n", + " # For each state, there is a corresponding action.\n", + " # Draw the next state based on the current state and that action\n", + " # and calculate the reward\n", + " # Replace this line:\n", + " new_state = 0; reward = 0;\n", + "\n", + " return new_state, reward\n" + ], + "metadata": { + "id": "dueNbS2SUVUK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Set up the reward structure so it matches figure 19.5\n", + "reward_structure = np.zeros((16,1))\n", + "reward_structure[15] = 1\n", + "\n", + "# Initialize random number seed\n", + "np.random.seed(3)\n", + "T = 10\n", + "# Set up the states, so the fish are in the same positions as figure 19.5\n", + "states = np.zeros(T, dtype='uint8')\n", + "rewards = np.zeros(T, dtype='uint8')\n", + "\n", + "states[0] = 0\n", + "for t in range(T-1):\n", + " states[t+1],rewards[t+1] = markov_decision_process_step_deterministic(states[t], transition_probabilities_given_action, reward_structure, policy)\n", + "\n", + "print(\"Your States:\", states)\n", + "print(\"True States: [ 0 4 8 9 13 14 15 
11 7 3]\")\n", + "print(\"Your Rewards:\", rewards)\n", + "print(\"True Rewards: [0 0 0 0 0 0 1 0 0 0]\")\n", + "\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = states[0], path1=states, policy = policy, draw_state_index = True)" + ], + "metadata": { + "id": "4Du5aUfd2Lci" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "You can see that the Penguin usually follows the policy (heads in the direction of the cyan arrows when it can). But sometimes, the penguin \"slips\" to a different neighboring state.\n", + "\n", + "Now let's investigate a stochastic policy" + ], + "metadata": { + "id": "bLEd8xug33b-" + } + }, + { + "cell_type": "code", + "source": [ + "np.random.seed(0)\n", + "# Let's now choose a random policy. We'll generate a set of random numbers and pass\n", + "# them through a softmax function\n", + "stochastic_policy = np.random.normal(size=(4,n_rows*n_cols))\n", + "stochastic_policy = np.exp(stochastic_policy) / (np.ones((4,1))@ np.expand_dims(np.sum(np.exp(stochastic_policy), axis=0),0))\n", + "np.set_printoptions(precision=2)\n", + "print(stochastic_policy)\n", + "\n", + "# Let's draw the policy first\n", + "layout = np.zeros(n_rows * n_cols)\n", + "layout[15] = 2\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = states[0], path1=states, policy = stochastic_policy, draw_state_index = True)" + ], + "metadata": { + "id": "o7T0b3tyilDc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def markov_decision_process_step_stochastic(state, transition_probabilities_given_action, reward_structure, stochastic_policy):\n", + " # TODO -- complete this function.\n", + " # For each state, there is a corresponding distribution over actions\n", + " # Draw a sample from that distribution to get the action\n", + " # Draw the next state based on the current state and that action\n", + " # and calculate the 
reward\n", + " # Replace this line:\n", + " new_state = 0; reward = 0;action = 0\n", + "\n", + "\n", + "\n", + " return new_state, reward, action" + ], + "metadata": { + "id": "T68mTZSe6A3w" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Set up the reward structure so it matches figure 19.5\n", + "reward_structure = np.zeros((16,1))\n", + "reward_structure[15] = 1\n", + "\n", + "# Initialize random number seed\n", + "np.random.seed(0)\n", + "T = 10\n", + "# Set up the states, so the fish are in the same positions as figure 19.5\n", + "states = np.zeros(T, dtype='uint8')\n", + "rewards = np.zeros(T, dtype='uint8')\n", + "actions = np.zeros(T-1, dtype='uint8')\n", + "\n", + "states[0] = 0\n", + "for t in range(T-1):\n", + " states[t+1],rewards[t+1],actions[t] = markov_decision_process_step_stochastic(states[t], transition_probabilities_given_action, reward_structure, stochastic_policy)\n", + "\n", + "print(\"Actions\", actions)\n", + "print(\"Your States:\", states)\n", + "print(\"Your Rewards:\", rewards)\n", + "\n", + "mdp_drawer = DrawMDP(n_rows, n_cols)\n", + "mdp_drawer.draw(layout, state = states[0], path1=states, policy = stochastic_policy, draw_state_index = True)" + ], + "metadata": { + "id": "hMRVYX2HtqMg" + }, + "execution_count": null, + "outputs": [] } ] } \ No newline at end of file