Created using Colaboratory
This commit is contained in:
521
Notebooks/Chap19/19_4_Temporal_Difference_Methods.ipynb
Normal file
521
Notebooks/Chap19/19_4_Temporal_Difference_Methods.ipynb
Normal file
@@ -0,0 +1,521 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"authorship_tag": "ABX9TyNEAhORON7DFN1dZMhDK/PO",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap19/19_4_Temporal_Difference_Methods.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# **Notebook 19.4: Temporal difference methods**\n",
|
||||
"\n",
|
||||
"This notebook investigates temporal differnece methods for tabular reinforcement learning as described in section 19.3.3 of the book\n",
|
||||
"\n",
|
||||
"Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
|
||||
"\n",
|
||||
"Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "t9vk9Elugvmi"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from PIL import Image"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "OLComQyvCIJ7"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Get local copies of components of images\n",
|
||||
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Empty.png\n",
|
||||
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Hole.png\n",
|
||||
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Fish.png\n",
|
||||
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Penguin.png"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZsvrUszPLyEG"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Ugly class that takes care of drawing pictures like in the book.\n",
|
||||
"# You can totally ignore this code!\n",
|
||||
"class DrawMDP:\n",
|
||||
" # Constructor initializes parameters\n",
|
||||
" def __init__(self, n_row, n_col):\n",
|
||||
" self.empty_image = np.asarray(Image.open('Empty.png'))\n",
|
||||
" self.hole_image = np.asarray(Image.open('Hole.png'))\n",
|
||||
" self.fish_image = np.asarray(Image.open('Fish.png'))\n",
|
||||
" self.penguin_image = np.asarray(Image.open('Penguin.png'))\n",
|
||||
" self.fig,self.ax = plt.subplots()\n",
|
||||
" self.n_row = n_row\n",
|
||||
" self.n_col = n_col\n",
|
||||
"\n",
|
||||
" my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
|
||||
" my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
|
||||
" r = np.floor(my_colormap_vals_dec/(256*256))\n",
|
||||
" g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
|
||||
" b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
|
||||
" self.colormap = np.vstack((r,g,b)).transpose()/255.0\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" def draw_text(self, text, row, col, position, color):\n",
|
||||
" if position == 'bc':\n",
|
||||
" self.ax.text( 83*col+41,83 * (row+1) -5, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n",
|
||||
" if position == 'tc':\n",
|
||||
" self.ax.text( 83*col+41,83 * (row) +10, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n",
|
||||
" if position == 'lc':\n",
|
||||
" self.ax.text( 83*col+2,83 * (row) +41, text, verticalalignment=\"center\", color=color, fontweight='bold', rotation=90)\n",
|
||||
" if position == 'rc':\n",
|
||||
" self.ax.text( 83*(col+1)-5,83 * (row) +41, text, horizontalalignment=\"right\", verticalalignment=\"center\", color=color, fontweight='bold', rotation=-90)\n",
|
||||
" if position == 'tl':\n",
|
||||
" self.ax.text( 83*col+5,83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"left\", color=color, fontweight='bold')\n",
|
||||
" if position == 'tr':\n",
|
||||
" self.ax.text( 83*(col+1)-5, 83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"right\", color=color, fontweight='bold')\n",
|
||||
"\n",
|
||||
" # Draws a set of states\n",
|
||||
" def draw_path(self, path, color1, color2):\n",
|
||||
" for i in range(len(path)-1):\n",
|
||||
" row_start = np.floor(path[i]/self.n_col)\n",
|
||||
" row_end = np.floor(path[i+1]/self.n_col)\n",
|
||||
" col_start = path[i] - row_start * self.n_col\n",
|
||||
" col_end = path[i+1] - row_end * self.n_col\n",
|
||||
"\n",
|
||||
" color_index = int(np.floor(255 * i/(len(path)-1.)))\n",
|
||||
" self.ax.plot([col_start * 83+41 + i, col_end * 83+41 + i ],[row_start * 83+41 + i, row_end * 83+41 + i ], color=(self.colormap[color_index,0],self.colormap[color_index,1],self.colormap[color_index,2]))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" # Draw deterministic policy\n",
|
||||
" def draw_deterministic_policy(self,i, action):\n",
|
||||
" row = np.floor(i/self.n_col)\n",
|
||||
" col = i - row * self.n_col\n",
|
||||
" center_x = 83 * col + 41\n",
|
||||
" center_y = 83 * row + 41\n",
|
||||
" arrow_base_width = 10\n",
|
||||
" arrow_height = 15\n",
|
||||
" # Draw arrow pointing upward\n",
|
||||
" if action ==0:\n",
|
||||
" triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
|
||||
" [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
|
||||
" [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
|
||||
" # Draw arrow pointing right\n",
|
||||
" if action ==1:\n",
|
||||
" triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
|
||||
" [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||
" [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||
" # Draw arrow pointing downward\n",
|
||||
" if action ==2:\n",
|
||||
" triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
|
||||
" [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
|
||||
" [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
|
||||
" # Draw arrow pointing left\n",
|
||||
" if action ==3:\n",
|
||||
" triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
|
||||
" [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||
" [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||
"\n",
|
||||
" # Draw stochastic policy\n",
|
||||
" def draw_stochastic_policy(self,i, action_probs):\n",
|
||||
" row = np.floor(i/self.n_col)\n",
|
||||
" col = i - row * self.n_col\n",
|
||||
" offset = 20\n",
|
||||
" # Draw arrow pointing upward\n",
|
||||
" center_x = 83 * col + 41\n",
|
||||
" center_y = 83 * row + 41 - offset\n",
|
||||
" arrow_base_width = 15 * action_probs[0]\n",
|
||||
" arrow_height = 20 * action_probs[0]\n",
|
||||
" triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
|
||||
" [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
|
||||
" [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
|
||||
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||
"\n",
|
||||
" # Draw arrow pointing right\n",
|
||||
" center_x = 83 * col + 41 + offset\n",
|
||||
" center_y = 83 * row + 41\n",
|
||||
" arrow_base_width = 15 * action_probs[1]\n",
|
||||
" arrow_height = 20 * action_probs[1]\n",
|
||||
" triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
|
||||
" [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||
" [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||
"\n",
|
||||
" # Draw arrow pointing downward\n",
|
||||
" center_x = 83 * col + 41\n",
|
||||
" center_y = 83 * row + 41 +offset\n",
|
||||
" arrow_base_width = 15 * action_probs[2]\n",
|
||||
" arrow_height = 20 * action_probs[2]\n",
|
||||
" triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
|
||||
" [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
|
||||
" [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
|
||||
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||
"\n",
|
||||
" # Draw arrow pointing left\n",
|
||||
" center_x = 83 * col + 41 -offset\n",
|
||||
" center_y = 83 * row + 41\n",
|
||||
" arrow_base_width = 15 * action_probs[3]\n",
|
||||
" arrow_height = 20 * action_probs[3]\n",
|
||||
" triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
|
||||
" [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||
" [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" def draw(self, layout, state=None, draw_state_index= False, rewards=None, policy=None, state_values=None, state_action_values=None,path1=None, path2 = None):\n",
|
||||
" # Construct the image\n",
|
||||
" image_out = np.zeros((self.n_row * 83, self.n_col * 83, 4),dtype='uint8')\n",
|
||||
" for c_row in range (self.n_row):\n",
|
||||
" for c_col in range(self.n_col):\n",
|
||||
" if layout[c_row * self.n_col + c_col]==0:\n",
|
||||
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.empty_image\n",
|
||||
" elif layout[c_row * self.n_col + c_col]==1:\n",
|
||||
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.hole_image\n",
|
||||
" else:\n",
|
||||
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.fish_image\n",
|
||||
" if state is not None and state == c_row * self.n_col + c_col:\n",
|
||||
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.penguin_image\n",
|
||||
"\n",
|
||||
" # Draw the image\n",
|
||||
" plt.imshow(image_out)\n",
|
||||
" self.ax.get_xaxis().set_visible(False)\n",
|
||||
" self.ax.get_yaxis().set_visible(False)\n",
|
||||
" self.ax.spines['top'].set_visible(False)\n",
|
||||
" self.ax.spines['right'].set_visible(False)\n",
|
||||
" self.ax.spines['bottom'].set_visible(False)\n",
|
||||
" self.ax.spines['left'].set_visible(False)\n",
|
||||
"\n",
|
||||
" if draw_state_index:\n",
|
||||
" for c_cell in range(layout.size):\n",
|
||||
" self.draw_text(\"%d\"%(c_cell), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tl','k')\n",
|
||||
"\n",
|
||||
" # Draw the policy as triangles\n",
|
||||
" if policy is not None:\n",
|
||||
" # If the policy is deterministic\n",
|
||||
" if len(policy) == len(layout):\n",
|
||||
" for i in range(len(layout)):\n",
|
||||
" self.draw_deterministic_policy(i, policy[i])\n",
|
||||
" # Else it is stochastic\n",
|
||||
" else:\n",
|
||||
" for i in range(len(layout)):\n",
|
||||
" self.draw_stochastic_policy(i,policy[:,i])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" if path1 is not None:\n",
|
||||
" self.draw_path(path1, np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 1.0]))\n",
|
||||
"\n",
|
||||
" if rewards is not None:\n",
|
||||
" for c_cell in range(layout.size):\n",
|
||||
" self.draw_text(\"%d\"%(rewards[c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tr','r')\n",
|
||||
"\n",
|
||||
" if state_values is not None:\n",
|
||||
" for c_cell in range(layout.size):\n",
|
||||
" self.draw_text(\"%2.2f\"%(state_values[c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'bc','black')\n",
|
||||
"\n",
|
||||
" if state_action_values is not None:\n",
|
||||
" for c_cell in range(layout.size):\n",
|
||||
" self.draw_text(\"%2.2f\"%(state_action_values[0, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tc','black')\n",
|
||||
" self.draw_text(\"%2.2f\"%(state_action_values[1, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'rc','black')\n",
|
||||
" self.draw_text(\"%2.2f\"%(state_action_values[2, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'bc','black')\n",
|
||||
" self.draw_text(\"%2.2f\"%(state_action_values[3, c_cell]), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'lc','black')\n",
|
||||
"\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Gq1HfJsHN3SB"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# We're going to work on the problem depicted in figure 19.10a\n",
|
||||
"n_rows = 4; n_cols = 4\n",
|
||||
"layout = np.zeros(n_rows * n_cols)\n",
|
||||
"reward_structure = np.zeros(n_rows * n_cols)\n",
|
||||
"layout[9] = 1 ; reward_structure[9] = -2\n",
|
||||
"layout[10] = 1; reward_structure[10] = -2\n",
|
||||
"layout[14] = 1; reward_structure[14] = -2\n",
|
||||
"layout[15] = 2; reward_structure[15] = 3\n",
|
||||
"initial_state = 0\n",
|
||||
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||
"mdp_drawer.draw(layout, state = initial_state, rewards=reward_structure, draw_state_index = True)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "eBQ7lTpJQBSe"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"For clarity, the black numbers are the state number and the red numbers are the reward for being in that state. Note that the states are indexed from 0 rather than 1 as in the book to make the code neater."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "6Vku6v_se2IG"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's define the state transition function $Pr(s_{t+1}|s_{t},a)$ in full where $a$ is the actions. Here $a=0$ means try to go upward, $a=1$, right, $a=2$ down and $a=3$ right. However, the ice is slippery, so we don't always go the direction we want to.\n",
|
||||
"\n",
|
||||
"Note that as for the states, we've indexed the actions from zero (unlike in the book) so they map to the indices of arrays better"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Fhc6DzZNOjiC"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"transition_probabilities_given_action0 = np.array(\\\n",
|
||||
"[[0.00 , 0.33, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.50 , 0.00, 0.33, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.33, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.50 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.34, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.34, 0.00, 0.00, 0.17, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.75, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.17, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.75 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.25, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.25 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"transition_probabilities_given_action1 = np.array(\\\n",
|
||||
"[[0.00 , 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.75 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.50, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.25 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.25, 0.00, 0.00, 0.50, 0.00, 0.17, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.75, 0.00, 0.25, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.50 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.50, 0.00 ],\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"transition_probabilities_given_action2 = np.array(\\\n",
|
||||
"[[0.00 , 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.25 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.25, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.75 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.50, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.75, 0.00, 0.00, 0.16, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.34, 0.00, 0.50 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.34, 0.00 ],\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"transition_probabilities_given_action3 = np.array(\\\n",
|
||||
"[[0.00 , 0.25, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.50 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.50, 0.00, 0.75, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.50 , 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.25, 0.00, 0.00, 0.33, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.33, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.17, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.25 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.50, 0.00 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.75 ],\n",
|
||||
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||
"])\n",
|
||||
"\n",
|
||||
"# Store all of these in a three dimension array\n",
|
||||
"# Pr(s_{t+1}=2|s_{t}=1, a_{t}=3] is stored at position [2,1,3]\n",
|
||||
"transition_probabilities_given_action = np.concatenate((np.expand_dims(transition_probabilities_given_action0,2),\n",
|
||||
" np.expand_dims(transition_probabilities_given_action1,2),\n",
|
||||
" np.expand_dims(transition_probabilities_given_action2,2),\n",
|
||||
" np.expand_dims(transition_probabilities_given_action3,2)),axis=2)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "l7rT78BbOgTi"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def q_learning_step(state_action_values, reward, state, new_state, action, gamma, alpha = 0.1):\n",
|
||||
" # TODO -- write this function\n",
|
||||
" # Replace this line\n",
|
||||
" state_action_values_after = np.copy(state_action_values)\n",
|
||||
"\n",
|
||||
" return state_action_values_after"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "5pO6-9ACWhiV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# This takes a single step from an MDP which just has a completely random policy\n",
|
||||
"def markov_decision_process_step(state, transition_probabilities_given_action, reward_structure):\n",
|
||||
" # Pick action\n",
|
||||
" action = np.random.randint(4)\n",
|
||||
" # Update the state\n",
|
||||
" new_state = np.random.choice(a=np.arange(0,transition_probabilities_given_action.shape[0]),p = transition_probabilities_given_action[:,state,action])\n",
|
||||
" # Return the reward -- here the reward is for leaving the state\n",
|
||||
" reward = reward_structure[state]\n",
|
||||
"\n",
|
||||
" return new_state, reward, action"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "akjrncMF-FkU"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the state-action values to random numbers\n",
|
||||
"np.random.seed(0)\n",
|
||||
"n_state = transition_probabilities_given_action.shape[0]\n",
|
||||
"n_action = transition_probabilities_given_action.shape[2]\n",
|
||||
"state_action_values = np.random.normal(size=(n_action, n_state))\n",
|
||||
"gamma = 0.9\n",
|
||||
"\n",
|
||||
"policy = np.argmax(state_action_values, axis=0).astype(int)\n",
|
||||
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||
"mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values, rewards = reward_structure)\n",
|
||||
"\n",
|
||||
"# Now let's simulate a single Q-learning step\n",
|
||||
"initial_state = 9\n",
|
||||
"print(\"Initial state = \", initial_state)\n",
|
||||
"new_state, reward, action = markov_decision_process_step(initial_state, transition_probabilities_given_action, reward_structure)\n",
|
||||
"print(\"Action = \", action)\n",
|
||||
"print(\"New state = \", new_state)\n",
|
||||
"print(\"Reward = \", reward)\n",
|
||||
"\n",
|
||||
"state_action_values_after = q_learning_step(state_action_values, reward, initial_state, new_state, action, gamma)\n",
|
||||
"print(\"Your value:\",state_action_values_after[action, initial_state])\n",
|
||||
"print(\"True value: 0.27650262412468796\")\n",
|
||||
"\n",
|
||||
"policy = np.argmax(state_action_values, axis=0).astype(int)\n",
|
||||
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||
"mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values_after, rewards = reward_structure)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Fu5_VjvbSwfJ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's run this for a while and watch the policy improve"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Ogh0qucmb68J"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the state-action values to random numbers\n",
|
||||
"np.random.seed(0)\n",
|
||||
"n_state = transition_probabilities_given_action.shape[0]\n",
|
||||
"n_action = transition_probabilities_given_action.shape[2]\n",
|
||||
"state_action_values = np.random.normal(size=(n_action, n_state))\n",
|
||||
"# Hard code termination state of finding fish\n",
|
||||
"state_action_values[:,n_state-1] = 3.0\n",
|
||||
"gamma = 0.9\n",
|
||||
"\n",
|
||||
"# Draw the initial setup\n",
|
||||
"policy = np.argmax(state_action_values, axis=0).astype(int)\n",
|
||||
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||
"mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values, rewards = reward_structure)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"state= np.random.randint(n_state-1)\n",
|
||||
"\n",
|
||||
"# Run for a number of iterations\n",
|
||||
"for c_iter in range(10000):\n",
|
||||
" new_state, reward, action = markov_decision_process_step(state, transition_probabilities_given_action, reward_structure)\n",
|
||||
" state_action_values_after = q_learning_step(state_action_values, reward, state, new_state, action, gamma)\n",
|
||||
" # If in termination state, reset state randomly\n",
|
||||
" if new_state==15:\n",
|
||||
" state= np.random.randint(n_state-1)\n",
|
||||
" else:\n",
|
||||
" state = new_state\n",
|
||||
" # Update the policy\n",
|
||||
" state_action_values = np.copy(state_action_values_after)\n",
|
||||
" policy = np.argmax(state_action_values, axis=0).astype(int)\n",
|
||||
"\n",
|
||||
"# Draw the final situation\n",
|
||||
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||
"mdp_drawer.draw(layout, policy = policy, state_action_values = state_action_values, rewards = reward_structure)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "qQFhwVqPcCFH"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user