Created using Colaboratory
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
"metadata": {
|
"metadata": {
|
||||||
"colab": {
|
"colab": {
|
||||||
"provenance": [],
|
"provenance": [],
|
||||||
"authorship_tag": "ABX9TyMForqbtn4usiIlRAenjCfh",
|
"authorship_tag": "ABX9TyPg3umHnqmIXX6jGe809Nxf",
|
||||||
"include_colab_link": true
|
"include_colab_link": true
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
@@ -46,13 +46,691 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"\n"
|
"from PIL import Image"
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "OLComQyvCIJ7"
|
"id": "OLComQyvCIJ7"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"outputs": []
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Get local copies of components of images\n",
|
||||||
|
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Empty.png\n",
|
||||||
|
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Hole.png\n",
|
||||||
|
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Fish.png\n",
|
||||||
|
"!wget https://raw.githubusercontent.com/udlbook/udlbook/main/Notebooks/Chap19/Penguin.png"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "ZsvrUszPLyEG"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Ugly class that takes care of drawing pictures like in the book.\n",
|
||||||
|
"# You can totally ignore this code!\n",
|
||||||
|
"class DrawMDP:\n",
|
||||||
|
" # Constructor initializes parameters\n",
|
||||||
|
" def __init__(self, n_row, n_col):\n",
|
||||||
|
" self.empty_image = np.asarray(Image.open('Empty.png'))\n",
|
||||||
|
" self.hole_image = np.asarray(Image.open('Hole.png'))\n",
|
||||||
|
" self.fish_image = np.asarray(Image.open('Fish.png'))\n",
|
||||||
|
" self.penguin_image = np.asarray(Image.open('Penguin.png'))\n",
|
||||||
|
" self.fig,self.ax = plt.subplots()\n",
|
||||||
|
" self.n_row = n_row\n",
|
||||||
|
" self.n_col = n_col\n",
|
||||||
|
"\n",
|
||||||
|
" my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
|
||||||
|
" my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
|
||||||
|
" r = np.floor(my_colormap_vals_dec/(256*256))\n",
|
||||||
|
" g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
|
||||||
|
" b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
|
||||||
|
" self.colormap = np.vstack((r,g,b)).transpose()/255.0\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" def draw_text(self, text, row, col, position, color):\n",
|
||||||
|
" if position == 'bc':\n",
|
||||||
|
" self.ax.text( 83*col+41,83 * (row+1) -10, text, horizontalalignment=\"center\", color=color, fontweight='bold')\n",
|
||||||
|
" if position == 'tl':\n",
|
||||||
|
" self.ax.text( 83*col+5,83 * row +5, text, verticalalignment = 'top', horizontalalignment=\"left\", color=color, fontweight='bold')\n",
|
||||||
|
"\n",
|
||||||
|
" # Draws a set of states\n",
|
||||||
|
" def draw_path(self, path, color1, color2):\n",
|
||||||
|
" for i in range(len(path)-1):\n",
|
||||||
|
" row_start = np.floor(path[i]/self.n_col)\n",
|
||||||
|
" row_end = np.floor(path[i+1]/self.n_col)\n",
|
||||||
|
" col_start = path[i] - row_start * self.n_col\n",
|
||||||
|
" col_end = path[i+1] - row_end * self.n_col\n",
|
||||||
|
"\n",
|
||||||
|
" color_index = int(np.floor(255 * i/(len(path)-1.)))\n",
|
||||||
|
" self.ax.plot([col_start * 83+41 + i, col_end * 83+41 + i ],[row_start * 83+41 + i, row_end * 83+41 + i ], color=(self.colormap[color_index,0],self.colormap[color_index,1],self.colormap[color_index,2]))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw deterministic policy\n",
|
||||||
|
" def draw_deterministic_policy(self,i, action):\n",
|
||||||
|
" row = np.floor(i/self.n_col)\n",
|
||||||
|
" col = i - row * self.n_col\n",
|
||||||
|
" center_x = 83 * col + 41\n",
|
||||||
|
" center_y = 83 * row + 41\n",
|
||||||
|
" arrow_base_width = 10\n",
|
||||||
|
" arrow_height = 15\n",
|
||||||
|
" # Draw arrow pointing upward\n",
|
||||||
|
" if action ==0:\n",
|
||||||
|
" triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
|
||||||
|
" [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
|
||||||
|
" [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
|
||||||
|
" # Draw arrow pointing right\n",
|
||||||
|
" if action ==1:\n",
|
||||||
|
" triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
|
||||||
|
" [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||||
|
" [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||||
|
" # Draw arrow pointing downward\n",
|
||||||
|
" if action ==2:\n",
|
||||||
|
" triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
|
||||||
|
" [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
|
||||||
|
" [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
|
||||||
|
" # Draw arrow pointing left\n",
|
||||||
|
" if action ==3:\n",
|
||||||
|
" triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
|
||||||
|
" [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||||
|
" [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||||
|
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw stochastic policy\n",
|
||||||
|
" def draw_stochastic_policy(self,i, action_probs):\n",
|
||||||
|
" row = np.floor(i/self.n_col)\n",
|
||||||
|
" col = i - row * self.n_col\n",
|
||||||
|
" offset = 20\n",
|
||||||
|
" # Draw arrow pointing upward\n",
|
||||||
|
" center_x = 83 * col + 41\n",
|
||||||
|
" center_y = 83 * row + 41 - offset\n",
|
||||||
|
" arrow_base_width = 15 * action_probs[0]\n",
|
||||||
|
" arrow_height = 20 * action_probs[0]\n",
|
||||||
|
" triangle_indices = np.array([[center_x, center_y-arrow_height/2],\n",
|
||||||
|
" [center_x - arrow_base_width/2, center_y+arrow_height/2],\n",
|
||||||
|
" [center_x + arrow_base_width/2, center_y+arrow_height/2]])\n",
|
||||||
|
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw arrow pointing right\n",
|
||||||
|
" center_x = 83 * col + 41 + offset\n",
|
||||||
|
" center_y = 83 * row + 41\n",
|
||||||
|
" arrow_base_width = 15 * action_probs[1]\n",
|
||||||
|
" arrow_height = 20 * action_probs[1]\n",
|
||||||
|
" triangle_indices = np.array([[center_x + arrow_height/2, center_y],\n",
|
||||||
|
" [center_x - arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||||
|
" [center_x - arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||||
|
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw arrow pointing downward\n",
|
||||||
|
" center_x = 83 * col + 41\n",
|
||||||
|
" center_y = 83 * row + 41 +offset\n",
|
||||||
|
" arrow_base_width = 15 * action_probs[2]\n",
|
||||||
|
" arrow_height = 20 * action_probs[2]\n",
|
||||||
|
" triangle_indices = np.array([[center_x, center_y+arrow_height/2],\n",
|
||||||
|
" [center_x - arrow_base_width/2, center_y-arrow_height/2],\n",
|
||||||
|
" [center_x + arrow_base_width/2, center_y-arrow_height/2]])\n",
|
||||||
|
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw arrow pointing left\n",
|
||||||
|
" center_x = 83 * col + 41 -offset\n",
|
||||||
|
" center_y = 83 * row + 41\n",
|
||||||
|
" arrow_base_width = 15 * action_probs[3]\n",
|
||||||
|
" arrow_height = 20 * action_probs[3]\n",
|
||||||
|
" triangle_indices = np.array([[center_x - arrow_height/2, center_y],\n",
|
||||||
|
" [center_x + arrow_height/2, center_y-arrow_base_width/2],\n",
|
||||||
|
" [center_x + arrow_height/2, center_y+arrow_base_width/2]])\n",
|
||||||
|
" self.ax.fill(triangle_indices[:,0], triangle_indices[:,1],facecolor='cyan', edgecolor='darkcyan', linewidth=1)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" def draw(self, layout, state, draw_state_index= False, rewards=None, policy=None, state_values=None, action_values=None,path1=None, path2 = None):\n",
|
||||||
|
" # Construct the image\n",
|
||||||
|
" image_out = np.zeros((self.n_row * 83, self.n_col * 83, 4),dtype='uint8')\n",
|
||||||
|
" for c_row in range (self.n_row):\n",
|
||||||
|
" for c_col in range(self.n_col):\n",
|
||||||
|
" if layout[c_row * self.n_col + c_col]==0:\n",
|
||||||
|
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.empty_image\n",
|
||||||
|
" elif layout[c_row * self.n_col + c_col]==1:\n",
|
||||||
|
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.hole_image\n",
|
||||||
|
" else:\n",
|
||||||
|
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.fish_image\n",
|
||||||
|
" if state == c_row * self.n_col + c_col:\n",
|
||||||
|
" image_out[c_row*83:c_row*83+83, c_col*83:c_col*83+83,:] = self.penguin_image\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw the image\n",
|
||||||
|
" plt.imshow(image_out)\n",
|
||||||
|
" self.ax.get_xaxis().set_visible(False)\n",
|
||||||
|
" self.ax.get_yaxis().set_visible(False)\n",
|
||||||
|
" self.ax.spines['top'].set_visible(False)\n",
|
||||||
|
" self.ax.spines['right'].set_visible(False)\n",
|
||||||
|
" self.ax.spines['bottom'].set_visible(False)\n",
|
||||||
|
" self.ax.spines['left'].set_visible(False)\n",
|
||||||
|
"\n",
|
||||||
|
" if draw_state_index:\n",
|
||||||
|
" for c_cell in range(layout.size):\n",
|
||||||
|
" self.draw_text(\"%d\"%(c_cell), np.floor(c_cell/self.n_col), c_cell-np.floor(c_cell/self.n_col)*self.n_col,'tl','k')\n",
|
||||||
|
"\n",
|
||||||
|
" # Draw the policy as triangles\n",
|
||||||
|
" if policy is not None:\n",
|
||||||
|
" # If the policy is deterministic\n",
|
||||||
|
" if len(policy) == len(layout):\n",
|
||||||
|
" for i in range(len(layout)):\n",
|
||||||
|
" self.draw_deterministic_policy(i, policy[i])\n",
|
||||||
|
" # Else it is stochastic\n",
|
||||||
|
" else:\n",
|
||||||
|
" for i in range(len(layout)):\n",
|
||||||
|
" self.draw_stochastic_policy(i,policy[:,i])\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" if path1 is not None:\n",
|
||||||
|
" # self.draw_path(path1, np.array([0.81, 0.51, 0.38]), np.array([1.0, 0.2, 0.5]))\n",
|
||||||
|
" self.draw_path(path1, np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 1.0]))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" plt.show()"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "Gq1HfJsHN3SB"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Let's draw the initial situation with the penguin in top right\n",
|
||||||
|
"n_rows = 4; n_cols = 4\n",
|
||||||
|
"layout = np.zeros(n_rows * n_cols)\n",
|
||||||
|
"initial_state = 0\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = initial_state, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "eBQ7lTpJQBSe"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Note that the states are indexed from 0 rather than 1 as in the book to make\n",
|
||||||
|
"the code neater."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "P7P40UyMunKb"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Define the state probabilities\n",
|
||||||
|
"transition_probabilities = np.array( \\\n",
|
||||||
|
"[[0.00 , 0.33, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.50 , 0.00, 0.33, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.33, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.50 , 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.34, 0.00, 0.00, 0.33, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.34, 0.00, 0.00, 0.25, 0.00, 0.33, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00, 0.33, 0.00, 0.00, 0.33, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.34, 0.00, 0.50 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.34, 0.00 ],\n",
|
||||||
|
"])\n",
|
||||||
|
"initial_state = 0"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "wgFcIi4YQJWI"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Define a step from the Markov process"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "axllRDDuDDLS"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def markov_process_step(state, transition_probabilities):\n",
|
||||||
|
" # TODO -- update the state according to the appropriate transition probabilities\n",
|
||||||
|
" # One way to do this is to use np.random.choice\n",
|
||||||
|
" # Replace this line:\n",
|
||||||
|
" new_state = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" return new_state"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "FrSZrS67sdbN"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Run the Markov process for 10 steps and visualise the results"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "uTj7rN6LDFXd"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"np.random.seed(0)\n",
|
||||||
|
"T = 10\n",
|
||||||
|
"states = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"states[0] = 0\n",
|
||||||
|
"for t in range(T-1):\n",
|
||||||
|
" states[t+1] = markov_process_step(states[t], transition_probabilities)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Your States:\", states)\n",
|
||||||
|
"print(\"True States: [ 0 4 8 9 10 9 10 9 13 14]\")\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = states[0], path1=states, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "lRIdjagCwP62"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Define a Markov one step of a reward process."
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "QLyjyBjjDMin"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def markov_reward_process_step(state, transition_probabilities, reward_structure):\n",
|
||||||
|
"\n",
|
||||||
|
" # TODO -- write this function\n",
|
||||||
|
" # Update the state. Return a reward of +1 if the Penguin lands on the fish\n",
|
||||||
|
" # or zero otherwise.\n",
|
||||||
|
" # Replace this line\n",
|
||||||
|
" new_state = 0; reward = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" return new_state, reward"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "YPHSJRKx-pgO"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Run the Markov reward process for 10 steps and visualise the results"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "AIz8QEiRFoCm"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Set up the reward structure so it matches figure 19.2\n",
|
||||||
|
"reward_structure = np.zeros((16,1))\n",
|
||||||
|
"reward_structure[3] = 1; reward_structure[8] = 1; reward_structure[10] = 1\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize random numbers\n",
|
||||||
|
"np.random.seed(0)\n",
|
||||||
|
"T = 10\n",
|
||||||
|
"# Set up the states, so the fish are in the same positions as figure 19.2\n",
|
||||||
|
"states = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"rewards = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"\n",
|
||||||
|
"states[0] = 0\n",
|
||||||
|
"for t in range(T-1):\n",
|
||||||
|
" states[t+1],rewards[t+1] = markov_reward_process_step(states[t], transition_probabilities, reward_structure)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Your States:\", states)\n",
|
||||||
|
"print(\"Your Rewards:\", rewards)\n",
|
||||||
|
"print(\"True Rewards: [0 0 1 0 1 0 1 0 0 0]\")\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Draw the figure\n",
|
||||||
|
"layout = np.zeros(n_rows * n_cols)\n",
|
||||||
|
"layout[3] = 2; layout[8] = 2 ; layout[10] = 2\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = states[0], path1=states, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "0p1gCpGoFn4M"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Now let's calculate the return -- the sum of discounted future rewards"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "lyz47NWrITfj"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def calculate_return(rewards, gamma):\n",
|
||||||
|
" # TODO -- you write this function\n",
|
||||||
|
" # It should compute one return for the start of the sequence (i.e. G_1)\n",
|
||||||
|
" # Replace this line\n",
|
||||||
|
" return_val = 0.0\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" return return_val"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "4fEuBRPnFm_N"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"gamma = 0.9\n",
|
||||||
|
"for t in range(len(states)):\n",
|
||||||
|
" print(\"Return at time %d = %3.3f\"%(t, calculate_return(rewards[t:],gamma)))\n",
|
||||||
|
"\n",
|
||||||
|
"# Reality check!\n",
|
||||||
|
"print(\"True return at time 0: 1.998\")"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "o19lQgM3JrOz"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"Now let's define the state transition function $Pr(s_{t+1}|s_{t},a)$ in full where $a$ is the actions. Here $a=0$ means try to go upward, $a=1$, right, $a=2$ down and $a=3$ right. However, the ice is slippery, so we don't always go the direction we want to.\n",
|
||||||
|
"\n",
|
||||||
|
"Note that as for the states, we've indexed the actions from zero (unlike in the book, so they map to the indices of arrays better)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "Fhc6DzZNOjiC"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"transition_probabilities_given_action1 = np.array(\\\n",
|
||||||
|
"[[0.00 , 0.33, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.50 , 0.00, 0.33, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.33, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.50 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.34, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.34, 0.00, 0.00, 0.17, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.75, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.17, 0.00, 0.25, 0.00, 0.00, 0.50, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.75 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.25, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.25 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"transition_probabilities_given_action2 = np.array(\\\n",
|
||||||
|
"[[0.00 , 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.75 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.50, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.25 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.25, 0.00, 0.00, 0.50, 0.00, 0.17, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.25, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.75, 0.00, 0.25, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.50 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.50, 0.00 ],\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"transition_probabilities_given_action3 = np.array(\\\n",
|
||||||
|
"[[0.00 , 0.25, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.25 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.25, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.75 , 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.50, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.75, 0.00, 0.00, 0.16, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.17, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.17, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.25, 0.00, 0.00, 0.33, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.16, 0.00, 0.00, 0.00, 0.00, 0.50 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.33, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.33, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.34, 0.00, 0.50 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.34, 0.00 ],\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"transition_probabilities_given_action4 = np.array(\\\n",
|
||||||
|
"[[0.00 , 0.25, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.50 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.50, 0.00, 0.75, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.50, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.50 , 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.33, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.25, 0.00, 0.00, 0.33, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.50, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00, 0.50, 0.00, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.33, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.17, 0.00, 0.50, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.17, 0.00, 0.00, 0.00, 0.00, 0.25 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.34, 0.00, 0.00, 0.00, 0.00, 0.50, 0.00, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.50, 0.00, 0.50, 0.00 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.16, 0.00, 0.00, 0.25, 0.00, 0.75 ],\n",
|
||||||
|
" [0.00 , 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.25, 0.00, 0.00, 0.25, 0.00 ],\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"# Store all of these in a three dimension array\n",
|
||||||
|
"# Pr(s_{t+1}=2|s_{t}=1, a_{t}=3] is stored at position [2,1,3]\n",
|
||||||
|
"transition_probabilities_given_action = np.concatenate((np.expand_dims(transition_probabilities_given_action1,2),\n",
|
||||||
|
" np.expand_dims(transition_probabilities_given_action2,2),\n",
|
||||||
|
" np.expand_dims(transition_probabilities_given_action3,2),\n",
|
||||||
|
" np.expand_dims(transition_probabilities_given_action4,2)),axis=2)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "l7rT78BbOgTi"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Now we need a policy. Let's start with the deterministic policy in figure 19.5a:\n",
|
||||||
|
"policy = [2,2,1,1, 2,1,1,1, 1,1,0,2, 1,0,1,1]\n",
|
||||||
|
"\n",
|
||||||
|
"# Let's draw the policy first\n",
|
||||||
|
"layout = np.zeros(n_rows * n_cols)\n",
|
||||||
|
"layout[15] = 2\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = states[0], policy = policy, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "8jWhDlkaKj7Q"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def markov_decision_process_step_deterministic(state, transition_probabilities_given_action, reward_structure, policy):\n",
|
||||||
|
" # TODO -- complete this function.\n",
|
||||||
|
" # For each state, theres is a corresponding action.\n",
|
||||||
|
" # Draw the next state based on the current state and that action\n",
|
||||||
|
" # and calculate the reward\n",
|
||||||
|
" # Replace this line:\n",
|
||||||
|
" new_state = 0; reward = 0;\n",
|
||||||
|
"\n",
|
||||||
|
" return new_state, reward\n"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "dueNbS2SUVUK"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Set up the reward structure so it matches figure 19.2\n",
|
||||||
|
"reward_structure = np.zeros((16,1))\n",
|
||||||
|
"reward_structure[15] = 1\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize random number seed\n",
|
||||||
|
"np.random.seed(3)\n",
|
||||||
|
"T = 10\n",
|
||||||
|
"# Set up the states, so the fish are in the same positions as figure 19.5\n",
|
||||||
|
"states = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"rewards = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"\n",
|
||||||
|
"states[0] = 0\n",
|
||||||
|
"for t in range(T-1):\n",
|
||||||
|
" states[t+1],rewards[t+1] = markov_decision_process_step_deterministic(states[t], transition_probabilities_given_action, reward_structure, policy)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Your States:\", states)\n",
|
||||||
|
"print(\"True States: [ 0 4 8 9 13 14 15 11 7 3]\")\n",
|
||||||
|
"print(\"Your Rewards:\", rewards)\n",
|
||||||
|
"print(\"True Rewards: [0 0 0 0 0 0 1 0 0 0]\")\n",
|
||||||
|
"\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = states[0], path1=states, policy = policy, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "4Du5aUfd2Lci"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": [
|
||||||
|
"You can see that the Penguin usually follows the policy, (heads in the direction of the cyan arrows (when it can). But sometimes, the penguin \"slips\" to a different neighboring state\n",
|
||||||
|
"\n",
|
||||||
|
"Now let's investigate a stochastic policy"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "bLEd8xug33b-"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"np.random.seed(0)\n",
|
||||||
|
"# Let's now choose a random policy. We'll generate a set of random numbers and pass\n",
|
||||||
|
"# them through a softmax function\n",
|
||||||
|
"stochastic_policy = np.random.normal(size=(4,n_rows*n_cols))\n",
|
||||||
|
"stochastic_policy = np.exp(stochastic_policy) / (np.ones((4,1))@ np.expand_dims(np.sum(np.exp(stochastic_policy), axis=0),0))\n",
|
||||||
|
"np.set_printoptions(precision=2)\n",
|
||||||
|
"print(stochastic_policy)\n",
|
||||||
|
"\n",
|
||||||
|
"# Let's draw the policy first\n",
|
||||||
|
"layout = np.zeros(n_rows * n_cols)\n",
|
||||||
|
"layout[15] = 2\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = states[0], path1=states, policy = stochastic_policy, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "o7T0b3tyilDc"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"def markov_decision_process_step_stochastic(state, transition_probabilities_given_action, reward_structure, stochastic_policy):\n",
|
||||||
|
" # TODO -- complete this function.\n",
|
||||||
|
" # For each state, theres is a corresponding distribution over actions\n",
|
||||||
|
" # Draw a sample from that distribution to get the action\n",
|
||||||
|
" # Draw the next state based on the current state and that action\n",
|
||||||
|
" # and calculate the reward\n",
|
||||||
|
" # Replace this line:\n",
|
||||||
|
" new_state = 0; reward = 0;action = 0\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" return new_state, reward, action"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "T68mTZSe6A3w"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"# Set up the reward structure so it matches figure 19.2\n",
|
||||||
|
"reward_structure = np.zeros((16,1))\n",
|
||||||
|
"reward_structure[15] = 1\n",
|
||||||
|
"\n",
|
||||||
|
"# Initialize random number seed\n",
|
||||||
|
"np.random.seed(0)\n",
|
||||||
|
"T = 10\n",
|
||||||
|
"# Set up the states, so the fish are in the same positions as figure 19.5\n",
|
||||||
|
"states = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"rewards = np.zeros(T, dtype='uint8')\n",
|
||||||
|
"actions = np.zeros(T-1, dtype='uint8')\n",
|
||||||
|
"\n",
|
||||||
|
"states[0] = 0\n",
|
||||||
|
"for t in range(T-1):\n",
|
||||||
|
" states[t+1],rewards[t+1],actions[t] = markov_decision_process_step_stochastic(states[t], transition_probabilities_given_action, reward_structure, stochastic_policy)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"Actions\", actions)\n",
|
||||||
|
"print(\"Your States:\", states)\n",
|
||||||
|
"print(\"Your Rewards:\", rewards)\n",
|
||||||
|
"\n",
|
||||||
|
"mdp_drawer = DrawMDP(n_rows, n_cols)\n",
|
||||||
|
"mdp_drawer.draw(layout, state = states[0], path1=states, policy = stochastic_policy, draw_state_index = True)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "hMRVYX2HtqMg"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user