{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap06/6_5_Adam.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# **Notebook 6.5: Adam**\n",
        "\n",
        "This notebook investigates the Adam algorithm as illustrated in figure 6.9 from the book.\n",
        "\n",
        "Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
        "\n",
        "Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions."
      ],
      "metadata": {
        "id": "ysg9OHZq07YC"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Hi_t5nCk01tx"
      },
      "outputs": [],
      "source": [
        "# import libraries\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "from matplotlib.colors import ListedColormap"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Define function that we wish to find the minimum of (normally would be defined implicitly by data and loss)\n",
        "def loss(phi0, phi1):\n",
        "    height = np.exp(-0.5 * (phi1 * phi1)*4.0)\n",
        "    height = height * np. exp(-0.5* (phi0-0.7) *(phi0-0.7)/4.0)\n",
        "    return 1.0-height\n",
        "\n",
        "# Compute the gradients of this function (for simplicity, I just used finite differences)\n",
        "def get_loss_gradient(phi0, phi1):\n",
        "    delta_phi = 0.00001;\n",
        "    gradient = np.zeros((2,1));\n",
        "    gradient[0] = (loss(phi0+delta_phi/2.0, phi1) - loss(phi0-delta_phi/2.0, phi1))/delta_phi\n",
        "    gradient[1] = (loss(phi0, phi1+delta_phi/2.0) - loss(phi0, phi1-delta_phi/2.0))/delta_phi\n",
        "    return gradient[:,0];\n",
        "\n",
        "# Compute the loss function at a range of values of phi0 and phi1 for plotting\n",
        "def get_loss_function_for_plot():\n",
        "  grid_values = np.arange(-1.0,1.0,0.01);\n",
        "  phi0mesh, phi1mesh = np.meshgrid(grid_values, grid_values)\n",
        "  loss_function = np.zeros((grid_values.size, grid_values.size))\n",
        "  for idphi0, phi0 in enumerate(grid_values):\n",
        "      for idphi1, phi1 in enumerate(grid_values):\n",
        "          loss_function[idphi0, idphi1] = loss(phi1,phi0)\n",
        "  return loss_function, phi0mesh, phi1mesh"
      ],
      "metadata": {
        "id": "GTrgOKhp16zw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Define fancy colormap\n",
        "my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
        "my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
        "r = np.floor(my_colormap_vals_dec/(256*256))\n",
        "g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
        "b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
        "my_colormap_vals = np.vstack((r,g,b)).transpose()/255.0\n",
        "my_colormap = ListedColormap(my_colormap_vals)\n",
        "\n",
        "# Plotting function\n",
        "def draw_function(phi0mesh, phi1mesh, loss_function, my_colormap, opt_path):\n",
        "    fig = plt.figure();\n",
        "    ax = plt.axes();\n",
        "    fig.set_size_inches(7,7)\n",
        "    ax.contourf(phi0mesh, phi1mesh, loss_function, 256, cmap=my_colormap);\n",
        "    ax.contour(phi0mesh, phi1mesh, loss_function, 20, colors=['#80808080'])\n",
        "    ax.plot(opt_path[0,:], opt_path[1,:],'-', color='#a0d9d3ff')\n",
        "    ax.plot(opt_path[0,:], opt_path[1,:],'.', color='#a0d9d3ff',markersize=10)\n",
        "    ax.set_xlabel(\"$\\phi_{0}$\")\n",
        "    ax.set_ylabel(\"$\\phi_{1}$\")\n",
        "    plt.show()"
      ],
      "metadata": {
        "id": "YKijFyuH4ZJD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Simple fixed step size gradient descent\n",
        "def grad_descent(start_posn, n_steps, alpha):\n",
        "    grad_path = np.zeros((2, n_steps+1));\n",
        "    grad_path[:,0] = start_posn[:,0];\n",
        "    for c_step in range(n_steps):\n",
        "        this_grad = get_loss_gradient(grad_path[0,c_step], grad_path[1,c_step]);\n",
        "        grad_path[:,c_step+1] = grad_path[:,c_step] - alpha * this_grad\n",
        "    return grad_path;"
      ],
      "metadata": {
        "id": "Afxr7RqR8s7Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "We'll start by running gradient descent with a fixed step size for this loss function."
      ],
      "metadata": {
        "id": "MXZL8lu3-EUF"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "loss_function, phi0mesh, phi1mesh = get_loss_function_for_plot() ;\n",
        "\n",
        "start_posn = np.zeros((2,1));\n",
        "start_posn[0,0] = -0.7; start_posn[1,0] = -0.9\n",
        "\n",
        "# Run gradient descent\n",
        "grad_path1 = grad_descent(start_posn, n_steps=200, alpha = 0.08)\n",
        "draw_function(phi0mesh, phi1mesh, loss_function, my_colormap, grad_path1)\n",
        "grad_path2 = grad_descent(start_posn, n_steps=40, alpha= 1.0)\n",
        "draw_function(phi0mesh, phi1mesh, loss_function, my_colormap, grad_path2)"
      ],
      "metadata": {
        "id": "fgkwVEal8stH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Because the function changes much faster in $\\phi_1$ than in $\\phi_0$, there is no great step size to choose.  If we set the step size so that it makes sensible progress in the $\\phi_1$ direction, then it takes many iterations to converge.  If we set the step size so that we make sensible progress in the $\\phi_0$ direction, then the path oscillates in the $\\phi_1$ direction.  \n",
        "\n",
        "This motivates Adam.  At the core of Adam is the idea that we should just determine which way is downhill along each axis (i.e. left/right for $\\phi_0$ or up/down for $\\phi_1$) and move a fixed distance in that direction."
      ],
      "metadata": {
        "id": "AN2uNxaa-bRX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def normalized_gradients(start_posn, n_steps, alpha,  epsilon=1e-20):\n",
        "    grad_path = np.zeros((2, n_steps+1));\n",
        "    grad_path[:,0] = start_posn[:,0];\n",
        "    for c_step in range(n_steps):\n",
        "        # Measure the gradient as in equation 6.13 (first line)\n",
        "        m = get_loss_gradient(grad_path[0,c_step], grad_path[1,c_step]);\n",
        "        # TO DO -- compute the squared gradient as in equation 6.13 (second line)\n",
        "        # Replace this line:\n",
        "        v = np.ones_like(grad_path[:,0])\n",
        "\n",
        "        # TO DO -- apply the update rule (equation 6.14)\n",
        "        # Replace this line:\n",
        "        grad_path[:,c_step+1] = grad_path[:,c_step]\n",
        "\n",
        "    return grad_path;"
      ],
      "metadata": {
        "id": "IqX2zP_29gLF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Let's try out normalized gradients\n",
        "start_posn = np.zeros((2,1));\n",
        "start_posn[0,0] = -0.7; start_posn[1,0] = -0.9\n",
        "\n",
        "# Run gradient descent\n",
        "grad_path1 = normalized_gradients(start_posn, n_steps=40, alpha = 0.08)\n",
        "draw_function(phi0mesh, phi1mesh, loss_function, my_colormap, grad_path1)"
      ],
      "metadata": {
        "id": "wxe-dKW5Chv3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "This moves towards the minimum at a sensible speed, but we never actually converge -- the solution just bounces back and forth between the last two points.  To make it converge, we add momentum to both the estimates of the gradient and the pointwise squared gradient.  We also modify the statistics by a factor that depends on the time to make sure the progress is now slow to start with."
      ],
      "metadata": {
        "id": "_6KoKBJdGGI4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def adam(start_posn, n_steps, alpha,  beta=0.9, gamma=0.99, epsilon=1e-20):\n",
        "    grad_path = np.zeros((2, n_steps+1));\n",
        "    grad_path[:,0] = start_posn[:,0];\n",
        "    m = np.zeros_like(grad_path[:,0])\n",
        "    v = np.zeros_like(grad_path[:,0])\n",
        "    for c_step in range(n_steps):\n",
        "        # Measure the gradient\n",
        "        grad = get_loss_gradient(grad_path[0,c_step], grad_path[1,c_step])\n",
        "        # TODO -- Update the momentum based gradient estimate equation 6.15 (first line)\n",
        "        # Replace this line:\n",
        "        m = m;\n",
        "\n",
        "\n",
        "        # TODO -- update the momentum based squared gradient estimate as in equation 6.15 (second line)\n",
        "        # Replace this line:\n",
        "        v = v\n",
        "\n",
        "        # TODO -- Modify the statistics according to equation 6.16\n",
        "        # You will need the function np.power\n",
        "        # Replace these lines\n",
        "        m_tilde = m\n",
        "        v_tilde = v\n",
        "\n",
        "\n",
        "        # TO DO -- apply the update rule (equation 6.17)\n",
        "        # Replace this line:\n",
        "        grad_path[:,c_step+1] = grad_path[:,c_step]\n",
        "\n",
        "    return grad_path;"
      ],
      "metadata": {
        "id": "BKUhZSGgDEm0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Let's try out our Adam algorithm\n",
        "start_posn = np.zeros((2,1));\n",
        "start_posn[0,0] = -0.7; start_posn[1,0] = -0.9\n",
        "\n",
        "# Run gradient descent\n",
        "grad_path1 = adam(start_posn, n_steps=60, alpha = 0.05)\n",
        "draw_function(phi0mesh, phi1mesh, loss_function, my_colormap, grad_path1)"
      ],
      "metadata": {
        "id": "sg5X18P3IbYo"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}