diff --git a/CM20315_Training_III.ipynb b/CM20315_Training_III.ipynb new file mode 100644 index 0000000..e8bf65c --- /dev/null +++ b/CM20315_Training_III.ipynb @@ -0,0 +1,585 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyMzgGVp+/BUCXimg7Ip9lhp", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Training III\n", + "\n", + "We now have a model and a loss function which we can use to judge how good that model is. It's time to put the \"learning\" into machine learning.\n", + "\n", + "Learning involves finding the parameters that minimize the loss. That might seems like it's not too hard, but modern models might have billions of parameters. There's an exponential number of possible parameter combinations, and there's no way we can make any progress with exhaustive search.\n", + "\n", + "In part I we considered 1D search using a bracketing approach. In part II we experimented with fitting a linear regression model (which has a convex loss function). 
In this part, we'll fit the Gabor model, which has a non-convex loss function.\n", + "\n", + "\n" + ], + "metadata": { + "id": "el8l05WQEO46" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xhmIOLiZELV_" + }, + "outputs": [], + "source": [ + "# import libraries\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib import cm\n", + "from matplotlib.colors import ListedColormap" + ] + }, + { + "cell_type": "code", + "source": [ + "# Let's create our training data 30 pairs {x_i, y_i}\n", + "# We'll try to fit the Gabor model to these data\n", + "data = np.array([[-1.920e+00,-1.422e+01,1.490e+00,-1.940e+00,-2.389e+00,-5.090e+00,\n", + " -8.861e+00,3.578e+00,-6.010e+00,-6.995e+00,3.634e+00,8.743e-01,\n", + " -1.096e+01,4.073e-01,-9.467e+00,8.560e+00,1.062e+01,-1.729e-01,\n", + " 1.040e+01,-1.261e+01,1.574e-01,-1.304e+01,-2.156e+00,-1.210e+01,\n", + " -1.119e+01,2.902e+00,-8.220e+00,-1.179e+01,-8.391e+00,-4.505e+00],\n", + " [-1.051e+00,-2.482e-02,8.896e-01,-4.943e-01,-9.371e-01,4.306e-01,\n", + " 9.577e-03,-7.944e-02 ,1.624e-01,-2.682e-01,-3.129e-01,8.303e-01,\n", + " -2.365e-02,5.098e-01,-2.777e-01,3.367e-01,1.927e-01,-2.222e-01,\n", + " 6.352e-02,6.888e-03,3.224e-02,1.091e-02,-5.706e-01,-5.258e-02,\n", + " -3.666e-02,1.709e-01,-4.805e-02,2.008e-01,-1.904e-01,5.952e-01]])" + ], + "metadata": { + "id": "4cRkrh9MZ58Z" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Let's define our model\n", + "def model(phi,x):\n", + " sin_component = np.sin(phi[0] + 0.06 * phi[1] * x)\n", + " gauss_component = np.exp(-(phi[0] + 0.06 * phi[1] * x) * (phi[0] + 0.06 * phi[1] * x) / 32)\n", + " y_pred= sin_component * gauss_component\n", + " return y_pred" + ], + "metadata": { + "id": "WQUERmb2erAe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Draw model\n", + "def draw_model(data,model,phi,title=None):\n", + " 
x_model = np.arange(-15,15,0.1)\n", + " y_model = model(phi,x_model)\n", + "\n", + " fix, ax = plt.subplots()\n", + " ax.plot(data[0,:],data[1,:],'bo')\n", + " ax.plot(x_model,y_model,'m-')\n", + " ax.set_xlim([-15,15]);ax.set_ylim([-1,1])\n", + " ax.set_xlabel('x'); ax.set_ylabel('y')\n", + " if title is not None:\n", + " ax.set_title(title)\n", + " plt.show()" + ], + "metadata": { + "id": "qFRe9POHF2le" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize the parameters and draw the model\n", + "phi = np.zeros((2,1))\n", + "phi[0] = -5 # Horizontal offset\n", + "phi[1] = 25 # Frequency\n", + "draw_model(data,model,phi, \"Initial parameters\")\n" + ], + "metadata": { + "id": "TXx1Tpd1Tl-I" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's compute the sum of squares loss for the training data" + ], + "metadata": { + "id": "QU5mdGvpTtEG" + } + }, + { + "cell_type": "code", + "source": [ + "def compute_loss(data_x, data_y, model, phi):\n", + " # TODO -- Write this function -- replace the line below\n", + " # TODO -- First make model predictions from data x\n", + " # TODO -- Then compute the squared difference between the predictions and true y values\n", + " # TODO -- Then sum them all and return\n", + " loss = 0\n", + " return loss" + ], + "metadata": { + "id": "I7dqTY2Gg7CR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Let's just test that we got that right" + ], + "metadata": { + "id": "eB5DQvU5hYNx" + } + }, + { + "cell_type": "code", + "source": [ + "loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n", + "print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 16.419))" + ], + "metadata": { + "id": "Ty05UtEEg9tc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's plot the whole loss function" + ], + 
"metadata": { + "id": "F3trnavPiHpH" + } + }, + { + "cell_type": "code", + "source": [ + "def draw_loss_function(compute_loss, data, model, phi_iters = None):\n", + " # Define pretty colormap\n", + " my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 
'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n", + " my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n", + " r = np.floor(my_colormap_vals_dec/(256*256))\n", + " g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n", + " b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n", + " my_colormap = ListedColormap(np.vstack((r,g,b)).transpose()/255.0)\n", + "\n", + " # Make grid of intercept/slope values to plot\n", + " offsets_mesh, freqs_mesh = np.meshgrid(np.arange(-10,10.0,0.1), np.arange(2.5,22.5,0.1))\n", + " loss_mesh = np.zeros_like(freqs_mesh)\n", + " # Compute loss for every set of parameters\n", + " for idslope, slope in np.ndenumerate(freqs_mesh):\n", + " loss_mesh[idslope] = compute_loss(data[0,:], data[1,:], model, np.array([[offsets_mesh[idslope]], [slope]]))\n", + "\n", + " fig,ax = plt.subplots()\n", + " fig.set_size_inches(8,8)\n", + " ax.contourf(offsets_mesh,freqs_mesh,loss_mesh,256,cmap=my_colormap)\n", + " ax.contour(offsets_mesh,freqs_mesh,loss_mesh,20,colors=['#80808080'])\n", + " if phi_iters is not None:\n", + " ax.plot(phi_iters[0,:], phi_iters[1,:],'go-')\n", + " ax.set_ylim([2.5,22.5])\n", + " ax.set_xlabel('Offset $\\phi_{0}$'); ax.set_ylabel('Frequency, 
$\\phi_{1}$')\n", + " plt.show()" + ], + "metadata": { + "id": "K-NTHpAAHlCl" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "draw_loss_function(compute_loss, data, model)" + ], + "metadata": { + "id": "l8HbvIupnTME" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's compute the gradient vector for a given set of parameters:\n", + "\n", + "\\begin{equation}\n", + "\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n", + "\\end{equation}" + ], + "metadata": { + "id": "s9Duf05WqqSC" + } + }, + { + "cell_type": "code", + "source": [ + "# These came from writing out the expression for the sum of squares loss and taking the\n", + "# derivative with respect to phi0 and phi1. It was a lot of hassle to get it right!\n", + "def gabor_deriv_phi0(data_x,data_y,phi0, phi1):\n", + " x = 0.06 * phi1 * data_x + phi0\n", + " y = data_y \n", + " cos_component = np.cos(x)\n", + " sin_component = np.sin(x)\n", + " gauss_component = np.exp(-0.5 * x *x / 16)\n", + " deriv = cos_component * gauss_component - sin_component * gauss_component * x / 16\n", + " deriv = 2* deriv * (sin_component * gauss_component - y)\n", + " return np.sum(deriv)\n", + "\n", + "def gabor_deriv_phi1(data_x, data_y,phi0, phi1):\n", + " x = 0.06 * phi1 * data_x + phi0\n", + " y = data_y \n", + " cos_component = np.cos(x)\n", + " sin_component = np.sin(x)\n", + " gauss_component = np.exp(-0.5 * x *x / 16)\n", + " deriv = 0.06 * data_x * cos_component * gauss_component - 0.06 * data_x*sin_component * gauss_component * x / 16\n", + " deriv = 2*deriv * (sin_component * gauss_component - y)\n", + " return np.sum(deriv)\n", + "\n", + "def compute_gradient(data_x, data_y, phi):\n", + " dl_dphi0 = gabor_deriv_phi0(data_x, data_y, phi[0],phi[1])\n", + " dl_dphi1 = gabor_deriv_phi1(data_x, data_y, 
phi[0],phi[1])\n", + " # Return the gradient\n", + " return np.array([[dl_dphi0],[dl_dphi1]])" + ], + "metadata": { + "id": "UpswmkL2qwBT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n", + "\n", + "\\begin{eqnarray}\n", + "\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n", + "\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n", + "\\end{eqnarray}\n", + "\n", + "We don't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient." + ], + "metadata": { + "id": "RS1nEcYVuEAM" + } + }, + { + "cell_type": "code", + "source": [ + "# Compute the gradient using your function\n", + "gradient = compute_gradient(data[0,:],data[1,:], phi)\n", + "print(\"Your gradients: (%3.3f,%3.3f)\"%(gradient[0],gradient[1]))\n", + "# Approximate the gradients with finite differences\n", + "delta = 0.0001\n", + "dl_dphi0_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[delta],[0]])) - \\\n", + " compute_loss(data[0,:],data[1,:],model,phi))/delta\n", + "dl_dphi1_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[0],[delta]])) - \\\n", + " compute_loss(data[0,:],data[1,:],model,phi))/delta\n", + "print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n" + ], + "metadata": { + "id": "QuwAHN7yt-gi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now we are ready to perform gradient descent. 
We'll need to use our line search routine from part I, which I've reproduced here plus the helper function loss_function_1D that converts from a 2D problem to a 1D problem" + ], + "metadata": { + "id": "5EIjMM9Fw2eT" + } + }, + { + "cell_type": "code", + "source": [ + "def loss_function_1D(dist_prop, data, model, phi_start, gradient):\n", + " # Return the loss after moving this far\n", + " return compute_loss(data[0,:], data[1,:], model, phi_start+ gradient * dist_prop)\n", + "\n", + "def line_search(data, model, phi, gradient, thresh=.00001, max_dist = 0.1, max_iter = 15, verbose=False):\n", + " # Initialize four points along the range we are going to search\n", + " a = 0\n", + " b = 0.33 * max_dist\n", + " c = 0.66 * max_dist\n", + " d = 1.0 * max_dist\n", + " n_iter =0;\n", + " \n", + " # While we haven't found the minimum closely enough\n", + " while np.abs(b-c) > thresh and n_iter < max_iter:\n", + " # Increment iteration counter (just to prevent an infinite loop)\n", + " n_iter = n_iter+1\n", + " # Calculate all four points\n", + " lossa = loss_function_1D(a, data, model, phi,gradient)\n", + " lossb = loss_function_1D(b, data, model, phi,gradient)\n", + " lossc = loss_function_1D(c, data, model, phi,gradient)\n", + " lossd = loss_function_1D(d, data, model, phi,gradient)\n", + "\n", + " if verbose:\n", + " print('Iter %d, a=%3.3f, b=%3.3f, c=%3.3f, d=%3.3f'%(n_iter, a,b,c,d))\n", + " print('a %f, b%f, c%f, d%f'%(lossa,lossb,lossc,lossd))\n", + "\n", + " # Rule #1 If point A is less than points B, C, and D then halve points B,C, and D\n", + " if np.argmin((lossa,lossb,lossc,lossd))==0:\n", + " b = b/2\n", + " c = c/2\n", + " d = d/2\n", + " continue;\n", + "\n", + " # Rule #2 If point b is less than point c then\n", + " # then point d becomes point c, and\n", + " # point b becomes 1/3 between a and new d\n", + " # point c becomes 2/3 between a and new d \n", + " if lossb < lossc:\n", + " d = c\n", + " b = a+ (d-a)/3\n", + " c = a+ 2*(d-a)/3\n", + " 
continue\n", + "\n", + " # Rule #3 If point c is less than point b then\n", + " # then point a becomes point b, and\n", + " # point b becomes 1/3 between new a and d\n", + " # point c becomes 2/3 between new a and d \n", + " a = b\n", + " b = a+ (d-a)/3\n", + " c = a+ 2*(d-a)/3\n", + " \n", + " # Return average of two middle points\n", + " return (b+c)/2.0" + ], + "metadata": { + "id": "XrJ2gQjfw1XP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def gradient_descent_step(phi, data, model):\n", + " # Step 1: Compute the gradient\n", + " gradient = compute_gradient(data[0,:],data[1,:], phi)\n", + " # Step 2: Update the parameters -- note we want to search in the negative (downhill direction)\n", + " alpha = line_search(data, model, phi, gradient*-1, max_dist = 2.0)\n", + " phi = phi - alpha * gradient\n", + " return phi" + ], + "metadata": { + "id": "YVq6rmaWRD2M" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize the parameters\n", + "n_steps = 21\n", + "phi_all = np.zeros((2,n_steps+1))\n", + "phi_all[0,0] = -1.5\n", + "phi_all[1,0] = 8.5\n", + "\n", + "# Measure loss and draw initial model\n", + "loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n", + "draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n", + "\n", + "for c_step in range (n_steps):\n", + " # Do gradient descent step\n", + " phi_all[:,c_step+1:c_step+2] = gradient_descent_step(phi_all[:,c_step:c_step+1],data, model)\n", + " # Measure loss and draw model every 4th step\n", + " if c_step % 4 == 0:\n", + " loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n", + " draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n", + "\n", + "draw_loss_function(compute_loss, data, model,phi_all)\n" + ], + "metadata": { + "id": "tOLd0gtdRLLS" + }, + "execution_count": null, + "outputs": [] + }, + { + 
"cell_type": "code", + "source": [ + "# TODO Experiment with starting the optimization in the previous cell in different places\n", + "# and show that it heads to a local minimum if we don't start it in the right valley" + ], + "metadata": { + "id": "Oi8ZlH0ptLqA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def gradient_descent_step_fixed_learning_rate(phi, data, model, alpha):\n", + " # TODO -- fill in this routine so that we take a fixed size step of size alpha without using line search\n", + "\n", + " return phi" + ], + "metadata": { + "id": "4l-ueLk-oAxV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize the parameters\n", + "n_steps = 21\n", + "phi_all = np.zeros((2,n_steps+1))\n", + "phi_all[0,0] = -1.5\n", + "phi_all[1,0] = 8.5\n", + "\n", + "# Measure loss and draw initial model\n", + "loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n", + "draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n", + "\n", + "for c_step in range (n_steps):\n", + " # Do gradient descent step\n", + " phi_all[:,c_step+1:c_step+2] = gradient_descent_step_fixed_learning_rate(phi_all[:,c_step:c_step+1],data, model,alpha =0.2)\n", + " # Measure loss and draw model every 4th step\n", + " if c_step % 4 == 0:\n", + " loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n", + " draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n", + "\n", + "draw_loss_function(compute_loss, data, model,phi_all)\n" + ], + "metadata": { + "id": "oi9MX_GRpM41" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# TODO Experiment with the learning rate, alpha. \n", + "# What happens if you set it too large?\n", + "# What happens if you set it too small?" 
+ ], + "metadata": { + "id": "In6sQ5YCpMqn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def stochastic_gradient_descent_step(phi, data, model, alpha, batch_size):\n", + " # TODO -- fill in this routine so that we take a fixed size step of size alpha but only using a subset (batch) of the data\n", + " # at each step\n", + " # You can use the function np.random.permutation to generate a random permutation of the n_data = data.shape[1] indices\n", + " # and then just choose the first n=batch_size of these indices. Then select compute the gradient update\n", + " # from just the data with these indices. Don't worry about sampling with replacement.\n", + "\n", + "\n", + " return phi" + ], + "metadata": { + "id": "VKTC9-1Gpm3N" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Set the random number generator so you always get same numbers (disable if you don't want this)\n", + "np.random.seed(1)\n", + "# Initialize the parameters\n", + "n_steps = 41\n", + "phi_all = np.zeros((2,n_steps+1))\n", + "phi_all[0,0] = 3.5\n", + "phi_all[1,0] = 6.5\n", + "\n", + "# Measure loss and draw initial model\n", + "loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n", + "draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n", + "\n", + "for c_step in range (n_steps):\n", + " # Do gradient descent step\n", + " phi_all[:,c_step+1:c_step+2] = stochastic_gradient_descent_step(phi_all[:,c_step:c_step+1],data, model,alpha =0.8, batch_size=5)\n", + " # Measure loss and draw model every 4th step\n", + " if c_step % 8 == 0:\n", + " loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n", + " draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n", + "\n", + "draw_loss_function(compute_loss, data, model,phi_all)" + ], + "metadata": { + "id": "469OP_UHskJ4" + }, + "execution_count": null, + 
"outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# TODO -- Experiment with different learning rates, starting points, batch sizes, number of steps. Get a feel for this." + ], + "metadata": { + "id": "LxE2kTa3s29p" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# TODO -- How about adding a learning rate schedule? Reduce the learning rate by a factor of beta every M iterations" + ], + "metadata": { + "id": "lw4QPOaQTh5e" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file