Compare commits
81 Commits
| Author | SHA1 | Date |
|---|---|---|
| | b5fbe8445e | |
| | fd0144d4ab | |
| | 4335f935a1 | |
| | 45ddca3c52 | |
| | b52d05a785 | |
| | 61316a273b | |
| | 25b84c5cef | |
| | 57e5958296 | |
| | 25e0c17f54 | |
| | ea08e72ce7 | |
| | 863a5db7fa | |
| | 4ba1a3a18e | |
| | 96c48791b9 | |
| | bf7ef7f552 | |
| | a454a074f6 | |
| | 247acf8e38 | |
| | 11be14e1eb | |
| | ba250e5a23 | |
| | 13f3dae8f4 | |
| | d2b5b4f3fc | |
| | 39dded5a11 | |
| | 7897eab367 | |
| | 003ec5c4c9 | |
| | 222fb3a66e | |
| | 7dec3bd387 | |
| | 01fdc94a74 | |
| | 1975d3b42c | |
| | 8cd23ef2d4 | |
| | 9b4b5ee60c | |
| | 73319bb22a | |
| | a59386bffe | |
| | 07be66c13f | |
| | fbdc06086d | |
| | bde68ed94e | |
| | d92c1b803d | |
| | 25437f27fd | |
| | 06663df82a | |
| | b3a5d9debc | |
| | 6452362cfe | |
| | ffb1778145 | |
| | 6a52c2f647 | |
| | a75edafdfb | |
| | 135ee0c6aa | |
| | c486cdcf24 | |
| | a2be175df9 | |
| | 22d46f75a5 | |
| | 6db2f9d620 | |
| | 40dd348839 | |
| | 80afcfeb13 | |
| | e182339f29 | |
| | 8719f2fb79 | |
| | 982891f23f | |
| | 22fd25fcc1 | |
| | bd3b2262c4 | |
| | a50969c680 | |
| | f6590715c1 | |
| | 49b84c2d7e | |
| | e4ef824621 | |
| | 6e365b15a7 | |
| | 54dcd44a5b | |
| | 90dfbb2f5d | |
| | 60d8923b16 | |
| | b23449306e | |
| | 015a34e3d4 | |
| | 3ed9864417 | |
| | 31be445cf0 | |
| | a2640e8f28 | |
| | 159e78bd84 | |
| | b63f94bd5a | |
| | 291035da82 | |
| | 0e6d1c5008 | |
| | 4dc43c14bd | |
| | 637deffcbe | |
| | 94c4c4c0f9 | |
| | 680857f0b6 | |
| | c952238b41 | |
| | b7432dcd93 | |
| | bd396fd221 | |
| | 8aef7cc546 | |
| | a81a86d2ff | |
| | b9fdbd2ccf | |
432 CM20315_Convolution_I.ipynb (Normal file)
@@ -0,0 +1,432 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyOdO9HZNZ/DwsTSc7M8PBTl",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Convolution_I.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Convolution I\n",
        "\n",
        "This notebook investigates the convolution operation. It asks you to hand-code a convolution so we can be sure that we are computing the same thing as PyTorch. The subsequent notebooks use the convolutional layers in PyTorch directly."
      ],
      "metadata": {
        "id": "VB_crnDGASX-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import torch\n",
        "# Set to print in reasonable form\n",
        "np.set_printoptions(precision=3, floatmode=\"fixed\")\n",
        "torch.set_printoptions(precision=3)"
      ],
      "metadata": {
        "id": "YAoWDUb_DezG"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "This routine performs convolution in PyTorch"
      ],
      "metadata": {
        "id": "eAwYWXzAElHG"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Perform convolution in PyTorch\n",
        "def conv_pytorch(image, conv_weights, stride=1, pad=1):\n",
        "    # Convert image and kernel to tensors\n",
        "    image_tensor = torch.from_numpy(image)  # (batchSize, channelsIn, imageHeightIn, imageWidthIn)\n",
        "    conv_weights_tensor = torch.from_numpy(conv_weights)  # (channelsOut, channelsIn, kernelHeight, kernelWidth)\n",
        "    # Do the convolution\n",
        "    output_tensor = torch.nn.functional.conv2d(image_tensor, conv_weights_tensor, stride=stride, padding=pad)\n",
        "    # Convert back from PyTorch and return\n",
        "    return output_tensor.numpy()  # (batchSize, channelsOut, imageHeightOut, imageWidthOut)"
      ],
      "metadata": {
        "id": "xsmUIN-3BlWr"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "First we'll start with the simplest 2D convolution: just one channel in, one channel out, and a single image in the batch."
      ],
      "metadata": {
        "id": "A3Sm8bUWtDNO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Perform convolution in numpy\n",
        "def conv_numpy_1(image, weights, pad=1):\n",
        "\n",
        "    # Perform zero padding\n",
        "    if pad != 0:\n",
        "        image = np.pad(image, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')\n",
        "\n",
        "    # Get sizes of image array and kernel weights\n",
        "    batchSize, channelsIn, imageHeightIn, imageWidthIn = image.shape\n",
        "    channelsOut, channelsIn, kernelHeight, kernelWidth = weights.shape\n",
        "\n",
        "    # Get size of output arrays\n",
        "    imageHeightOut = np.floor(1 + imageHeightIn - kernelHeight).astype(int)\n",
        "    imageWidthOut = np.floor(1 + imageWidthIn - kernelWidth).astype(int)\n",
        "\n",
        "    # Create output\n",
        "    out = np.zeros((batchSize, channelsOut, imageHeightOut, imageWidthOut), dtype=np.float32)\n",
        "\n",
        "    for c_y in range(imageHeightOut):\n",
        "        for c_x in range(imageWidthOut):\n",
        "            for c_kernel_y in range(kernelHeight):\n",
        "                for c_kernel_x in range(kernelWidth):\n",
        "                    # TODO -- Retrieve the image pixel and the weight from the convolution\n",
        "                    # Only one image in batch, one input channel and one output channel, so these indices should all be zero\n",
        "                    # Replace the two lines below\n",
        "                    this_pixel_value = 1.0\n",
        "                    this_weight = 1.0\n",
        "\n",
        "                    # Multiply these together and add to the output at this position\n",
        "                    out[0, 0, c_y, c_x] += np.sum(this_pixel_value * this_weight)\n",
        "\n",
        "    return out"
      ],
      "metadata": {
        "id": "EF8FWONVLo1Q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Set random seed so we always get same answer\n",
        "np.random.seed(1)\n",
        "n_batch = 1\n",
        "image_height = 4\n",
        "image_width = 6\n",
        "channels_in = 1\n",
        "kernel_size = 3\n",
        "channels_out = 1\n",
        "\n",
        "# Create random input image\n",
        "input_image = np.random.normal(size=(n_batch, channels_in, image_height, image_width))\n",
        "# Create random convolution kernel weights\n",
        "conv_weights = np.random.normal(size=(channels_out, channels_in, kernel_size, kernel_size))\n",
        "\n",
        "# Perform convolution using PyTorch\n",
        "conv_results_pytorch = conv_pytorch(input_image, conv_weights, stride=1, pad=1)\n",
        "print(\"PyTorch Results\")\n",
        "print(conv_results_pytorch)\n",
        "\n",
        "# Perform convolution in numpy\n",
        "print(\"Your results\")\n",
        "conv_results_numpy = conv_numpy_1(input_image, conv_weights)\n",
        "print(conv_results_numpy)"
      ],
      "metadata": {
        "id": "iw9KqXZTHN8v"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Let's now add in the possibility of using different strides"
      ],
      "metadata": {
        "id": "IYj_lxeGzaHX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Perform convolution in numpy\n",
        "def conv_numpy_2(image, weights, stride=1, pad=1):\n",
        "\n",
        "    # Perform zero padding\n",
        "    if pad != 0:\n",
        "        image = np.pad(image, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')\n",
        "\n",
        "    # Get sizes of image array and kernel weights\n",
        "    batchSize, channelsIn, imageHeightIn, imageWidthIn = image.shape\n",
        "    channelsOut, channelsIn, kernelHeight, kernelWidth = weights.shape\n",
        "\n",
        "    # Get size of output arrays\n",
        "    imageHeightOut = np.floor(1 + (imageHeightIn - kernelHeight) / stride).astype(int)\n",
        "    imageWidthOut = np.floor(1 + (imageWidthIn - kernelWidth) / stride).astype(int)\n",
        "\n",
        "    # Create output\n",
        "    out = np.zeros((batchSize, channelsOut, imageHeightOut, imageWidthOut), dtype=np.float32)\n",
        "\n",
        "    for c_y in range(imageHeightOut):\n",
        "        for c_x in range(imageWidthOut):\n",
        "            for c_kernel_y in range(kernelHeight):\n",
        "                for c_kernel_x in range(kernelWidth):\n",
        "                    # TODO -- Retrieve the image pixel and the weight from the convolution\n",
        "                    # Only one image in batch, one input channel and one output channel, so these indices should all be zero\n",
        "                    # Replace the two lines below\n",
        "                    this_pixel_value = 1.0\n",
        "                    this_weight = 1.0\n",
        "\n",
        "                    # Multiply these together and add to the output at this position\n",
        "                    out[0, 0, c_y, c_x] += np.sum(this_pixel_value * this_weight)\n",
        "\n",
        "    return out"
      ],
      "metadata": {
        "id": "GiujmLhqHN1F"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Set random seed so we always get same answer\n",
        "np.random.seed(1)\n",
        "n_batch = 1\n",
        "image_height = 12\n",
        "image_width = 10\n",
        "channels_in = 1\n",
        "kernel_size = 3\n",
        "channels_out = 1\n",
        "stride = 2\n",
        "\n",
        "# Create random input image\n",
        "input_image = np.random.normal(size=(n_batch, channels_in, image_height, image_width))\n",
        "# Create random convolution kernel weights\n",
        "conv_weights = np.random.normal(size=(channels_out, channels_in, kernel_size, kernel_size))\n",
        "\n",
        "# Perform convolution using PyTorch\n",
        "conv_results_pytorch = conv_pytorch(input_image, conv_weights, stride, pad=1)\n",
        "print(\"PyTorch Results\")\n",
        "print(conv_results_pytorch)\n",
        "\n",
        "# Perform convolution in numpy\n",
        "print(\"Your results\")\n",
        "conv_results_numpy = conv_numpy_2(input_image, conv_weights, stride, pad=1)\n",
        "print(conv_results_numpy)"
      ],
      "metadata": {
        "id": "FeJy6Bvozgxq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now we'll introduce multiple input and output channels"
      ],
      "metadata": {
        "id": "3flq1Wan2gX-"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Perform convolution in numpy\n",
        "def conv_numpy_3(image, weights, stride=1, pad=1):\n",
        "\n",
        "    # Perform zero padding\n",
        "    if pad != 0:\n",
        "        image = np.pad(image, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')\n",
        "\n",
        "    # Get sizes of image array and kernel weights\n",
        "    batchSize, channelsIn, imageHeightIn, imageWidthIn = image.shape\n",
        "    channelsOut, channelsIn, kernelHeight, kernelWidth = weights.shape\n",
        "\n",
        "    # Get size of output arrays\n",
        "    imageHeightOut = np.floor(1 + (imageHeightIn - kernelHeight) / stride).astype(int)\n",
        "    imageWidthOut = np.floor(1 + (imageWidthIn - kernelWidth) / stride).astype(int)\n",
        "\n",
        "    # Create output\n",
        "    out = np.zeros((batchSize, channelsOut, imageHeightOut, imageWidthOut), dtype=np.float32)\n",
        "\n",
        "    for c_y in range(imageHeightOut):\n",
        "        for c_x in range(imageWidthOut):\n",
        "            for c_channel_out in range(channelsOut):\n",
        "                for c_channel_in in range(channelsIn):\n",
        "                    for c_kernel_y in range(kernelHeight):\n",
        "                        for c_kernel_x in range(kernelWidth):\n",
        "                            # TODO -- Retrieve the image pixel and the weight from the convolution\n",
        "                            # Only one image in batch so this index should be zero\n",
        "                            # Replace the two lines below\n",
        "                            this_pixel_value = 1.0\n",
        "                            this_weight = 1.0\n",
        "\n",
        "                            # Multiply these together and add to the output at this position\n",
        "                            out[0, c_channel_out, c_y, c_x] += np.sum(this_pixel_value * this_weight)\n",
        "    return out"
      ],
      "metadata": {
        "id": "AvdRWGiU2ppX"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Set random seed so we always get same answer\n",
        "np.random.seed(1)\n",
        "n_batch = 1\n",
        "image_height = 4\n",
        "image_width = 6\n",
        "channels_in = 5\n",
        "kernel_size = 3\n",
        "channels_out = 2\n",
        "\n",
        "# Create random input image\n",
        "input_image = np.random.normal(size=(n_batch, channels_in, image_height, image_width))\n",
        "# Create random convolution kernel weights\n",
        "conv_weights = np.random.normal(size=(channels_out, channels_in, kernel_size, kernel_size))\n",
        "\n",
        "# Perform convolution using PyTorch\n",
        "conv_results_pytorch = conv_pytorch(input_image, conv_weights, stride=1, pad=1)\n",
        "print(\"PyTorch Results\")\n",
        "print(conv_results_pytorch)\n",
        "\n",
        "# Perform convolution in numpy\n",
        "print(\"Your results\")\n",
        "conv_results_numpy = conv_numpy_3(input_image, conv_weights, stride=1, pad=1)\n",
        "print(conv_results_numpy)"
      ],
      "metadata": {
        "id": "mdSmjfvY4li2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now we'll do the full convolution with multiple images in the batch (batch size > 1), multiple input channels, and multiple output channels."
      ],
      "metadata": {
        "id": "Q2MUFebdsJbH"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Perform convolution in numpy\n",
        "def conv_numpy_4(image, weights, stride=1, pad=1):\n",
        "\n",
        "    # Perform zero padding\n",
        "    if pad != 0:\n",
        "        image = np.pad(image, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')\n",
        "\n",
        "    # Get sizes of image array and kernel weights\n",
        "    batchSize, channelsIn, imageHeightIn, imageWidthIn = image.shape\n",
        "    channelsOut, channelsIn, kernelHeight, kernelWidth = weights.shape\n",
        "\n",
        "    # Get size of output arrays\n",
        "    imageHeightOut = np.floor(1 + (imageHeightIn - kernelHeight) / stride).astype(int)\n",
        "    imageWidthOut = np.floor(1 + (imageWidthIn - kernelWidth) / stride).astype(int)\n",
        "\n",
        "    # Create output\n",
        "    out = np.zeros((batchSize, channelsOut, imageHeightOut, imageWidthOut), dtype=np.float32)\n",
        "\n",
        "    for c_batch in range(batchSize):\n",
        "        for c_y in range(imageHeightOut):\n",
        "            for c_x in range(imageWidthOut):\n",
        "                for c_channel_out in range(channelsOut):\n",
        "                    for c_channel_in in range(channelsIn):\n",
        "                        for c_kernel_y in range(kernelHeight):\n",
        "                            for c_kernel_x in range(kernelWidth):\n",
        "                                # TODO -- Retrieve the image pixel and the weight from the convolution\n",
        "                                # Replace the two lines below\n",
        "                                this_pixel_value = 1.0\n",
        "                                this_weight = 1.0\n",
        "\n",
        "                                # Multiply these together and add to the output at this position\n",
        "                                out[c_batch, c_channel_out, c_y, c_x] += np.sum(this_pixel_value * this_weight)\n",
        "    return out"
      ],
      "metadata": {
        "id": "5WePF-Y-sC1y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "1w2GEBtqAM2P"
      },
      "outputs": [],
      "source": [
        "# Set random seed so we always get same answer\n",
        "np.random.seed(1)\n",
        "n_batch = 2\n",
        "image_height = 4\n",
        "image_width = 6\n",
        "channels_in = 5\n",
        "kernel_size = 3\n",
        "channels_out = 2\n",
        "\n",
        "# Create random input image\n",
        "input_image = np.random.normal(size=(n_batch, channels_in, image_height, image_width))\n",
        "# Create random convolution kernel weights\n",
        "conv_weights = np.random.normal(size=(channels_out, channels_in, kernel_size, kernel_size))\n",
        "\n",
        "# Perform convolution using PyTorch\n",
        "conv_results_pytorch = conv_pytorch(input_image, conv_weights, stride=1, pad=1)\n",
        "print(\"PyTorch Results\")\n",
        "print(conv_results_pytorch)\n",
        "\n",
        "# Perform convolution in numpy\n",
        "print(\"Your results\")\n",
        "conv_results_numpy = conv_numpy_4(input_image, conv_weights, stride=1, pad=1)\n",
        "print(conv_results_numpy)"
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "Lody75JB5By7"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
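For reference, the TODO in `conv_numpy_1` asks for two indexing lines. One possible completion (my own sketch, not the official solution) retrieves the pixel from the padded image at the output position offset by the kernel position, and the weight from the kernel itself:

```python
import numpy as np

def conv_numpy_1(image, weights, pad=1):
    # Zero-pad the height and width dimensions only
    if pad != 0:
        image = np.pad(image, ((0, 0), (0, 0), (pad, pad), (pad, pad)), 'constant')
    batchSize, channelsIn, imageHeightIn, imageWidthIn = image.shape
    channelsOut, channelsIn, kernelHeight, kernelWidth = weights.shape
    imageHeightOut = 1 + imageHeightIn - kernelHeight
    imageWidthOut = 1 + imageWidthIn - kernelWidth
    out = np.zeros((batchSize, channelsOut, imageHeightOut, imageWidthOut), dtype=np.float32)
    for c_y in range(imageHeightOut):
        for c_x in range(imageWidthOut):
            for c_kernel_y in range(kernelHeight):
                for c_kernel_x in range(kernelWidth):
                    # Candidate answer for the TODO: index the padded image at the
                    # output position offset by the kernel position (batch/channel
                    # indices are all zero in this simplest case)
                    this_pixel_value = image[0, 0, c_y + c_kernel_y, c_x + c_kernel_x]
                    this_weight = weights[0, 0, c_kernel_y, c_kernel_x]
                    out[0, 0, c_y, c_x] += this_pixel_value * this_weight
    return out

# Quick check: an all-ones 3x3 image convolved with an all-ones 3x3 kernel,
# pad=1, gives 9 at the centre and 4 at the corners
out = conv_numpy_1(np.ones((1, 1, 3, 3)), np.ones((1, 1, 3, 3)), pad=1)
print(out[0, 0])
```

The same two indexing lines extend to the later functions by replacing the zero batch/channel indices with the corresponding loop variables (and `c_y * stride + c_kernel_y` once strides are introduced).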
253 CM20315_Convolution_II.ipynb (Normal file)
@@ -0,0 +1,253 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyN4fpyg0d75XccLLsNahur1",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Convolution_II.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Convolution II -- MNIST1D\n",
        "\n",
        "This notebook investigates what happens when we use convolutional layers instead of fully-connected layers for the MNIST-1D dataset from the coursework.\n",
        "\n",
        "We'll build the network from figure 10.7 in the notes.\n",
        "\n"
      ],
      "metadata": {
        "id": "t9vk9Elugvmi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import os\n",
        "import torch, torch.nn as nn\n",
        "from torch.utils.data import TensorDataset, DataLoader\n",
        "from torch.optim.lr_scheduler import StepLR\n",
        "import matplotlib.pyplot as plt\n",
        "import random"
      ],
      "metadata": {
        "id": "YrXWAH7sUWvU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Run this once to copy the train and validation data to your Colab environment,\n",
        "# or download from my github to your local machine if you are working locally\n",
        "if not os.path.exists('./train_data_x.npy'):\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/train_data_x.npy\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/train_data_y.npy\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/val_data_x.npy\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/val_data_y.npy"
      ],
      "metadata": {
        "id": "wScBGXXFVadm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load in the data\n",
        "train_data_x = np.load('train_data_x.npy')\n",
        "train_data_y = np.load('train_data_y.npy')\n",
        "val_data_x = np.load('val_data_x.npy')\n",
        "val_data_y = np.load('val_data_y.npy')\n",
        "# Print out sizes\n",
        "print(\"Train data: %d examples (columns), each of which has %d dimensions (rows)\"%((train_data_x.shape[1], train_data_x.shape[0])))\n",
        "print(\"Validation data: %d examples (columns), each of which has %d dimensions (rows)\"%((val_data_x.shape[1], val_data_x.shape[0])))"
      ],
      "metadata": {
        "id": "8bKADvLHbiV5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Define the network"
      ],
      "metadata": {
        "id": "_sFvRDGrl4qe"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# TODO Create a model with the following layers\n",
        "# 1. Convolutional layer, (input=length 40 and 1 channel, kernel size 3, stride 2, padding=\"valid\", 15 output channels)\n",
        "# 2. ReLU\n",
        "# 3. Convolutional layer, (input=length 19 and 15 channels, kernel size 3, stride 2, padding=\"valid\", 15 output channels)\n",
        "# 4. ReLU\n",
        "# 5. Convolutional layer, (input=length 9 and 15 channels, kernel size 3, stride 2, padding=\"valid\", 15 output channels)\n",
        "# 6. ReLU\n",
        "# 7. Flatten (converts 15 channels x 4 positions to length 60)\n",
        "# 8. Linear layer (input size = 60, output size = 10)\n",
        "# References:\n",
        "# https://pytorch.org/docs/1.13/generated/torch.nn.Conv1d.html?highlight=conv1d#torch.nn.Conv1d\n",
        "# https://pytorch.org/docs/stable/generated/torch.nn.Flatten.html\n",
        "# https://pytorch.org/docs/1.13/generated/torch.nn.Linear.html?highlight=linear#torch.nn.Linear\n",
        "\n",
        "# Replace the following function, which just runs a standard fully-connected network\n",
        "# The flatten at the beginning is because we are passing in the data in a slightly different format.\n",
        "model = nn.Sequential(\n",
        "    nn.Flatten(),\n",
        "    nn.Linear(40, 100),\n",
        "    nn.ReLU(),\n",
        "    nn.Linear(100, 100),\n",
        "    nn.ReLU(),\n",
        "    nn.Linear(100, 10))"
      ],
      "metadata": {
        "id": "FslroPJJffrh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# He initialization of weights\n",
        "def weights_init(layer_in):\n",
        "    if isinstance(layer_in, nn.Linear):\n",
        "        nn.init.kaiming_uniform_(layer_in.weight)\n",
        "        layer_in.bias.data.fill_(0.0)"
      ],
      "metadata": {
        "id": "YgLaex1pfhqz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# You need all this stuff to ensure that PyTorch is deterministic\n",
        "def set_seed(seed):\n",
        "    torch.manual_seed(seed)\n",
        "    torch.cuda.manual_seed_all(seed)\n",
        "    torch.backends.cudnn.deterministic = True\n",
        "    torch.backends.cudnn.benchmark = False\n",
        "    np.random.seed(seed)\n",
        "    random.seed(seed)\n",
        "    os.environ['PYTHONHASHSEED'] = str(seed)"
      ],
      "metadata": {
        "id": "zXRmxCQNnL_M"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Set seed so we always get the same result (do not change)\n",
        "set_seed(1)\n",
        "\n",
        "# choose cross entropy loss function (equation 5.24 in the loss notes)\n",
        "loss_function = nn.CrossEntropyLoss()\n",
        "# construct SGD optimizer and initialize learning rate and momentum\n",
        "optimizer = torch.optim.SGD(model.parameters(), lr=0.05, momentum=0.9)\n",
        "# object that decreases learning rate by half every 10 epochs\n",
        "scheduler = StepLR(optimizer, step_size=10, gamma=0.5)\n",
        "# convert the training and validation data to tensors\n",
        "x_train = torch.tensor(train_data_x.transpose().astype('float32'))\n",
        "y_train = torch.tensor(train_data_y.astype('long'))\n",
        "x_val = torch.tensor(val_data_x.transpose().astype('float32'))\n",
        "y_val = torch.tensor(val_data_y.astype('long'))\n",
        "\n",
        "# load the data into a class that creates the batches\n",
        "data_loader = DataLoader(TensorDataset(x_train, y_train), batch_size=100, shuffle=True, worker_init_fn=np.random.seed(1))\n",
        "\n",
        "# Initialize model weights\n",
        "model.apply(weights_init)\n",
        "\n",
        "# loop over the dataset n_epoch times\n",
        "n_epoch = 50\n",
        "# store the loss and the % correct at each epoch\n",
        "losses_train = np.zeros((n_epoch))\n",
        "errors_train = np.zeros((n_epoch))\n",
        "losses_val = np.zeros((n_epoch))\n",
        "errors_val = np.zeros((n_epoch))\n",
        "\n",
        "for epoch in range(n_epoch):\n",
        "    # loop over batches\n",
        "    for i, data in enumerate(data_loader):\n",
        "        # retrieve inputs and labels for this batch\n",
        "        x_batch, y_batch = data\n",
        "        # zero the parameter gradients\n",
        "        optimizer.zero_grad()\n",
        "        # forward pass -- calculate model output\n",
        "        pred = model(x_batch[:, None, :])\n",
        "        # compute the loss\n",
        "        loss = loss_function(pred, y_batch)\n",
        "        # backward pass\n",
        "        loss.backward()\n",
        "        # SGD update\n",
        "        optimizer.step()\n",
        "\n",
        "    # Run whole dataset to get statistics -- normally wouldn't do this\n",
        "    pred_train = model(x_train[:, None, :])\n",
        "    pred_val = model(x_val[:, None, :])\n",
        "    _, predicted_train_class = torch.max(pred_train.data, 1)\n",
        "    _, predicted_val_class = torch.max(pred_val.data, 1)\n",
        "    errors_train[epoch] = 100 - 100 * (predicted_train_class == y_train).float().sum() / len(y_train)\n",
        "    errors_val[epoch] = 100 - 100 * (predicted_val_class == y_val).float().sum() / len(y_val)\n",
        "    losses_train[epoch] = loss_function(pred_train, y_train).item()\n",
        "    losses_val[epoch] = loss_function(pred_val, y_val).item()\n",
        "    print(f'Epoch {epoch:5d}, train loss {losses_train[epoch]:.6f}, train error {errors_train[epoch]:3.2f}, val loss {losses_val[epoch]:.6f}, percent error {errors_val[epoch]:3.2f}')\n",
        "\n",
        "    # tell scheduler to consider updating learning rate\n",
        "    scheduler.step()\n",
        "\n",
        "# Plot the results\n",
        "fig, ax = plt.subplots()\n",
        "ax.plot(errors_train, 'r-', label='train')\n",
        "ax.plot(errors_val, 'b-', label='validation')\n",
        "ax.set_ylim(0, 100); ax.set_xlim(0, n_epoch)\n",
        "ax.set_xlabel('Epoch'); ax.set_ylabel('Error')\n",
        "ax.set_title('Part I: Validation Result %3.2f'%(errors_val[-1]))\n",
        "ax.legend()\n",
        "ax.plot([0, n_epoch], [37.45, 37.45], 'k:') # Original results. You should be better than this!\n",
        "plt.savefig('Coursework_I_Results.png', format='png')\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "NYw8I_3mmX5c"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
648 CM20315_Convolution_III.ipynb (Normal file)
File diff suppressed because one or more lines are too long
447 CM20315_Coursework_I.ipynb (Normal file)
File diff suppressed because one or more lines are too long
292 CM20315_Coursework_II.ipynb (Normal file)
@@ -0,0 +1,292 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyMaHUorNXKELJbeWcOVBYrr",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Coursework_II.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Coursework II -- Training hyperparameters\n",
        "\n",
        "The goal of the coursework is to modify a simple piece of code that trains a network and measures the performance on a validation set for the MNIST-1D dataset.\n",
        "\n",
        "In this coursework, you need to modify the **training hyperparameters** (only) to improve the performance over the current attempt. This could mean the training algorithm, learning rate, learning rate schedule, momentum term, initialization, etc.\n",
        "\n",
        "You don't have to improve the performance much. A few tenths of a percent is fine. It just has to be better to get full marks.\n",
        "\n",
        "You will need to upload three things to Moodle:\n",
        "1. The image that this notebook saves (click the folder icon on the left in Colab to download it)\n",
        "2. The lines of code you changed\n",
        "3. The whole notebook as a .ipynb file. You can do this from the File menu\n",
        "\n",
        "\n"
      ],
      "metadata": {
        "id": "t9vk9Elugvmi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import numpy as np\n",
        "import os\n",
        "import torch, torch.nn as nn\n",
        "from torch.utils.data import TensorDataset, DataLoader\n",
        "from torch.optim.lr_scheduler import StepLR\n",
        "import matplotlib.pyplot as plt\n",
        "import random"
      ],
      "metadata": {
        "id": "YrXWAH7sUWvU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Run this once to copy the train and validation data to your Colab environment,\n",
        "# or download from my github to your local machine if you are working locally\n",
        "if not os.path.exists('./train_data_x.npy'):\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/train_data_x.npy\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/train_data_y.npy\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/val_data_x.npy\n",
        "    !wget https://github.com/udlbook/udlbook/raw/main/practicals/val_data_y.npy"
      ],
      "metadata": {
        "id": "wScBGXXFVadm"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load in the data\n",
        "train_data_x = np.load('train_data_x.npy')\n",
        "train_data_y = np.load('train_data_y.npy')\n",
        "val_data_x = np.load('val_data_x.npy')\n",
        "val_data_y = np.load('val_data_y.npy')\n",
        "# Print out sizes\n",
        "print(\"Train data: %d examples (columns), each of which has %d dimensions (rows)\"%((train_data_x.shape[1], train_data_x.shape[0])))\n",
        "print(\"Validation data: %d examples (columns), each of which has %d dimensions (rows)\"%((val_data_x.shape[1], val_data_x.shape[0])))"
      ],
      "metadata": {
        "id": "8bKADvLHbiV5"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Define the network"
      ],
      "metadata": {
        "id": "_sFvRDGrl4qe"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# YOU SHOULD NOT CHANGE THIS CELL!\n",
        "\n",
        "# There are 40 input dimensions and 10 output dimensions for this data\n",
        "# The inputs correspond to the 40 offsets in the MNIST1D template.\n",
        "D_i = 40\n",
        "# The outputs correspond to the 10 digits\n",
        "D_o = 10\n",
        "\n",
        "# Number of hidden units in layers 1 and 2\n",
        "D_1 = 100\n",
        "D_2 = 100\n",
        "\n",
        "# create model with two hidden layers\n",
        "model = nn.Sequential(\n",
        "    nn.Linear(D_i, D_1),\n",
        "    nn.ReLU(),\n",
        "    nn.Linear(D_1, D_2),\n",
        "    nn.ReLU(),\n",
        "    nn.Linear(D_2, D_o))"
      ],
      "metadata": {
        "id": "FslroPJJffrh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# He initialization of weights\n",
        "def weights_init(layer_in):\n",
        "    if isinstance(layer_in, nn.Linear):\n",
        "        nn.init.kaiming_uniform_(layer_in.weight)\n",
        "        layer_in.bias.data.fill_(0.0)"
      ],
      "metadata": {
        "id": "YgLaex1pfhqz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# You need all this stuff to ensure that PyTorch is deterministic\n",
        "def set_seed(seed):\n",
        "    torch.manual_seed(seed)\n",
        "    torch.cuda.manual_seed_all(seed)\n",
        "    torch.backends.cudnn.deterministic = True\n",
        "    torch.backends.cudnn.benchmark = False\n",
        "    np.random.seed(seed)\n",
        "    random.seed(seed)\n",
        "    os.environ['PYTHONHASHSEED'] = str(seed)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "zXRmxCQNnL_M"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Set seed so always get same result (do not change)\n",
|
||||
"set_seed(1)\n",
|
||||
"\n",
|
||||
"# choose cross entropy loss function (equation 5.24 in the loss notes)\n",
|
||||
"loss_function = nn.CrossEntropyLoss()\n",
|
||||
"# construct SGD optimizer and initialize learning rate and momentum\n",
|
||||
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.05, momentum=0.9)\n",
|
||||
"# object that decreases learning rate by half every 10 epochs\n",
|
||||
"scheduler = StepLR(optimizer, step_size=10, gamma=0.5)\n",
|
||||
"# create 100 dummy data points and store in data loader class\n",
|
||||
"x_train = torch.tensor(train_data_x.transpose().astype('float32'))\n",
|
||||
"y_train = torch.tensor(train_data_y.astype('long'))\n",
|
||||
"x_val= torch.tensor(val_data_x.transpose().astype('float32'))\n",
|
||||
"y_val = torch.tensor(val_data_y.astype('long'))\n",
|
||||
"\n",
|
||||
"# load the data into a class that creates the batches\n",
|
||||
"data_loader = DataLoader(TensorDataset(x_train,y_train), batch_size=100, shuffle=True, worker_init_fn=np.random.seed(1))\n",
|
||||
"\n",
|
||||
"# Initialize model weights\n",
|
||||
"model.apply(weights_init)\n",
|
||||
"\n",
|
||||
"# loop over the dataset n_epoch times\n",
|
||||
"n_epoch = 50\n",
|
||||
"# store the loss and the % correct at each epoch\n",
|
||||
"losses_train = np.zeros((n_epoch))\n",
|
||||
"errors_train = np.zeros((n_epoch))\n",
|
||||
"losses_val = np.zeros((n_epoch))\n",
|
||||
"errors_val = np.zeros((n_epoch))\n",
|
||||
"\n",
|
||||
"for epoch in range(n_epoch):\n",
|
||||
" # loop over batches\n",
|
||||
" for i, data in enumerate(data_loader):\n",
|
||||
" # retrieve inputs and labels for this batch\n",
|
||||
" x_batch, y_batch = data\n",
|
||||
" # zero the parameter gradients\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" # forward pass -- calculate model output\n",
|
||||
" pred = model(x_batch)\n",
|
||||
" # compute the lss\n",
|
||||
" loss = loss_function(pred, y_batch)\n",
|
||||
" # backward pass\n",
|
||||
" loss.backward()\n",
|
||||
" # SGD update\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
" # Run whole dataset to get statistics -- normally wouldn't do this\n",
|
||||
" pred_train = model(x_train)\n",
|
||||
" pred_val = model(x_val)\n",
|
||||
" _, predicted_train_class = torch.max(pred_train.data, 1)\n",
|
||||
" _, predicted_val_class = torch.max(pred_val.data, 1)\n",
|
||||
" errors_train[epoch] = 100 - 100 * (predicted_train_class == y_train).float().sum() / len(y_train)\n",
|
||||
" errors_val[epoch]= 100 - 100 * (predicted_val_class == y_val).float().sum() / len(y_val)\n",
|
||||
" losses_train[epoch] = loss_function(pred_train, y_train).item()\n",
|
||||
" losses_val[epoch]= loss_function(pred_val, y_val).item()\n",
|
||||
" print(f'Epoch {epoch:5d}, train loss {losses_train[epoch]:.6f}, train error {errors_train[epoch]:3.2f}, val loss {losses_val[epoch]:.6f}, percent error {errors_val[epoch]:3.2f}')\n",
|
||||
" \n",
|
||||
" # tell scheduler to consider updating learning rate\n",
|
||||
" scheduler.step()\n",
|
||||
"\n",
|
||||
"# Plot the results\n",
|
||||
"fig, ax = plt.subplots()\n",
|
||||
"ax.plot(errors_train,'r-',label='train')\n",
|
||||
"ax.plot(errors_val,'b-',label='validation')\n",
|
||||
"ax.set_ylim(0,100); ax.set_xlim(0,n_epoch)\n",
|
||||
"ax.set_xlabel('Epoch'); ax.set_ylabel('Error')\n",
|
||||
"ax.set_title('Part II: Validation Result %3.2f'%(errors_val[-1]))\n",
|
||||
"ax.legend()\n",
|
||||
"ax.plot([0,n_epoch],[37.45, 37.45],'k:') # Original results. You should be better than this!\n",
|
||||
"plt.savefig('Coursework_II_Results.png',format='png')\n",
|
||||
"plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "NYw8I_3mmX5c"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Leave this all commented for now\n",
|
||||
"# We'll see how well you did on the test data after the coursework is submitted\n",
|
||||
"\n",
|
||||
"# if not os.path.exists('./test_data_x.npy'):\n",
|
||||
"# !wget https://github.com/udlbook/udlbook/raw/main/practicals/test_data_x.npy\n",
|
||||
"# !wget https://github.com/udlbook/udlbook/raw/main/practicals/test_data_y.npy\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# # I haven't given you this yet, leave commented\n",
|
||||
"# test_data_x = np.load('test_data_x.npy')\n",
|
||||
"# test_data_y = np.load('test_data_y.npy')\n",
|
||||
"# x_test = torch.tensor(test_data_x.transpose().astype('float32'))\n",
|
||||
"# y_test = torch.tensor(test_data_y.astype('long'))\n",
|
||||
"# pred_test = model(x_test)\n",
|
||||
"# _, predicted_test_class = torch.max(pred_test.data, 1)\n",
|
||||
"# errors_test = 100 - 100 * (predicted_test_class == y_test).float().sum() / len(y_test)\n",
|
||||
"# print(\"Test error = %3.3f\"%(errors_test))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "O7nBz-R84QdJ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "zXccksoD1Eww"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
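The training loop above measures performance as percent error: take the highest-scoring class from the logits and compare with the labels. As a minimal standalone sketch of that bookkeeping (plain numpy, with made-up logits rather than the coursework data):

```python
import numpy as np

def percent_error(logits, labels):
    # Highest-scoring class per row, playing the role of torch.max(pred.data, 1)
    predicted_class = np.argmax(logits, axis=1)
    # Percent error is 100 minus percent accuracy
    return 100.0 - 100.0 * np.mean(predicted_class == labels)

# Illustrative logits for 3 examples and 2 classes (not real data)
logits = np.array([[2.0, 0.1],
                   [0.2, 1.5],
                   [3.0, 0.5]])
labels = np.array([0, 1, 1])  # third example is misclassified
print(percent_error(logits, labels))  # one of three wrong -> 33.33...
```

The torch version in the notebook computes exactly this quantity, just on tensors and over the whole training and validation sets.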
290
CM20315_Coursework_III.ipynb
Normal file
@@ -0,0 +1,290 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMcvUAtcMw0yuDfUkbsEDLD",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Coursework_III.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Coursework III -- Regularization\n",
"\n",
"The goal of the coursework is to modify a simple bit of PyTorch code that trains a network and measures the performance on a validation set for the MNIST1D dataset.\n",
"\n",
"In this coursework, you need to add **regularization** of some kind to improve the performance. Anything from chapter 8 of the book or anything else you can find is fine *except* early stopping. You must not change the model hyperparameters or the training algorithm.\n",
"\n",
"You don't have to improve the performance much. A few tenths of a percent is fine. It just has to be better to get full marks.\n",
"\n",
"You will need to upload three things to Moodle:\n",
"1. The image that this notebook saves (click the folder icon on the left in Colab to download it)\n",
"2. The lines of code you changed\n",
"3. The whole notebook as a .ipynb file. You can do this on the File menu\n",
"\n",
"\n"
],
"metadata": {
"id": "t9vk9Elugvmi"
}
},
{
"cell_type": "code",
"source": [
"import numpy as np\n",
"import os\n",
"import torch, torch.nn as nn\n",
"from torch.utils.data import TensorDataset, DataLoader\n",
"from torch.optim.lr_scheduler import StepLR\n",
"import matplotlib.pyplot as plt\n",
"import random"
],
"metadata": {
"id": "YrXWAH7sUWvU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Run this once to copy the train and validation data to your Colab environment,\n",
"# or download it from my GitHub to your local machine if you are working locally\n",
"if not os.path.exists('./train_data_x.npy'):\n",
"  !wget https://github.com/udlbook/udlbook/raw/main/practicals/train_data_x.npy\n",
"  !wget https://github.com/udlbook/udlbook/raw/main/practicals/train_data_y.npy\n",
"  !wget https://github.com/udlbook/udlbook/raw/main/practicals/val_data_x.npy\n",
"  !wget https://github.com/udlbook/udlbook/raw/main/practicals/val_data_y.npy"
],
"metadata": {
"id": "wScBGXXFVadm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Load in the data\n",
"train_data_x = np.load('train_data_x.npy')\n",
"train_data_y = np.load('train_data_y.npy')\n",
"val_data_x = np.load('val_data_x.npy')\n",
"val_data_y = np.load('val_data_y.npy')\n",
"# Print out sizes\n",
"print(\"Train data: %d examples (columns), each of which has %d dimensions (rows)\"%((train_data_x.shape[1],train_data_x.shape[0])))\n",
"print(\"Validation data: %d examples (columns), each of which has %d dimensions (rows)\"%((val_data_x.shape[1],val_data_x.shape[0])))"
],
"metadata": {
"id": "8bKADvLHbiV5"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Define the network"
],
"metadata": {
"id": "_sFvRDGrl4qe"
}
},
{
"cell_type": "code",
"source": [
"# There are 40 input dimensions and 10 output dimensions for this data\n",
"# The inputs correspond to the 40 offsets in the MNIST1D template.\n",
"D_i = 40\n",
"# The outputs correspond to the 10 digits\n",
"D_o = 10\n",
"\n",
"# Number of hidden units in layers 1 and 2\n",
"D_1 = 100\n",
"D_2 = 100\n",
"\n",
"# Create model with two hidden layers\n",
"model = nn.Sequential(\n",
"  nn.Linear(D_i, D_1),\n",
"  nn.ReLU(),\n",
"  nn.Linear(D_1, D_2),\n",
"  nn.ReLU(),\n",
"  nn.Linear(D_2, D_o))"
],
"metadata": {
"id": "FslroPJJffrh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# He initialization of weights\n",
"def weights_init(layer_in):\n",
"  if isinstance(layer_in, nn.Linear):\n",
"    nn.init.kaiming_uniform_(layer_in.weight)\n",
"    layer_in.bias.data.fill_(0.0)"
],
"metadata": {
"id": "YgLaex1pfhqz"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# You need all this stuff to ensure that PyTorch is deterministic\n",
"def set_seed(seed):\n",
"  torch.manual_seed(seed)\n",
"  torch.cuda.manual_seed_all(seed)\n",
"  torch.backends.cudnn.deterministic = True\n",
"  torch.backends.cudnn.benchmark = False\n",
"  np.random.seed(seed)\n",
"  random.seed(seed)\n",
"  os.environ['PYTHONHASHSEED'] = str(seed)"
],
"metadata": {
"id": "zXRmxCQNnL_M"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Set seed so we always get the same result (do not change)\n",
"set_seed(1)\n",
"\n",
"# choose cross entropy loss function (equation 5.24 in the loss notes)\n",
"loss_function = nn.CrossEntropyLoss()\n",
"# construct SGD optimizer and initialize learning rate and momentum\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = 0.05, momentum=0.9)\n",
"# object that decreases learning rate by half every 10 epochs\n",
"scheduler = StepLR(optimizer, step_size=10, gamma=0.5)\n",
"# convert the train and validation data to tensors\n",
"x_train = torch.tensor(train_data_x.transpose().astype('float32'))\n",
"y_train = torch.tensor(train_data_y.astype('long'))\n",
"x_val = torch.tensor(val_data_x.transpose().astype('float32'))\n",
"y_val = torch.tensor(val_data_y.astype('long'))\n",
"\n",
"# load the data into a class that creates the batches\n",
"data_loader = DataLoader(TensorDataset(x_train,y_train), batch_size=100, shuffle=True, worker_init_fn=np.random.seed(1))\n",
"\n",
"# Initialize model weights\n",
"model.apply(weights_init)\n",
"\n",
"# loop over the dataset n_epoch times\n",
"n_epoch = 50\n",
"# store the loss and the % error at each epoch\n",
"losses_train = np.zeros((n_epoch))\n",
"errors_train = np.zeros((n_epoch))\n",
"losses_val = np.zeros((n_epoch))\n",
"errors_val = np.zeros((n_epoch))\n",
"\n",
"for epoch in range(n_epoch):\n",
"  # loop over batches\n",
"  for i, data in enumerate(data_loader):\n",
"    # retrieve inputs and labels for this batch\n",
"    x_batch, y_batch = data\n",
"    # zero the parameter gradients\n",
"    optimizer.zero_grad()\n",
"    # forward pass -- calculate model output\n",
"    pred = model(x_batch)\n",
"    # compute the loss\n",
"    loss = loss_function(pred, y_batch)\n",
"    # backward pass\n",
"    loss.backward()\n",
"    # SGD update\n",
"    optimizer.step()\n",
"\n",
"  # Run whole dataset to get statistics -- normally wouldn't do this\n",
"  pred_train = model(x_train)\n",
"  pred_val = model(x_val)\n",
"  _, predicted_train_class = torch.max(pred_train.data, 1)\n",
"  _, predicted_val_class = torch.max(pred_val.data, 1)\n",
"  errors_train[epoch] = 100 - 100 * (predicted_train_class == y_train).float().sum() / len(y_train)\n",
"  errors_val[epoch] = 100 - 100 * (predicted_val_class == y_val).float().sum() / len(y_val)\n",
"  losses_train[epoch] = loss_function(pred_train, y_train).item()\n",
"  losses_val[epoch] = loss_function(pred_val, y_val).item()\n",
"  print(f'Epoch {epoch:5d}, train loss {losses_train[epoch]:.6f}, train error {errors_train[epoch]:3.2f}, val loss {losses_val[epoch]:.6f}, val error {errors_val[epoch]:3.2f}')\n",
"\n",
"  # tell scheduler to consider updating learning rate\n",
"  scheduler.step()\n",
"\n",
"# Plot the results\n",
"fig, ax = plt.subplots()\n",
"ax.plot(errors_train,'r-',label='train')\n",
"ax.plot(errors_val,'b-',label='validation')\n",
"ax.set_ylim(0,100); ax.set_xlim(0,n_epoch)\n",
"ax.set_xlabel('Epoch'); ax.set_ylabel('Error')\n",
"ax.set_title('Part III: Validation Result %3.2f'%(errors_val[-1]))\n",
"ax.legend()\n",
"ax.plot([0,n_epoch],[37.45, 37.45],'k:') # Original results. You should be better than this!\n",
"plt.savefig('Coursework_III_Results.png',format='png')\n",
"plt.show()"
],
"metadata": {
"id": "NYw8I_3mmX5c"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Leave this all commented for now\n",
"# We'll see how well you did on the test data after the coursework is submitted\n",
"\n",
"# if not os.path.exists('./test_data_x.npy'):\n",
"#   !wget https://github.com/udlbook/udlbook/raw/main/practicals/test_data_x.npy\n",
"#   !wget https://github.com/udlbook/udlbook/raw/main/practicals/test_data_y.npy\n",
"\n",
"\n",
"# # I haven't given you this yet, leave commented\n",
"# test_data_x = np.load('test_data_x.npy')\n",
"# test_data_y = np.load('test_data_y.npy')\n",
"# x_test = torch.tensor(test_data_x.transpose().astype('float32'))\n",
"# y_test = torch.tensor(test_data_y.astype('long'))\n",
"# pred_test = model(x_test)\n",
"# _, predicted_test_class = torch.max(pred_test.data, 1)\n",
"# errors_test = 100 - 100 * (predicted_test_class == y_test).float().sum() / len(y_test)\n",
"# print(\"Test error = %3.3f\"%(errors_test))"
],
"metadata": {
"id": "O7nBz-R84QdJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "zXccksoD1Eww"
},
"execution_count": null,
"outputs": []
}
]
}
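Coursework III asks for regularization of some kind from chapter 8. One standard option is an L2 penalty on the weights. The sketch below (plain numpy, with an illustrative weight vector and strength `lam` that are not part of the coursework code) shows the extra loss term and verifies numerically that its gradient is the familiar "weight decay" term 2·λ·w:

```python
import numpy as np

def l2_penalty(w, lam):
    # Extra term added to the loss: lam times the sum of squared weights
    return lam * np.sum(w ** 2)

def l2_penalty_grad(w, lam):
    # Analytic gradient of the penalty, the "weight decay" contribution
    return 2.0 * lam * w

w = np.array([1.0, -2.0, 0.5])  # illustrative weights
lam = 0.01                      # illustrative regularization strength

# Finite-difference check of the gradient with respect to the first weight
eps = 1e-6
w_plus = w.copy()
w_plus[0] += eps
numeric_grad = (l2_penalty(w_plus, lam) - l2_penalty(w, lam)) / eps
print(numeric_grad, l2_penalty_grad(w, lam)[0])  # both close to 0.02
```

In PyTorch the same effect can be obtained by adding the penalty to the loss before calling `loss.backward()`, or via the `weight_decay` argument of `torch.optim.SGD`; whether the latter counts as "changing the training algorithm" here is a judgment call for the coursework.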
222
CM20315_Coursework_IV.ipynb
Normal file
@@ -0,0 +1,222 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyNb1nfymw3lpvyBHaCFRvMI",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Coursework_IV.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Coursework IV\n",
"\n",
"This coursework explores the geometry of high-dimensional spaces. It doesn't behave how you would expect and all your intuitions are wrong! You will write code and it will give you three numerical answers that you need to type into Moodle."
],
"metadata": {
"id": "EjLK-kA1KnYX"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4ESMmnkYEVAb"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import scipy.special as sci"
]
},
{
"cell_type": "markdown",
"source": [
"# Part (a)\n",
"\n",
"In part (a) of the practical, we investigate how close random points are in 2D, 100D, and 1000D. In each case, we generate 1000 points and calculate the Euclidean distance between each pair. You should find that in 1000D, the furthest two points are only slightly further apart than the nearest two points. Weird!"
],
"metadata": {
"id": "MonbPEitLNgN"
}
},
{
"cell_type": "code",
"source": [
"# Fix the random seed so we all have the same random numbers\n",
"np.random.seed(0)\n",
"n_data = 1000\n",
"# Create 1000 data examples (columns) each with 2 dimensions (rows)\n",
"n_dim = 2\n",
"x_2D = np.random.normal(size=(n_dim,n_data))\n",
"# Create 1000 data examples (columns) each with 100 dimensions (rows)\n",
"n_dim = 100\n",
"x_100D = np.random.normal(size=(n_dim,n_data))\n",
"# Create 1000 data examples (columns) each with 1000 dimensions (rows)\n",
"n_dim = 1000\n",
"x_1000D = np.random.normal(size=(n_dim,n_data))\n",
"\n",
"# These values should be the same, otherwise your answer will be wrong\n",
"# Get in touch if they are not!\n",
"print('Sum of your data is %3.3f, Should be %3.3f'%(np.sum(x_1000D),1036.321))"
],
"metadata": {
"id": "vZSHVmcWEk14"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def distance_ratio(x):\n",
"  # TODO -- replace the two lines below to calculate the largest and smallest Euclidean distance between\n",
"  # the data points in the columns of x. DO NOT include the distance between a data point\n",
"  # and itself (which is obviously zero)\n",
"  smallest_dist = 1.0\n",
"  largest_dist = 1.0\n",
"\n",
"  # Calculate the ratio and return\n",
"  dist_ratio = largest_dist / smallest_dist\n",
"  return dist_ratio"
],
"metadata": {
"id": "PhVmnUs8ErD9"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"print('Ratio of largest to smallest distance 2D: %3.3f'%(distance_ratio(x_2D)))\n",
"print('Ratio of largest to smallest distance 100D: %3.3f'%(distance_ratio(x_100D)))\n",
"print('Ratio of largest to smallest distance 1000D: %3.3f'%(distance_ratio(x_1000D)))\n",
"print('**Note down the last of these three numbers; you will need to submit it for your coursework**')"
],
"metadata": {
"id": "0NdPxfn5GQuJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Part (b)\n",
"\n",
"In part (b) of the practical we calculate the volume of a hypersphere of radius 0.5 (i.e., of diameter 1) as a function of the dimension. You will find that the volume decreases to almost nothing in high dimensions. All of the volume is in the corners of the unit hypercube (which always has volume 1). Double weird.\n",
"\n",
"Note that you can check your answer by doing the calculation for 2D using the standard formula for the area of a circle and making sure it matches."
],
"metadata": {
"id": "b2FYKV1SL4Z7"
}
},
{
"cell_type": "code",
"source": [
"def volume_of_hypersphere(diameter, dimensions):\n",
"  # Formula given in Problem 8.7 of the notes on Moodle (probably a different problem number on the book site)\n",
"  # You will need sci.gamma()\n",
"  # Check out: https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.gamma.html\n",
"  # Also use this value for pi\n",
"  pi = np.pi\n",
"  # TODO replace this code with the formula for the volume of a hypersphere\n",
"  volume = 1.0\n",
"\n",
"  return volume"
],
"metadata": {
"id": "CZoNhD8XJaHR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"diameter = 1.0\n",
"for c_dim in range(1,11):\n",
"  print(\"Volume of unit diameter hypersphere in %d dimensions is %3.3f\"%(c_dim, volume_of_hypersphere(diameter, c_dim)))\n",
"print('**Note down the last of these ten numbers; you will need to submit it for your coursework**')"
],
"metadata": {
"id": "fNTBlg_GPEUh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Part (c)\n",
"\n",
"In part (c) of the coursework, you will calculate what proportion of the volume of a hypersphere is in the outer 1% of the radius/diameter. Calculate the volume of a hypersphere, then the volume of a hypersphere with 0.99 of the radius, and then figure out the proportion (a number between 0 and 1). You'll see that by the time we get to 300 dimensions most of the volume is in the outer 1 percent. Extremely weird!"
],
"metadata": {
"id": "GdyMeOBmoXyF"
}
},
{
"cell_type": "code",
"source": [
"def get_prop_of_volume_in_outer_1_percent(dimension):\n",
"  # TODO -- replace this line\n",
"  proportion = 1.0\n",
"\n",
"  return proportion"
],
"metadata": {
"id": "8_CxZ2AIpQ8w"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# While we're here, let's look at how much of the volume is in the outer 1% of the radius\n",
"for c_dim in [1,2,10,20,50,100,150,200,250,300]:\n",
"  print('Proportion of volume in outer 1 percent of radius in %d dimensions = %3.3f'%(c_dim, get_prop_of_volume_in_outer_1_percent(c_dim)))\n",
"print('**Note down the last of these ten numbers; you will need to submit it for your coursework**')"
],
"metadata": {
"id": "LtMDIn2qPVfJ"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "Qbxb-eHHQCS7"
},
"execution_count": null,
"outputs": []
}
]
}
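The closed forms behind parts (b) and (c) are standard: a hypersphere of radius r in D dimensions has volume pi^(D/2) r^D / Gamma(D/2 + 1), and since volume scales as r^D, the fraction of it inside 99% of the radius is 0.99^D. The sketch below uses only the standard library so you can sanity-check your own implementation; the function names mirror the notebook but this is an illustrative check, not the submitted coursework code:

```python
import math

def volume_of_hypersphere(diameter, dimensions):
    # V_D(r) = pi^(D/2) / Gamma(D/2 + 1) * r^D
    radius = diameter / 2.0
    return (math.pi ** (dimensions / 2.0)
            / math.gamma(dimensions / 2.0 + 1.0)
            * radius ** dimensions)

def prop_of_volume_in_outer_1_percent(dimensions):
    # Volume scales as r^D, so the inner 99% of the radius holds 0.99^D of it
    return 1.0 - 0.99 ** dimensions

# 2D check against the circle area pi * r^2 with r = 0.5
print(volume_of_hypersphere(1.0, 2))           # pi / 4, about 0.785
print(prop_of_volume_in_outer_1_percent(300))  # close to 1: most volume is in the shell
```

Note that `math.gamma` from the standard library plays the same role as `scipy.special.gamma` in the notebook.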
448
CM20315_Deep.ipynb
Normal file
@@ -0,0 +1,448 @@
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"authorship_tag": "ABX9TyOtP/O21RVLxAeEBIwV0aZt",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Deep.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# **Deep neural networks**\n",
|
||||
"\n",
|
||||
"In this notebook, we'll experiment with feeding one neural network into another as in figure 4.1 from the book."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MaKn8CFlzN8E"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "8ClURpZQzI6L"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports math library\n",
|
||||
"import numpy as np\n",
|
||||
"# Imports plotting library\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define the Rectified Linear Unit (ReLU) function\n",
|
||||
"def ReLU(preactivation):\n",
|
||||
" activation = preactivation.clip(0.0)\n",
|
||||
" return activation"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YdmveeAUz4YG"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define a shallow neural network with, one input, one output, and three hidden units\n",
|
||||
"def shallow_1_1_3(x, activation_fn, phi_0,phi_1,phi_2,phi_3, theta_10, theta_11, theta_20, theta_21, theta_30, theta_31):\n",
|
||||
" # Initial lines\n",
|
||||
" pre_1 = theta_10 + theta_11 * x\n",
|
||||
" pre_2 = theta_20 + theta_21 * x\n",
|
||||
" pre_3 = theta_30 + theta_31 * x\n",
|
||||
" # Activation functions\n",
|
||||
" act_1 = activation_fn(pre_1)\n",
|
||||
" act_2 = activation_fn(pre_2)\n",
|
||||
" act_3 = activation_fn(pre_3)\n",
|
||||
" # Weight activations\n",
|
||||
" w_act_1 = phi_1 * act_1\n",
|
||||
" w_act_2 = phi_2 * act_2\n",
|
||||
" w_act_3 = phi_3 * act_3\n",
|
||||
" # Combine weighted activation and add y offset\n",
|
||||
" y = phi_0 + w_act_1 + w_act_2 + w_act_3\n",
|
||||
" # Return everything we have calculated\n",
|
||||
" return y, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ximCLwIfz8kj"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# # Plot the shallow neural network. We'll assume input in is range [-1,1] and output [-1,1]\n",
|
||||
"# If the plot_all flag is set to true, then we'll plot all the intermediate stages as in Figure 3.3 \n",
|
||||
"def plot_neural(x, y, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3, plot_all=False, x_data=None, y_data=None):\n",
|
||||
"\n",
|
||||
" # Plot intermediate plots if flag set\n",
|
||||
" if plot_all:\n",
|
||||
" fig, ax = plt.subplots(3,3)\n",
|
||||
" fig.set_size_inches(8.5, 8.5)\n",
|
||||
" fig.tight_layout(pad=3.0)\n",
|
||||
" ax[0,0].plot(x,pre_1,'r-'); ax[0,0].set_ylabel('Preactivation')\n",
|
||||
" ax[0,1].plot(x,pre_2,'b-'); ax[0,1].set_ylabel('Preactivation')\n",
|
||||
" ax[0,2].plot(x,pre_3,'g-'); ax[0,2].set_ylabel('Preactivation')\n",
|
||||
" ax[1,0].plot(x,act_1,'r-'); ax[1,0].set_ylabel('Activation')\n",
|
||||
" ax[1,1].plot(x,act_2,'b-'); ax[1,1].set_ylabel('Activation')\n",
|
||||
" ax[1,2].plot(x,act_3,'g-'); ax[1,2].set_ylabel('Activation')\n",
|
||||
" ax[2,0].plot(x,w_act_1,'r-'); ax[2,0].set_ylabel('Weighted Act')\n",
|
||||
" ax[2,1].plot(x,w_act_2,'b-'); ax[2,1].set_ylabel('Weighted Act')\n",
|
||||
" ax[2,2].plot(x,w_act_3,'g-'); ax[2,2].set_ylabel('Weighted Act')\n",
|
||||
"\n",
|
||||
" for plot_y in range(3):\n",
|
||||
" for plot_x in range(3):\n",
|
||||
" ax[plot_y,plot_x].set_xlim([-1,1]);ax[plot_x,plot_y].set_ylim([-1,1])\n",
|
||||
" ax[plot_y,plot_x].set_aspect(1.0)\n",
|
||||
" ax[2,plot_y].set_xlabel('Input, $x$');\n",
|
||||
" plt.show()\n",
|
||||
"\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
" ax.plot(x,y)\n",
|
||||
" ax.set_xlabel('Input'); ax.set_ylabel('Output')\n",
|
||||
" ax.set_xlim([-1,1]);ax.set_ylim([-1,1])\n",
|
||||
" ax.set_aspect(1.0)\n",
|
||||
" if x_data is not None:\n",
|
||||
" ax.plot(x_data, y_data, 'mo')\n",
|
||||
" for i in range(len(x_data)):\n",
|
||||
" ax.plot(x_data[i], y_data[i],)\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "btrt7BX20gKD"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Let's define two networks. We'll put the prefixes n1_ and n2_ before all the variables to make it clear which network is which. We'll just consider the inputs and outputs over the range [-1,1]. If you set the \"plot_all\" flag to True, you can see the details of how they were created."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LxBJCObC-NTY"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now lets define some parameters and run the first neural network\n",
|
||||
"n1_theta_10 = 0.0 ; n1_theta_11 = -1.0\n",
|
||||
"n1_theta_20 = 0 ; n1_theta_21 = 1.0\n",
|
||||
"n1_theta_30 = -0.67 ; n1_theta_31 = 1.0\n",
|
||||
"n1_phi_0 = 1.0; n1_phi_1 = -2.0; n1_phi_2 = -3.0; n1_phi_3 = 9.3\n",
|
||||
"\n",
|
||||
"# Define a range of input values\n",
|
||||
"n1_in = np.arange(-1,1,0.01)\n",
|
||||
"\n",
|
||||
"# We run the neural network for each of these input values\n",
|
||||
"n1_out, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3 = \\\n",
|
||||
" shallow_1_1_3(n1_in, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"# And then plot it\n",
|
||||
"plot_neural(n1_in, n1_out, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3, plot_all=False)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "JRebvurv22pT"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now lets define some parameters and run the second neural network\n",
|
||||
"n2_theta_10 = -0.6 ; n2_theta_11 = -1.0\n",
|
||||
"n2_theta_20 = 0.2 ; n2_theta_21 = 1.0\n",
|
||||
"n2_theta_30 = -0.5 ; n2_theta_31 = 1.0\n",
|
||||
"n2_phi_0 = 0.5; n2_phi_1 = -1.0; n2_phi_2 = -1.5; n2_phi_3 = 2.0\n",
|
||||
"\n",
|
||||
"# Define a range of input values\n",
|
||||
"n2_in = np.arange(-1,1,0.01)\n",
|
||||
"\n",
|
||||
"# We run the neural network for each of these input values\n",
|
||||
"n2_out, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3 = \\\n",
|
||||
" shallow_1_1_3(n2_in, ReLU, n2_phi_0, n2_phi_1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"# And then plot it\n",
|
||||
"plot_neural(n2_in, n2_out, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3, plot_all=False)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZRjWu8i9239X"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now we'll consider feeding the output of the first network into the second one."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "qOcj2Rof-o20"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Plot two shallow neural networks and the composition of the two\n",
|
||||
"def plot_neural_two_components(x_in, net1_out, net2_out, net12_out=None):\n",
|
||||
"\n",
|
||||
" # Plot the two networks separately\n",
|
||||
" fig, ax = plt.subplots(1,2)\n",
|
||||
" fig.set_size_inches(8.5, 8.5)\n",
|
||||
" fig.tight_layout(pad=3.0)\n",
|
||||
" ax[0].plot(x_in, net1_out,'r-')\n",
|
||||
" ax[0].set_xlabel('Net 1 input'); ax[0].set_ylabel('Net 1 output')\n",
|
||||
" ax[0].set_xlim([-1,1]);ax[0].set_ylim([-1,1])\n",
|
||||
" ax[0].set_aspect(1.0)\n",
|
||||
" ax[1].plot(x_in, net2_out,'b-')\n",
|
||||
" ax[1].set_xlabel('Net 2 input'); ax[1].set_ylabel('Net 2 output')\n",
|
||||
" ax[1].set_xlim([-1,1]);ax[1].set_ylim([-1,1])\n",
|
||||
" ax[1].set_aspect(1.0)\n",
|
||||
" plt.show()\n",
|
||||
"\n",
|
||||
" if net12_out is not None:\n",
|
||||
" # Plot their composition\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
" ax.plot(x_in ,net12_out,'g-')\n",
|
||||
" ax.set_xlabel('Net 1 Input'); ax.set_ylabel('Net 2 Output')\n",
|
||||
" ax.set_xlim([-1,1]);ax.set_ylim([-1,1])\n",
|
||||
" ax.set_aspect(1.0)\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZB2HTalOE40X"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Display the two inputs\n",
|
||||
"x = np.arange(-1,1,0.001)\n",
|
||||
"# We run the first and second neural networks for each of these input values\n",
|
||||
"net1_out, *_ = shallow_1_1_3(x, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"net2_out, *_ = shallow_1_1_3(x, ReLU, n2_phi_0, n2_phi_1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"# Plot both graphs\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "K6Tmecgu7uqt"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO \n",
|
||||
"# Take a piece of paper and draw what you think will happen when we feed the \n",
|
||||
"# output of the first network into the second one. Draw the relationship between\n",
|
||||
"# the input of the first network and the output of the second one."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "NUQVop9-Xta1"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now let's see if your predictions were right \n",
|
||||
"\n",
|
||||
"# TODO feed the output of first network into second network (replace this line)\n",
|
||||
"net12_out = np.zeros_like(x)\n",
|
||||
"# Plot all three graphs\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out, net12_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Yq7GH-MCIyPI"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now we'll change things up a bit. What happens if we change the second network? (note the *-1 change)\n",
|
||||
"net1_out, *_ = shallow_1_1_3(x, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"net2_out, *_ = shallow_1_1_3(x, ReLU, n2_phi_0, n2_phi_1*-1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "BMlLkLbdEuPu"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO \n",
|
||||
"# Take a piece of paper and draw what you think will happen when we feed the \n",
|
||||
"# output of the first network into the second one now that we have changed it. Draw the relationship between\n",
|
||||
"# the input of the first network and the output of the second one."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Of6jVXLTJ688"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# When you have a prediction, run this code to see if you were right\n",
|
||||
"net12_out, *_ = shallow_1_1_3(net1_out, ReLU, n2_phi_0, n2_phi_1*-1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out, net12_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "PbbSCaSeK6SM"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's change things again. What happens if we change the first network? (note the changes)\n",
|
||||
"net1_out, *_ = shallow_1_1_3(x, ReLU, n1_phi_0, n1_phi_1*0.5, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"net2_out, *_ = shallow_1_1_3(x, ReLU, n2_phi_0, n2_phi_1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "b39mcSGFK9Fd"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO \n",
|
||||
"# Take a piece of paper and draw what you think will happen when we feed the \n",
|
||||
"# output of the first network, now that we have changed it, into the original second network. Draw the relationship between\n",
|
||||
"# the input of the first network and the output of the second one."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MhO40cC_LW9I"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# When you have a prediction, run this code to see if you were right\n",
|
||||
"net12_out, *_ = shallow_1_1_3(net1_out, ReLU, n2_phi_0, n2_phi_1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out, net12_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Akwo-hnPLkNr"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's change things again. What happens if the first network and second networks are the same?\n",
|
||||
"net1_out, *_ = shallow_1_1_3(x, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"net2_out_new, *_ = shallow_1_1_3(x, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out_new)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "TJ7wXKpRLl_E"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO \n",
|
||||
"# Take a piece of paper and draw what you think will happen when we feed the \n",
|
||||
"# output of the first network into the original second network. Draw the relationship between\n",
|
||||
"# the input of the first network and the output of the second one."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "dJbbh6R7NG9k"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# When you have a prediction, run this code to see if you were right\n",
|
||||
"net12_out, *_ = shallow_1_1_3(net1_out, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"plot_neural_two_components(x, net1_out, net2_out_new, net12_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "BiZZl3yNM2Bq"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO \n",
|
||||
"# Contemplate what you think will happen when we feed the \n",
|
||||
"# output of the original first network into a second copy of the original first network, and then \n",
|
||||
"# the output of that into the original second network (so now we have a three layer network)\n",
|
||||
"# How many total linear regions will we have in the output? \n",
|
||||
"net123_out, *_ = shallow_1_1_3(net12_out, ReLU, n2_phi_0, n2_phi_1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"plot_neural_two_components(x, net12_out, net2_out, net123_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "BSd51AkzNf7-"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO\n",
|
||||
"# How many linear regions would there be if we ran N copies of the first network, feeding the result of the first \n",
|
||||
"# into the second, the second into the third and so on, and then passed the result into the original second\n",
|
||||
"# network (blue curve above)\n",
|
||||
"\n",
|
||||
"# Take away conclusions: with very few parameters, we can make A LOT of linear regions, but\n",
|
||||
"# they depend on one another in complex ways that quickly become too difficult to understand intuitively."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "HqzePCLOVQK7"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
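The region-counting questions above can be checked numerically. Below is a minimal standalone sketch (separate from the notebook cells; the `shallow` and `count_linear_regions` helper names are illustrative, and the parameter values are copied from the first network defined earlier). It samples a network densely and counts slope changes of the resulting piecewise-linear curve:

```python
import numpy as np

def relu(z):
    return np.clip(z, 0.0, None)

def shallow(x, theta, phi):
    # theta: (3,2) rows of [offset, slope]; phi: (4,) output weights
    h = relu(theta[:, :1] + theta[:, 1:] * x)
    return phi[0] + phi[1:] @ h

def count_linear_regions(y, tol=1e-8):
    # A piecewise-linear function starts a new region at every slope change
    slope = np.diff(y)
    idx = np.flatnonzero(np.abs(np.diff(slope)) > tol)
    if idx.size == 0:
        return 1
    # A kink that falls between samples flags two adjacent positions;
    # merge them so each kink is counted once
    kinks = int(np.sum(np.diff(idx) > 1)) + 1
    return kinks + 1

theta1 = np.array([[0.0, -1.0], [0.0, 1.0], [-0.67, 1.0]])
phi1 = np.array([1.0, -2.0, -3.0, 9.3])

x = np.arange(-1, 1, 0.0001)
y1 = shallow(x.reshape(1, -1), theta1, phi1).ravel()    # one network
y11 = shallow(y1.reshape(1, -1), theta1, phi1).ravel()  # network fed into itself
print(count_linear_regions(y1), count_linear_regions(y11))  # → 3 9
```

Composing the network with itself multiplies the region count: each joint of the outer network can be crossed several times as the inner network's output rises and falls.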
|
||||
317
CM20315_Deep2.ipynb
Normal file
@@ -0,0 +1,317 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyP87B9tfgXpVQdlQBUGw4mg",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Deep2.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# **Deep neural networks #2**\n",
|
||||
"\n",
|
||||
"In this notebook, we'll investigate converting neural networks to matrix form."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MaKn8CFlzN8E"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "8ClURpZQzI6L"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports math library\n",
|
||||
"import numpy as np\n",
|
||||
"# Imports plotting library\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define the Rectified Linear Unit (ReLU) function\n",
|
||||
"def ReLU(preactivation):\n",
|
||||
" activation = preactivation.clip(0.0)\n",
|
||||
" return activation"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YdmveeAUz4YG"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define a shallow neural network with one input, one output, and three hidden units\n",
|
||||
"def shallow_1_1_3(x, activation_fn, phi_0,phi_1,phi_2,phi_3, theta_10, theta_11, theta_20, theta_21, theta_30, theta_31):\n",
|
||||
" # Initial lines\n",
|
||||
" pre_1 = theta_10 + theta_11 * x\n",
|
||||
" pre_2 = theta_20 + theta_21 * x\n",
|
||||
" pre_3 = theta_30 + theta_31 * x\n",
|
||||
" # Activation functions\n",
|
||||
" act_1 = activation_fn(pre_1)\n",
|
||||
" act_2 = activation_fn(pre_2)\n",
|
||||
" act_3 = activation_fn(pre_3)\n",
|
||||
" # Weight activations\n",
|
||||
" w_act_1 = phi_1 * act_1\n",
|
||||
" w_act_2 = phi_2 * act_2\n",
|
||||
" w_act_3 = phi_3 * act_3\n",
|
||||
" # Combine weighted activation and add y offset\n",
|
||||
" y = phi_0 + w_act_1 + w_act_2 + w_act_3\n",
|
||||
" # Return everything we have calculated\n",
|
||||
" return y, pre_1, pre_2, pre_3, act_1, act_2, act_3, w_act_1, w_act_2, w_act_3"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ximCLwIfz8kj"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Plot the shallow neural network. We'll assume the input is in the range [-1,1] and the output is in [-1,1]\n",
|
||||
"def plot_neural(x, y):\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
" ax.plot(x.T,y.T)\n",
|
||||
" ax.set_xlabel('Input'); ax.set_ylabel('Output')\n",
|
||||
" ax.set_xlim([-1,1]);ax.set_ylim([-1,1])\n",
|
||||
" ax.set_aspect(1.0)\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "btrt7BX20gKD"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Let's define a network. We'll just consider the inputs and outputs over the range [-1,1]."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LxBJCObC-NTY"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now lets define some parameters and run the first neural network\n",
|
||||
"n1_theta_10 = 0.0 ; n1_theta_11 = -1.0\n",
|
||||
"n1_theta_20 = 0 ; n1_theta_21 = 1.0\n",
|
||||
"n1_theta_30 = -0.67 ; n1_theta_31 = 1.0\n",
|
||||
"n1_phi_0 = 1.0; n1_phi_1 = -2.0; n1_phi_2 = -3.0; n1_phi_3 = 9.3\n",
|
||||
"\n",
|
||||
"# Define a range of input values\n",
|
||||
"n1_in = np.arange(-1,1,0.01).reshape([1,-1])\n",
|
||||
"\n",
|
||||
"# We run the neural network for each of these input values\n",
|
||||
"n1_out, *_ = shallow_1_1_3(n1_in, ReLU, n1_phi_0, n1_phi_1, n1_phi_2, n1_phi_3, n1_theta_10, n1_theta_11, n1_theta_20, n1_theta_21, n1_theta_30, n1_theta_31)\n",
|
||||
"# And then plot it\n",
|
||||
"plot_neural(n1_in, n1_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "JRebvurv22pT"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now we'll define the same neural network, but this time, we will use matrix form. When you get this right, it will draw the same plot as above."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "XCJqo_AjfAra"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"beta_0 = np.zeros((3,1))\n",
|
||||
"Omega_0 = np.zeros((3,1))\n",
|
||||
"beta_1 = np.zeros((1,1))\n",
|
||||
"Omega_1 = np.zeros((1,3))\n",
|
||||
"\n",
|
||||
"# TODO Fill in the values of the beta and Omega matrices with the n1_theta and n1_phi parameters that define the network above\n",
|
||||
"# !!! NOTE THAT MATRICES ARE CONVENTIONALLY INDEXED WITH a_11 IN THE TOP LEFT CORNER, BUT NDARRAYS START AT [0,0]\n",
|
||||
"# To get you started I've filled in a couple:\n",
|
||||
"beta_0[0,0] = n1_theta_10\n",
|
||||
"Omega_0[0,0] = n1_theta_11\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Make sure that input data matrix has different inputs in its columns\n",
|
||||
"n_data = n1_in.size\n",
|
||||
"n_dim_in = 1\n",
|
||||
"n1_in_mat = np.reshape(n1_in,(n_dim_in,n_data))\n",
|
||||
"\n",
|
||||
"# This runs the network for ALL of the inputs, x at once so we can draw graph\n",
|
||||
"h1 = ReLU(np.matmul(beta_0,np.ones((1,n_data))) + np.matmul(Omega_0,n1_in_mat))\n",
|
||||
"n1_out = np.matmul(beta_1,np.ones((1,n_data))) + np.matmul(Omega_1,h1)\n",
|
||||
"\n",
|
||||
"# Draw the network and check that it looks the same as the non-matrix case\n",
|
||||
"plot_neural(n1_in, n1_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MR0AecZYfACR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now we'll feed the output of the first network into the second one."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "qOcj2Rof-o20"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now lets define some parameters and run the second neural network\n",
|
||||
"n2_theta_10 = -0.6 ; n2_theta_11 = -1.0\n",
|
||||
"n2_theta_20 = 0.2 ; n2_theta_21 = 1.0\n",
|
||||
"n2_theta_30 = -0.5 ; n2_theta_31 = 1.0\n",
|
||||
"n2_phi_0 = 0.5; n2_phi_1 = -1.0; n2_phi_2 = -1.5; n2_phi_3 = 2.0\n",
|
||||
"\n",
|
||||
"# Define a range of input values\n",
|
||||
"n2_in = np.arange(-1,1,0.01)\n",
|
||||
"\n",
|
||||
"# We run the second neural network on the output of the first network\n",
|
||||
"n2_out, *_ = \\\n",
|
||||
" shallow_1_1_3(n1_out, ReLU, n2_phi_0, n2_phi_1, n2_phi_2, n2_phi_3, n2_theta_10, n2_theta_11, n2_theta_20, n2_theta_21, n2_theta_30, n2_theta_31)\n",
|
||||
"# And then plot it\n",
|
||||
"plot_neural(n1_in, n2_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZRjWu8i9239X"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"beta_0 = np.zeros((3,1))\n",
|
||||
"Omega_0 = np.zeros((3,1))\n",
|
||||
"beta_1 = np.zeros((3,1))\n",
|
||||
"Omega_1 = np.zeros((3,3))\n",
|
||||
"beta_2 = np.zeros((1,1))\n",
|
||||
"Omega_2 = np.zeros((1,3))\n",
|
||||
"\n",
|
||||
"# TODO Fill in the values of the beta and Omega matrices with the n1_theta, n1_phi, n2_theta, and n2_phi parameters\n",
|
||||
"# that define the composition of the two networks above (see eqn 4.5 for Omega1 and beta1 albeit in different notation)\n",
|
||||
"# !!! NOTE THAT MATRICES ARE CONVENTIONALLY INDEXED WITH a_11 IN THE TOP LEFT CORNER, BUT NDARRAYS START AT [0,0] SO EVERYTHING IS OFFSET\n",
|
||||
"# To get you started I've filled in a few:\n",
|
||||
"beta_0[0,0] = n1_theta_10\n",
|
||||
"Omega_0[0,0] = n1_theta_11\n",
|
||||
"beta_1[0,0] = n2_theta_10 + n2_theta_11 * n1_phi_0\n",
|
||||
"Omega_1[0,0] = n2_theta_11 * n1_phi_1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Make sure that input data matrix has different inputs in its columns\n",
|
||||
"n_data = n1_in.size\n",
|
||||
"n_dim_in = 1\n",
|
||||
"n1_in_mat = np.reshape(n1_in,(n_dim_in,n_data))\n",
|
||||
"\n",
|
||||
"# This runs the network for ALL of the inputs, x at once so we can draw graph (hence extra np.ones term)\n",
|
||||
"h1 = ReLU(np.matmul(beta_0,np.ones((1,n_data))) + np.matmul(Omega_0,n1_in_mat))\n",
|
||||
"h2 = ReLU(np.matmul(beta_1,np.ones((1,n_data))) + np.matmul(Omega_1,h1))\n",
|
||||
"n1_out = np.matmul(beta_2,np.ones((1,n_data))) + np.matmul(Omega_2,h2)\n",
|
||||
"\n",
|
||||
"# Draw the network and check that it looks the same as the non-matrix version\n",
|
||||
"plot_neural(n1_in, n1_out)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZB2HTalOE40X"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
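As a sanity check on the fill-in above, here is a standalone sketch (hypothetical variable names; parameter values copied from the two networks defined earlier) verifying that running the two shallow networks in sequence gives exactly the same output as the collapsed two-layer form, where the middle weights absorb the first network's phi parameters:

```python
import numpy as np

def relu(z):
    return np.clip(z, 0.0, None)

# theta rows: [theta_i0, theta_i1]; phi: [phi_0, phi_1, phi_2, phi_3]
t1 = np.array([[0.0, -1.0], [0.0, 1.0], [-0.67, 1.0]])
phi1 = np.array([1.0, -2.0, -3.0, 9.3])
t2 = np.array([[-0.6, -1.0], [0.2, 1.0], [-0.5, 1.0]])
phi2 = np.array([0.5, -1.0, -1.5, 2.0])

x = np.arange(-1, 1, 0.01).reshape(1, -1)

# Sequential: run network 1, feed its scalar output into network 2
h1 = relu(t1[:, :1] + t1[:, 1:] @ x)
y1 = phi1[0] + phi1[1:] @ h1
h2 = relu(t2[:, :1] + t2[:, 1:] @ y1.reshape(1, -1))
y_seq = phi2[0] + phi2[1:] @ h2

# Collapsed: theta_i0 + theta_i1*phi_0 becomes the new bias and
# theta_i1*phi_j the new (3,3) hidden-to-hidden weight matrix
beta_1 = t2[:, :1] + t2[:, 1:] * phi1[0]
Omega_1 = t2[:, 1:] @ phi1[1:].reshape(1, 3)
hh2 = relu(beta_1 + Omega_1 @ h1)
y_mat = phi2[0] + phi2[1:] @ hh2

print(np.allclose(y_seq, y_mat))  # → True
```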
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's make a deep network with 3 hidden layers. It will have d_i=4 inputs, d_1=5 neurons in the first layer, d_2=2 neurons in the second layer, d_3=4 neurons in the third layer, and d_o=1 output. Consult figure 4.6 for guidance."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0VANqxH2kyS4"
|
||||
}
|
||||
},
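The shape bookkeeping in the next cell follows one rule: a layer that maps D_in values to D_out values needs an Omega of shape (D_out, D_in) and a beta of shape (D_out, 1). A minimal sketch of that rule (illustrative only; the notebook asks you to write the parameter lines out explicitly):

```python
import numpy as np

D_i, D_1, D_2, D_3, D_o = 4, 5, 2, 4, 1
sizes = [D_i, D_1, D_2, D_3, D_o]

# One (beta, Omega) pair per layer: Omega is (units_out, units_in)
params = [(np.random.normal(size=(d_out, 1)),
           np.random.normal(size=(d_out, d_in)))
          for d_in, d_out in zip(sizes[:-1], sizes[1:])]

for beta, Omega in params:
    print(beta.shape, Omega.shape)
```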
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# define sizes\n",
|
||||
"D_i=4; D_1=5; D_2=2; D_3=4; D_o=1\n",
|
||||
"# We'll choose the inputs and parameters of this network randomly using np.random.normal\n",
|
||||
"# For example, we'll set the input using\n",
|
||||
"n_data = 10;\n",
|
||||
"x = np.random.normal(size=(D_i, n_data))\n",
|
||||
"# TODO initialize the parameters randomly but with the correct sizes\n",
|
||||
"# Replace the lines below\n",
|
||||
"beta_0 = np.random.normal(size=(1,1))\n",
|
||||
"Omega_0 = np.random.normal(size=(1,1))\n",
|
||||
"beta_1 = np.random.normal(size=(1,1))\n",
|
||||
"Omega_1 = np.random.normal(size=(1,1))\n",
|
||||
"beta_2 = np.random.normal(size=(1,1))\n",
|
||||
"Omega_2 = np.random.normal(size=(1,1))\n",
|
||||
"beta_3 = np.random.normal(size=(1,1))\n",
|
||||
"Omega_3 = np.random.normal(size=(1,1))\n",
|
||||
"\n",
|
||||
"# If you set the above sizes to the correct values, the following code will run\n",
|
||||
"h1 = ReLU(np.matmul(beta_0,np.ones((1,n_data))) + np.matmul(Omega_0,x));\n",
|
||||
"h2 = ReLU(np.matmul(beta_1,np.ones((1,n_data))) + np.matmul(Omega_1,h1));\n",
|
||||
"h3 = ReLU(np.matmul(beta_2,np.ones((1,n_data))) + np.matmul(Omega_2,h2));\n",
|
||||
"y = np.matmul(beta_3,np.ones((1,n_data))) + np.matmul(Omega_3,h3)\n",
|
||||
"\n",
|
||||
"if h1.shape[0] != D_1 or h1.shape[1] != n_data:\n",
" print(\"h1 is wrong shape\")\n",
"if h2.shape[0] != D_2 or h2.shape[1] != n_data:\n",
" print(\"h2 is wrong shape\")\n",
"if h3.shape[0] != D_3 or h3.shape[1] != n_data:\n",
" print(\"h3 is wrong shape\")\n",
"if y.shape[0] != D_o or y.shape[1] != n_data:\n",
" print(\"Output is wrong shape\")\n",
|
||||
"\n",
|
||||
"# Print the inputs and outputs\n",
|
||||
"print(x)\n",
|
||||
"print(y)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "RdBVAc_Rj22-"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
420
CM20315_Gradients_I.ipynb
Normal file
@@ -0,0 +1,420 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyMDEfAZvjcjpvBNmdrYv3EW",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Gradients_I.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# CM20315 Gradients I\n",
|
||||
"\n",
|
||||
"We're going to investigate how to take the derivatives of functions where one operation is composed with another, which is composed with a third and so on. For example, consider the function:\n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
" y = \\beta_4+\\omega_4\\cdot \\log\\biggl[\\beta_3+\\omega_3\\cdot\\cos\\Bigl[\\beta_2+\\omega_2\\cdot\\exp\\bigl[\\beta_1+\\omega_1\\cdot\\sin[\\beta_0+\\omega_0x]\\bigr]\\Bigr]\\biggr],\n",
|
||||
"\\end{equation}\n",
|
||||
"\n",
|
||||
"which is a composition of the functions $\\log[\\bullet], \\cos[\\bullet],\\exp[\\bullet],\\sin[\\bullet]$. I chose these just because you probably already know the derivatives of these functions:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray*}\n",
|
||||
"\\frac{\\partial \\log[z]}{\\partial z} = \\frac{1}{z}\\quad\\quad \\frac{\\partial \\cos[z]}{\\partial z} = -\\sin[z] \\quad\\quad \\frac{\\partial \\exp[z]}{\\partial z} = \\exp[z] \\quad\\quad \\frac{\\partial \\sin[z]}{\\partial z} = \\cos[z].\n",
|
||||
"\\end{eqnarray*}\n",
|
||||
"\n",
|
||||
"Suppose that we know the current values of $\\beta_{0},\\beta_{1},\\beta_{2},\\beta_{3},\\beta_{4},\\omega_{0},\\omega_{1},\\omega_{2},\\omega_{3},\\omega_{4}$, and $x$. We could obviously calculate $y$. But we also want to know how $y$ changes when we make a small change to $\\beta_{0},\\beta_{1},\\beta_{2},\\beta_{3},\\beta_{4},\\omega_{0},\\omega_{1},\\omega_{2},\\omega_{3}$, or $\\omega_{4}$. In other words, we want to compute the ten derivatives:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray*}\n",
|
||||
"\\frac{\\partial y}{\\partial \\beta_{0}}, \\quad \\frac{\\partial y}{\\partial \\beta_{1}}, \\quad \\frac{\\partial y}{\\partial \\beta_{2}}, \\quad \\frac{\\partial y }{\\partial \\beta_{3}}, \\quad\n",
|
||||
"\\frac{\\partial y}{\\partial \\beta_{4}}, \\quad \\frac{\\partial y}{\\partial \\omega_{0}}, \\quad \\frac{\\partial y}{\\partial \\omega_{1}}, \\quad \\frac{\\partial y}{\\partial \\omega_{2}}, \\quad \\frac{\\partial y}{\\partial \\omega_{3}}, \\quad\\mbox{and} \\quad \\frac{\\partial y}{\\partial \\omega_{4}}.\n",
|
||||
"\\end{eqnarray*}"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1DmMo2w63CmT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# import library\n",
|
||||
"import numpy as np"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "RIPaoVN834Lj"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Let's first define the original function for $y$:"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "32-ufWhc3v2c"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "AakK_qen3BpU"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def fn(x, beta0, beta1, beta2, beta3, beta4, omega0, omega1, omega2, omega3, omega4):\n",
|
||||
" return beta4 + omega4 * np.log(beta3+omega3 * np.cos(beta2 + omega2 * np.exp(beta1 + omega1 * np.sin(beta0 + omega0 * x))))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now we'll choose some values for the betas and the omegas and x and compute the output of the function:"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "y7tf0ZMt5OXt"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"beta0 = 1.0; beta1 = 2.0; beta2 = -3.0; beta3 = 0.4; beta4 = -0.3\n",
|
||||
"omega0 = 0.1; omega1 = -0.4; omega2 = 2.0; omega3 = 3.0; omega4 = -0.5\n",
|
||||
"x = 2.3\n",
|
||||
"y_func = fn(x,beta0,beta1,beta2,beta3,beta4,omega0,omega1,omega2,omega3,omega4)\n",
|
||||
"print('y=%3.3f'%y_func)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "pwvOcCxr41X_"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Computing derivatives by hand\n",
|
||||
"\n",
|
||||
"We could compute expressions for the derivatives by hand and write code to compute them directly. Some of them are easy. For example:\n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
"\\frac{\\partial y}{\\partial \\beta_{4}} = 1,\n",
|
||||
"\\end{equation}\n",
|
||||
"\n",
|
||||
"but some have very complex expressions, even for this relatively simple original equation. For example:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray*}\n",
|
||||
"\\frac{\\partial y}{\\partial \\omega_{0}} &=& \n",
|
||||
"-\\frac{\\omega_{1}\\omega_{2}\\omega_{3}\\omega_{4} x \\cos[\\beta_{0}\\!+\\!\\omega_{0}x]\\cdot\\exp\\bigl[\\omega_{1}\\sin[\\beta_{0}\\!+\\!\\omega_{0}x]\\!+\\!\\beta_{1}\\bigr]\\cdot\\sin\\Bigl[\\omega_{2}\\exp\\bigl[\\omega_{1}\\sin[\\beta_{0}\\!+\\!\\omega_{0}x]\\!+\\!\\beta_{1}\\bigr]\\!+\\!\\beta_{2}\\Bigr]}\n",
|
||||
"{\\omega_{3}\\cos[\\omega_{2}\\exp[\\omega_{1}\\sin[\\beta_{0}\\!+\\!\\omega_{0}x]\\!+\\!\\beta_{1}]\\!+\\!\\beta_{2}]\\!+\\!\\beta_{3}}.\n",
|
||||
"\\end{eqnarray*}"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "u5w69NeT64yV"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"dydbeta4_func = 1\n",
|
||||
"dydomega0_func = -omega1*omega2*omega3*omega4*x * np.cos(beta0+omega0*x) * \\\n",
|
||||
" np.exp(omega1 * np.sin(beta0+omega0*x)+beta1) * \\\n",
|
||||
" np.sin(omega2 * np.exp(omega1 * np.sin(beta0+omega0 *x)+beta1)+beta2)/ \\\n",
|
||||
" (omega3 * np.cos(omega2 * np.exp(omega1 * np.sin(beta0+omega0*x)+beta1)+beta2)+beta3)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "7t22hALp5zkq"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Let's make sure these are correct using finite differences:"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "iRh4hnu3-H3n"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"dydbeta4_fd = (fn(x,beta0,beta1,beta2,beta3,beta4+0.0001,omega0,omega1,omega2,omega3,omega4)-fn(x,beta0,beta1,beta2,beta3,beta4,omega0,omega1,omega2,omega3,omega4))/0.0001\n",
|
||||
"dydomega0_fd = (fn(x,beta0,beta1,beta2,beta3,beta4,omega0+0.0001,omega1,omega2,omega3,omega4)-fn(x,beta0,beta1,beta2,beta3,beta4,omega0,omega1,omega2,omega3,omega4))/0.0001\n",
|
||||
"\n",
|
||||
"print('dydbeta4: Function value = %3.3f, Finite difference value = %3.3f'%(dydbeta4_func,dydbeta4_fd))\n",
|
||||
"print('dydomega0: Function value = %3.3f, Finite difference value = %3.3f'%(dydomega0_func,dydomega0_fd))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1O3XmXMx-HlZ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"The code to calculate $\\partial y/ \\partial \\omega_0$ is a bit of a nightmare. It's easy to make mistakes, and you can see that some parts of it are repeated (for example, the $\\sin[\\bullet]$ term), which suggests some kind of redundancy in the calculations. The goal of this practical is to compute the derivatives in a much simpler way. There will be three steps:"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "wS4IPjZAKWTN"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Step 1:** Write the original equations as a series of intermediate calculations. We change \n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
" y = \\beta_4+\\omega_4\\cdot \\log\\biggl[\\beta_3+\\omega_3\\cdot\\cos\\Bigl[\\beta_2+\\omega_2\\cdot\\exp\\bigl[\\beta_1+\\omega_1\\cdot\\sin[\\beta_0+\\omega_0x]\\bigr]\\Bigr]\\biggr]\n",
|
||||
"\\end{equation}\n",
|
||||
"\n",
|
||||
"to \n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"f_{0} &=& \\beta_{0} + \\omega_{0} x\\nonumber\\\\\n",
|
||||
"h_{1} &=& \\sin[f_{0}]\\nonumber\\\\\n",
|
||||
"f_{1} &=& \\beta_{1} + \\omega_{1}h_{1}\\nonumber\\\\\n",
|
||||
"h_{2} &=& \\exp[f_{1}]\\nonumber\\\\\n",
|
||||
"f_{2} &=& \\beta_{2} + \\omega_{2} h_{2}\\nonumber\\\\\n",
|
||||
"h_{3} &=& \\cos[f_{2}]\\nonumber\\\\\n",
|
||||
"f_{3} &=& \\beta_{3} + \\omega_{3}h_{3}\\nonumber\\\\\n",
|
||||
"h_{4} &=& \\log[f_{3}]\\nonumber\\\\\n",
|
||||
"y &=& \\beta_{4} + \\omega_{4} h_{4}\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"and compute and store the values of all of these intermediate values. We'll need them to compute the derivatives.<br> This is called the **forward pass**."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "8UWhvDeNDudz"
|
||||
}
|
||||
},
|
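To make the forward pass concrete, here is a minimal standalone sketch. The parameter values below are made up purely for illustration (the notebook sets its own $\beta_{\bullet}$, $\omega_{\bullet}$ and $x$ elsewhere); each line computes and stores one intermediate quantity, and composing them reproduces the original nested formula:

```python
import numpy as np

# Illustrative values only -- these are NOT the notebook's parameters
beta = [1.0, 2.0, -3.0, 0.4, 0.1]
omega = [0.5, -1.0, 0.7, 2.0, 1.5]
x = 0.3

# Forward pass: compute and store every intermediate quantity
f0 = beta[0] + omega[0] * x
h1 = np.sin(f0)
f1 = beta[1] + omega[1] * h1
h2 = np.exp(f1)
f2 = beta[2] + omega[2] * h2
h3 = np.cos(f2)
f3 = beta[3] + omega[3] * h3
h4 = np.log(f3)
y = beta[4] + omega[4] * h4

# Composing the intermediates gives exactly the original nested expression
y_direct = beta[4] + omega[4] * np.log(
    beta[3] + omega[3] * np.cos(
        beta[2] + omega[2] * np.exp(
            beta[1] + omega[1] * np.sin(beta[0] + omega[0] * x))))
```

Nothing has been gained yet; the payoff comes when the stored values are reused in the backward pass.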
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO compute all the f_k and h_k terms \n",
|
||||
"# Replace the code below\n",
|
||||
"\n",
|
||||
"f0 = 1\n",
|
||||
"h1 = 1\n",
|
||||
"f1 = 1\n",
|
||||
"h2 = 1\n",
|
||||
"f2 = 1\n",
|
||||
"h3 = 1\n",
|
||||
"f3 = 1\n",
|
||||
"h4 = 1\n",
|
||||
"y = 1"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ZWKAq6HC90qV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's check we got that right:\n",
|
||||
"print(\"f0: true value = %3.3f, your value = %3.3f\"%(1.230, f0))\n",
|
||||
"print(\"h1: true value = %3.3f, your value = %3.3f\"%(0.942, h1))\n",
|
||||
"print(\"f1: true value = %3.3f, your value = %3.3f\"%(1.623, f1))\n",
|
||||
"print(\"h2: true value = %3.3f, your value = %3.3f\"%(5.068, h2))\n",
|
||||
"print(\"f2: true value = %3.3f, your value = %3.3f\"%(7.137, f2))\n",
|
||||
"print(\"h3: true value = %3.3f, your value = %3.3f\"%(0.657, h3))\n",
|
||||
"print(\"f3: true value = %3.3f, your value = %3.3f\"%(2.372, f3))\n",
|
||||
"print(\"h4: true value = %3.3f, your value = %3.3f\"%(0.864, h4))\n",
|
||||
"print(\"y_func = %3.3f, y = %3.3f\"%(y_func, y))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "ibxXw7TUW4Sx"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Step 2:** Compute the derivatives of $y$ with respect to the intermediate quantities that we just calculated, but in reverse order:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial y}{\\partial h_4}, \\quad \\frac{\\partial y}{\\partial f_3}, \\quad \\frac{\\partial y}{\\partial h_3}, \\quad \\frac{\\partial y}{\\partial f_2}, \\quad\n",
|
||||
"\\frac{\\partial y}{\\partial h_2}, \\quad \\frac{\\partial y}{\\partial f_1}, \\quad \\frac{\\partial y}{\\partial h_1}, \\quad\\mbox{and} \\quad \\frac{\\partial y}{\\partial f_0}.\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"The first of these derivatives is straightforward:\n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
"\\frac{\\partial y}{\\partial h_{4}} = \\frac{\\partial }{\\partial h_{4}} \\beta_{4} + \\omega_{4} h_{4} = \\omega_{4}.\n",
|
||||
"\\end{equation}\n",
|
||||
"\n",
|
||||
"The second derivative can be calculated using the chain rule:\n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
"\\frac{\\partial y}{\\partial f_{3}} = \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}.\n",
|
||||
"\\end{equation}\n",
|
||||
"\n",
|
||||
"The left-hand side asks how $y$ changes when $f_{3}$ changes. The right-hand side says we can decompose this into (i) how $y$ changes when $h_{4}$ changes and how $h_{4}$ changes when $f_{4}$ changes. So you get a chain of events happening: $f_{3}$ changes $h_{4}$, which changes $y$, and the derivatives represent the effects of this chain. Notice that we computed the first of these derivatives already and the other one is the derivative of $\\log[f_{3}]$ is simply $1/f_{3}$. We calculated $f_{3}$ in step 1.\n",
|
||||
"\n",
|
||||
"We can continue in this way, computing the derivatives of the output with respect to these intermediate quantities:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial y}{\\partial h_{3}} &=& \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial h_{3}}=\\frac{\\partial y}{\\partial f_{3}} \\frac{\\partial f_{3}}{\\partial h_{3}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial f_{2}} &=& \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial h_{3}}\\frac{\\partial h_{3}}{\\partial f_{2}} = \\frac{\\partial y}{\\partial h_{3}}\\frac{\\partial h_{3}}{\\partial f_{2}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial h_{2}} &=& \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial h_{3}}\\frac{\\partial h_{3}}{\\partial f_{2}}\\frac{\\partial f_{2}}{\\partial h_{2}}=\\frac{\\partial y}{\\partial f_{2}}\\frac{\\partial f_{2}}{\\partial h_{2}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial f_{1}} &=& \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial h_{3}}\\frac{\\partial h_{3}}{\\partial f_{2}}\\frac{\\partial f_{2}}{\\partial h_{2}}\\frac{\\partial h_{2}}{\\partial f_{1}}=\\frac{\\partial y}{\\partial h_{2}}\\frac{\\partial h_{2}}{\\partial f_{1}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial h_{1}} &=& \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial h_{3}}\\frac{\\partial h_{3}}{\\partial f_{2}}\\frac{\\partial f_{2}}{\\partial h_{2}}\\frac{\\partial h_{2}}{\\partial f_{1}}\\frac{\\partial f_{1}}{\\partial h_{1}}=\\frac{\\partial y}{\\partial f_{1}}\\frac{\\partial f_{1}}{\\partial h_{1}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial f_{0}} &=& \\frac{\\partial y}{\\partial h_{4}} \\frac{\\partial h_{4}}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial h_{3}}\\frac{\\partial h_{3}}{\\partial f_{2}}\\frac{\\partial f_{2}}{\\partial h_{2}}\\frac{\\partial h_{2}}{\\partial f_{1}}\\frac{\\partial f_{1}}{\\partial h_{1}}\\frac{\\partial h_{1}}{\\partial f_{0}}=\\frac{\\partial y}{\\partial h_{1}}\\frac{\\partial h_{1}}{\\partial f_{0}}.\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"In each case, we have already computed all of the terms except the last one in the previous step, and the last term is simple to evaluate. This is called the **backward pass**."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jay8NYWdFHuZ"
|
||||
}
|
||||
},
|
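As a sketch of how cheap this makes things (again with made-up parameter values, not the notebook's), each derivative below is one multiplication: the previous derivative times one simple local term. A finite-difference check on $\partial y/\partial f_0$ confirms the chain:

```python
import numpy as np

# Illustrative values only -- these are NOT the notebook's parameters
beta = [1.0, 2.0, -3.0, 0.4, 0.1]
omega = [0.5, -1.0, 0.7, 2.0, 1.5]
x = 0.3

# Forward pass (store the intermediates)
f0 = beta[0] + omega[0] * x
h1 = np.sin(f0); f1 = beta[1] + omega[1] * h1
h2 = np.exp(f1); f2 = beta[2] + omega[2] * h2
h3 = np.cos(f2); f3 = beta[3] + omega[3] * h3

# Backward pass: each line reuses the previous derivative (chain rule)
dydh4 = omega[4]
dydf3 = dydh4 * (1.0 / f3)      # d log(f3) / d f3 = 1/f3
dydh3 = dydf3 * omega[3]        # d f3 / d h3 = omega3
dydf2 = dydh3 * -np.sin(f2)     # d cos(f2) / d f2 = -sin(f2)
dydh2 = dydf2 * omega[2]
dydf1 = dydh2 * h2              # d exp(f1) / d f1 = exp(f1) = h2
dydh1 = dydf1 * omega[1]
dydf0 = dydh1 * np.cos(f0)      # d sin(f0) / d f0 = cos(f0)

# Finite-difference check of dy/df0 through the rest of the chain
def tail(f0_val):
    return beta[4] + omega[4] * np.log(
        beta[3] + omega[3] * np.cos(
            beta[2] + omega[2] * np.exp(
                beta[1] + omega[1] * np.sin(f0_val))))

eps = 1e-6
dydf0_fd = (tail(f0 + eps) - tail(f0)) / eps
```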
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO -- Compute the derivatives of the output with respect\n",
|
||||
"# to the intermediate computations h_k and f_k (i.e, run the backward pass)\n",
|
||||
"# I've done the first two for you. You replace the code below:\n",
|
||||
"dydh4 = omega4\n",
|
||||
"dydf3 = dydh4 * (1/f3)\n",
|
||||
"# Replace the code below\n",
|
||||
"dydh3 = 1\n",
|
||||
"dydf2 = 1\n",
|
||||
"dydh2 = 1\n",
|
||||
"dydf1 = 1\n",
|
||||
"dydh1 = 1\n",
|
||||
"dydf0 = 1 "
|
||||
],
|
||||
"metadata": {
|
||||
"id": "gCQJeI--Egdl"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's check we got that right\n",
|
||||
"print(\"dydh3: true value = %3.3f, your value = %3.3f\"%(-0.632, dydh3))\n",
|
||||
"print(\"dydf2: true value = %3.3f, your value = %3.3f\"%(0.476, dydf2))\n",
|
||||
"print(\"dydh2: true value = %3.3f, your value = %3.3f\"%(0.953, dydh2))\n",
|
||||
"print(\"dydf1: true value = %3.3f, your value = %3.3f\"%(4.830, dydf1))\n",
|
||||
"print(\"dydh1: true value = %3.3f, your value = %3.3f\"%(-1.932, dydh1))\n",
|
||||
"print(\"dydf0: true value = %3.3f, your value = %3.3f\"%(-0.646, dydf0))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "dS1OrLtlaFr7"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"**Step 3:** Now we will find how $y$ changes when we change the $\\beta$ and $\\omega$ terms. The first two are easy:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial y}{\\partial \\beta_{4}} &=& \\frac{\\partial }{\\partial \\beta_{4}}(\\beta_{4} + \\omega_{4} h_{4}) = 1\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial \\omega_{4}} &=& \\frac{\\partial }{\\partial \\omega_{4}}(\\beta_{4} + \\omega_{4} h_{4}) = h_{4}.\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"The remaining terms are calculated using the chain rule again:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial y}{\\partial \\beta_{3}} &=& \\frac{\\partial y}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial \\beta_{3}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial \\omega_{3}} &=& \\frac{\\partial y}{\\partial f_{3}}\\frac{\\partial f_{3}}{\\partial \\omega_{3}}\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"where we already computed the first term of each right-hand side in Step 2, and the second terms are also easy to compute. By the same logic, the other terms are:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial y}{\\partial \\beta_{k}} &=& \\frac{\\partial y}{\\partial f_{k}}\\frac{\\partial f_{k}}{\\partial \\beta_{k}}\\nonumber \\\\\n",
|
||||
"\\frac{\\partial y}{\\partial \\omega_{k}} &=& \\frac{\\partial y}{\\partial f_{k}}\\frac{\\partial f_{k}}{\\partial \\omega_{k}}\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"for $k=2,1,0$."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "FlzlThQPGpkU"
|
||||
}
|
||||
},
|
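A brief sketch of this last step (with made-up parameter values, as before): once $\partial y/\partial f_3$ is known from the backward pass, both parameter derivatives at that position are a single multiplication, and a finite difference on $\omega_3$ agrees:

```python
import numpy as np

# Illustrative values only -- these are NOT the notebook's parameters
beta = [1.0, 2.0, -3.0, 0.4, 0.1]
omega = [0.5, -1.0, 0.7, 2.0, 1.5]
x = 0.3

def fn(beta, omega, x):
    return beta[4] + omega[4] * np.log(
        beta[3] + omega[3] * np.cos(
            beta[2] + omega[2] * np.exp(
                beta[1] + omega[1] * np.sin(beta[0] + omega[0] * x))))

# Intermediates from the forward pass
f0 = beta[0] + omega[0] * x
h1 = np.sin(f0); f1 = beta[1] + omega[1] * h1
h2 = np.exp(f1); f2 = beta[2] + omega[2] * h2
h3 = np.cos(f2); f3 = beta[3] + omega[3] * h3

# dy/df3 from the backward pass, then the two parameter derivatives
dydf3 = omega[4] / f3
dydbeta3 = dydf3 * 1.0     # df3/dbeta3 = 1
dydomega3 = dydf3 * h3     # df3/domega3 = h3

# Finite-difference check on omega3
eps = 1e-6
omega_perturbed = list(omega)
omega_perturbed[3] += eps
dydomega3_fd = (fn(beta, omega_perturbed, x) - fn(beta, omega, x)) / eps
```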
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO -- Calculate the final derivatives with respect to the beta and omega terms\n",
|
||||
"\n",
|
||||
"dydbeta4 = 1\n",
|
||||
"dydomega4 = 1\n",
|
||||
"dydbeta3 = 1\n",
|
||||
"dydomega3 = 1\n",
|
||||
"dydbeta2 = 1\n",
|
||||
"dydomega2 = 1\n",
|
||||
"dydbeta1 = 1\n",
|
||||
"dydomega1 = 1\n",
|
||||
"dydbeta0 = 1\n",
|
||||
"dydomega0 = 1\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1I2BhqZhGMK6"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's check we got them right\n",
|
||||
"print('dydbeta4: Your value = %3.3f, Function value = %3.3f, Finite difference value = %3.3f'%(dydbeta4, dydbeta4_func,dydbeta4_fd))\n",
|
||||
"print('dydomega4: Your value = %3.3f, True value = %3.3f'%(dydomega4, 0.864))\n",
|
||||
"print('dydbeta3: Your value = %3.3f, True value = %3.3f'%(dydbeta3, -0.211))\n",
|
||||
"print('dydomega3: Your value = %3.3f, True value = %3.3f'%(dydomega3, -0.139))\n",
|
||||
"print('dydbeta2: Your value = %3.3f, True value = %3.3f'%(dydbeta2, 0.476))\n",
|
||||
"print('dydomega2: Your value = %3.3f, True value = %3.3f'%(dydomega2, 2.415))\n",
|
||||
"print('dydbeta1: Your value = %3.3f, True value = %3.3f'%(dydbeta1, 4.830))\n",
|
||||
"print('dydomega1: Your value = %3.3f, True value = %3.3f'%(dydomega1, 4.552))\n",
|
||||
"print('dydbeta0: Your value = %3.3f, True value = %3.3f'%(dydbeta0, -0.646))\n",
|
||||
"print('dydomega0: Your value = %3.3f, Function value = %3.3f, Finite difference value = %3.3f'%(dydomega0, dydomega0_func,dydomega0_fd))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "38eiOn2aHgHI"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Using this method, we can compute the derivatives quite easily without needing to compute very complicated expressions. This is exactly the same way that the derivatives of the parameters are computed in the backpropagation algorithm. In fact, this basically *is* the backpropagation algorithm."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "N2ZhrR-2fNa1"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
350
CM20315_Gradients_II.ipynb
Normal file
@@ -0,0 +1,350 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyNUus+txeW8v5HpKHIRwUMo",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Gradients_II.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Gradients II: Backpropagation algorithm\n",
|
||||
"\n",
|
||||
"In this practical, we'll investigate the backpropagation algoritithm. This computes the gradients of the loss with respect to all of the parameters (weights and biases) in the network. We'll use these gradients when we run stochastic gradient descent."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "L6chybAVFJW2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "LdIDglk1FFcG"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"First let's define a neural network. We'll just choose the weights and biases randomly for now."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "nnUoI0m6GyjC"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Set seed so we always get the same random numbers\n",
|
||||
"np.random.seed(0)\n",
|
||||
"\n",
|
||||
"# Number of layers\n",
|
||||
"K = 5\n",
|
||||
"# Number of neurons per layer\n",
|
||||
"D = 6\n",
|
||||
"# Input layer\n",
|
||||
"D_i = 1\n",
|
||||
"# Output layer \n",
|
||||
"D_o = 1\n",
|
||||
"\n",
|
||||
"# Make empty lists \n",
|
||||
"all_weights = [None] * (K+1)\n",
|
||||
"all_biases = [None] * (K+1)\n",
|
||||
"\n",
|
||||
"# Create input and output layers\n",
|
||||
"all_weights[0] = np.random.normal(size=(D, D_i))\n",
|
||||
"all_weights[-1] = np.random.normal(size=(D_o, D))\n",
|
||||
"all_biases[0] = np.random.normal(size =(D,1))\n",
|
||||
"all_biases[-1]= np.random.normal(size =(D_o,1))\n",
|
||||
"\n",
|
||||
"# Create intermediate layers\n",
|
||||
"for layer in range(1,K):\n",
|
||||
" all_weights[layer] = np.random.normal(size=(D,D))\n",
|
||||
" all_biases[layer] = np.random.normal(size=(D,1)) "
|
||||
],
|
||||
"metadata": {
|
||||
"id": "WVM4Tc_jGI0Q"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define the Rectified Linear Unit (ReLU) function\n",
|
||||
"def ReLU(preactivation):\n",
|
||||
" activation = preactivation.clip(0.0)\n",
|
||||
" return activation"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jZh-7bPXIDq4"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's run our random network. The weight matrices $\\boldsymbol\\Omega_{1\\ldots K}$ are the entries of the list \"all_weights\" and the biases $\\boldsymbol\\beta_{1\\ldots K}$ are the entries of the list \"all_biases\".\n",
|
||||
"\n",
|
||||
"We know that we will need the activations $\\mathbf{f}_{0\\ldots K}$ and the activations $\\mathbf{h}_{1\\ldots K}$ for the forward pass of backpropagation, so we'll store and return these as well. \n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "5irtyxnLJSGX"
|
||||
}
|
||||
},
|
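For reference, one layer of the forward pass (eqn 7.5) is just a matrix multiply, a bias add, and a ReLU. Here is a minimal sketch of the single-layer update in isolation, with toy shapes assumed purely for illustration (this is not the notebook's solution code):

```python
import numpy as np

def ReLU(preactivation):
    return preactivation.clip(0.0)

# Toy shapes, assumed for illustration: 2 inputs, 3 hidden units
rng = np.random.default_rng(1)
Omega = rng.normal(size=(3, 2))   # weights for this layer
beta = rng.normal(size=(3, 1))    # biases for this layer
h = rng.normal(size=(2, 1))       # activations from the previous layer

f = beta + np.matmul(Omega, h)    # pre-activation (eqn 7.5)
h_next = ReLU(f)                  # activation passed to the next layer
```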
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def compute_network_output(net_input, all_weights, all_biases):\n",
|
||||
"\n",
|
||||
" # Retrieve number of layers\n",
|
||||
" K = len(all_weights) -1\n",
|
||||
"\n",
|
||||
" # We'll store the pre-activations at each layer in a list \"all_f\"\n",
|
||||
" # and the activations in a second list[all_h]. \n",
|
||||
" all_f = [None] * (K+1)\n",
|
||||
" all_h = [None] * (K+1)\n",
|
||||
"\n",
|
||||
" #For convenience, we'll set \n",
|
||||
" # all_h[0] to be the input, and all_f[K] will be the output\n",
|
||||
" all_h[0] = net_input\n",
|
||||
"\n",
|
||||
" # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n",
|
||||
" for layer in range(K):\n",
|
||||
" # Update preactivations and activations at this layer according to eqn 7.5\n",
|
||||
" # Remmember to use np.matmul for matrix multiplications\n",
|
||||
" # TODO -- Replace the lines below\n",
|
||||
" all_f[layer] = all_h[layer]\n",
|
||||
" all_h[layer+1] = all_f[layer]\n",
|
||||
"\n",
|
||||
" # Compute the output from the last hidden layer\n",
|
||||
" # TO DO -- Replace the line below\n",
|
||||
" all_f[K] = np.zeros_like(all_biases[-1])\n",
|
||||
"\n",
|
||||
" # Retrieve the output\n",
|
||||
" net_output = all_f[K]\n",
|
||||
"\n",
|
||||
" return net_output, all_f, all_h"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LgquJUJvJPaN"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define in input\n",
|
||||
"net_input = np.ones((D_i,1)) * 1.2\n",
|
||||
"# Compute network output\n",
|
||||
"net_output, all_f, all_h = compute_network_output(net_input,all_weights, all_biases)\n",
|
||||
"print(\"True output = %3.3f, Your answer = %3.3f\"%(1.907, net_output[0,0]))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "IN6w5m2ZOhnB"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's define a loss function. We'll just use the least squares loss function. We'll also write a function to compute dloss_doutput."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "SxVTKp3IcoBF"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def least_squares_loss(net_output, y):\n",
|
||||
" return np.sum((net_output-y) * (net_output-y))\n",
|
||||
"\n",
|
||||
"def d_loss_d_output(net_output, y):\n",
|
||||
" return 2*(net_output -y); "
|
||||
],
|
||||
"metadata": {
|
||||
"id": "6XqWSYWJdhQR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
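A quick sanity check on these two functions (a standalone sketch, restating them so it runs on its own): the analytic derivative $2(f-y)$ should match a finite difference on the loss.

```python
import numpy as np

def least_squares_loss(net_output, y):
    return np.sum((net_output - y) * (net_output - y))

def d_loss_d_output(net_output, y):
    return 2 * (net_output - y)

# Perturb the (single) output and compare against the analytic derivative
net_output = np.array([[1.907]])
y = np.array([[20.0]])
eps = 1e-6
fd = (least_squares_loss(net_output + eps, y)
      - least_squares_loss(net_output, y)) / eps
```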
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"y = np.ones((D_o,1)) * 20.0\n",
|
||||
"loss = least_squares_loss(net_output, y)\n",
|
||||
"print(\"y = %3.3f Loss = %3.3f\"%(y, loss))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "njF2DUQmfttR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's compute the derivatives of the network. We already computed the forward pass. Let's compute the backward pass."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "98WmyqFYWA-0"
|
||||
}
|
||||
},
|
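Before filling in the general routine, it may help to see the same pattern on a tiny two-layer network. This is a sketch of the eqn 7.13 recursions, with shapes and values assumed only for illustration (it is not the notebook's solution): the loss derivative flows back through the output weights, is gated by the ReLU indicator, and a finite difference on one weight confirms it.

```python
import numpy as np

def ReLU(preactivation):
    return preactivation.clip(0.0)

# Tiny assumed network: 2 inputs -> 3 hidden units -> 1 output
rng = np.random.default_rng(0)
Omega0 = rng.normal(size=(3, 2)); beta0 = rng.normal(size=(3, 1))
Omega1 = rng.normal(size=(1, 3)); beta1 = rng.normal(size=(1, 1))
x = rng.normal(size=(2, 1)); y = np.array([[1.0]])

def forward(Omega0, beta0, Omega1, beta1, x):
    f0 = beta0 + np.matmul(Omega0, x)
    h1 = ReLU(f0)
    f1 = beta1 + np.matmul(Omega1, h1)
    return f0, h1, f1

f0, h1, f1 = forward(Omega0, beta0, Omega1, beta1, x)

# Backward pass, working from the output towards the input
dl_df1 = 2.0 * (f1 - y)                # derivative of least-squares loss
dl_dbeta1 = np.array(dl_df1)           # bias derivative = dl_df at that layer
dl_dOmega1 = np.matmul(dl_df1, h1.T)   # weight derivative uses stored h1
dl_dh1 = np.matmul(Omega1.T, dl_df1)   # pass back through the weights
dl_df0 = (f0 > 0) * dl_dh1             # ReLU gate (indicator function)
dl_dbeta0 = np.array(dl_df0)
dl_dOmega0 = np.matmul(dl_df0, x.T)    # weight derivative uses the input

# Finite-difference check on one weight element
eps = 1e-6
Omega0_perturbed = np.array(Omega0)
Omega0_perturbed[0, 0] += eps
_, _, f1_perturbed = forward(Omega0_perturbed, beta0, Omega1, beta1, x)
fd = (np.sum((f1_perturbed - y)**2) - np.sum((f1 - y)**2)) / eps
```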
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# We'll need the indicator function\n",
|
||||
"def indicator_function(x):\n",
|
||||
" x_in = np.array(x)\n",
|
||||
" x_in[x_in>=0] = 1\n",
|
||||
" x_in[x_in<0] = 0\n",
|
||||
" return x_in\n",
|
||||
"\n",
|
||||
"# Main backward pass routine\n",
|
||||
"def backward_pass(all_weights, all_biases, all_f, all_h, y):\n",
|
||||
" # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n",
|
||||
" all_dl_dweights = [None] * (K+1)\n",
|
||||
" all_dl_dbiases = [None] * (K+1)\n",
|
||||
" # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists\n",
|
||||
" all_dl_df = [None] * (K+1)\n",
|
||||
" all_dl_dh = [None] * (K+1)\n",
|
||||
" # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[k] in the net output\n",
|
||||
"\n",
|
||||
" # Compute derivatives of net output with respect to loss\n",
|
||||
" all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))\n",
|
||||
"\n",
|
||||
" # Now work backwards through the network\n",
|
||||
" for layer in range(K,-1,-1):\n",
|
||||
" # TODO Calculate the derivatives of biases at layer from all_dl_df[K]. (eq 7.13, line 1)\n",
|
||||
" # NOTE! To take a copy of matrix X, use Z=np.array(X)\n",
|
||||
" # REPLACE THIS LINE\n",
|
||||
" all_dl_dbiases[layer] = np.zeros_like(all_biases[layer])\n",
|
||||
"\n",
|
||||
" # TODO Calculate the derivatives of weight at layer from all_dl_df[K] and all_h[K] (eq 7.13, line 2)\n",
|
||||
" # Don't forget to use np.matmul\n",
|
||||
" # REPLACE THIS LINE\n",
|
||||
" all_dl_dweights[layer] = np.zeros_like(all_weights[layer])\n",
|
||||
"\n",
|
||||
" # TODO: calculate the derivatives of activations from weight and derivatives of next preactivations (eq 7.13, line 3 second part)\n",
|
||||
" # REPLACE THIS LINE\n",
|
||||
" all_dl_dh[layer] = np.zeros_like(all_h[layer])\n",
|
||||
"\n",
|
||||
" if layer > 0:\n",
|
||||
" # TODO Calculate the derivatives of the pre-activation f with respect to activation h (eq 7.13, line 3, first part)\n",
|
||||
" # REPLACE THIS LINE\n",
|
||||
" all_dl_df[layer-1] = np.zeros_like(all_f[layer-1])\n",
|
||||
"\n",
|
||||
" return all_dl_dweights, all_dl_dbiases"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LJng7WpRPLMz"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"all_dl_dweights, all_dl_dbiases = backward_pass(all_weights, all_biases, all_f, all_h, y)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "9A9MHc4sQvbp"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"np.set_printoptions(precision=3)\n",
|
||||
"# Make space for derivatives computed by finite differences\n",
|
||||
"all_dl_dweights_fd = [None] * (K+1)\n",
|
||||
"all_dl_dbiases_fd = [None] * (K+1)\n",
|
||||
"\n",
|
||||
"# Let's test if we have the derivatives right using finite differences\n",
|
||||
"delta_fd = 0.000001\n",
|
||||
"\n",
|
||||
"# For every layer\n",
|
||||
"for layer in range(K):\n",
|
||||
" dl_dbias = np.zeros_like(all_dl_dbiases[layer])\n",
|
||||
" # For every element in the bias\n",
|
||||
" for row in range(all_biases[layer].shape[0]):\n",
|
||||
" # Take copy of biases We'll change one element each time\n",
|
||||
" all_biases_copy = [np.array(x) for x in all_biases]\n",
|
||||
" all_biases_copy[layer][row] += delta_fd\n",
|
||||
" network_output_1, *_ = compute_network_output(net_input, all_weights, all_biases_copy)\n",
|
||||
" network_output_2, *_ = compute_network_output(net_input, all_weights, all_biases)\n",
|
||||
" dl_dbias[row] = (least_squares_loss(network_output_1, y) - least_squares_loss(network_output_2,y))/delta_fd\n",
|
||||
" all_dl_dbiases_fd[layer] = np.array(dl_dbias)\n",
|
||||
" print(\"Bias %d, derivatives from backprop:\"%(layer))\n",
|
||||
" print(all_dl_dbiases[layer])\n",
|
||||
" print(\"Bias %d, derivatives from finite differences\"%(layer))\n",
|
||||
" print(all_dl_dbiases_fd[layer])\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# For every layer\n",
|
||||
"for layer in range(K):\n",
|
||||
" dl_dweight = np.zeros_like(all_dl_dweights[layer])\n",
|
||||
" # For every element in the bias\n",
|
||||
" for row in range(all_weights[layer].shape[0]):\n",
|
||||
" for col in range(all_weights[layer].shape[1]):\n",
|
||||
" # Take copy of biases We'll change one element each time\n",
|
||||
" all_weights_copy = [np.array(x) for x in all_weights]\n",
|
||||
" all_weights_copy[layer][row][col] += delta_fd\n",
|
||||
" network_output_1, *_ = compute_network_output(net_input, all_weights_copy, all_biases)\n",
|
||||
" network_output_2, *_ = compute_network_output(net_input, all_weights, all_biases)\n",
|
||||
" dl_dweight[row][col] = (least_squares_loss(network_output_1, y) - least_squares_loss(network_output_2,y))/delta_fd\n",
|
||||
" all_dl_dweights_fd[layer] = np.array(dl_dweight)\n",
|
||||
" print(\"Weight %d, derivatives from backprop:\"%(layer))\n",
|
||||
" print(all_dl_dweights[layer])\n",
|
||||
" print(\"Weight %d, derivatives from finite differences\"%(layer))\n",
|
||||
" print(all_dl_dweights_fd[layer])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "PK-UtE3hreAK"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "gtokc0VX0839"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
351
CM20315_Gradients_III.ipynb
Normal file
@@ -0,0 +1,351 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyPr1jNETAJLP27xFPVEC09J",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Gradients_III.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Initialization\n",
|
||||
"\n",
|
||||
"In this practical, we'll investigate the what happens to the activations and the forward pass if we don't initialize the parameters sensibly."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "L6chybAVFJW2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "LdIDglk1FFcG"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"First let's define a neural network. We'll just choose the weights and biases randomly for now."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "nnUoI0m6GyjC"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def init_params(K, D, sigma_sq_omega):\n",
|
||||
" # Set seed so we always get the same random numbers\n",
|
||||
" np.random.seed(0)\n",
|
||||
"\n",
|
||||
" # Input layer\n",
|
||||
" D_i = 1\n",
|
||||
" # Output layer \n",
|
||||
" D_o = 1\n",
|
||||
"\n",
|
||||
" # Make empty lists \n",
|
||||
" all_weights = [None] * (K+1)\n",
|
||||
" all_biases = [None] * (K+1)\n",
|
||||
"\n",
|
||||
" # Create input and output layers\n",
|
||||
" all_weights[0] = np.random.normal(size=(D, D_i))*np.sqrt(sigma_sq_omega)\n",
|
||||
" all_weights[-1] = np.random.normal(size=(D_o, D)) * np.sqrt(sigma_sq_omega)\n",
|
||||
" all_biases[0] = np.zeros((D,1))\n",
|
||||
" all_biases[-1]= np.zeros((D_o,1))\n",
|
||||
"\n",
|
||||
" # Create intermediate layers\n",
|
||||
" for layer in range(1,K):\n",
|
||||
" all_weights[layer] = np.random.normal(size=(D,D))*np.sqrt(sigma_sq_omega)\n",
|
||||
" all_biases[layer] = np.zeros((D,1)) \n",
|
||||
"\n",
|
||||
" return all_weights, all_biases"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "WVM4Tc_jGI0Q"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define the Rectified Linear Unit (ReLU) function\n",
|
||||
"def ReLU(preactivation):\n",
|
||||
" activation = preactivation.clip(0.0)\n",
|
||||
" return activation"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jZh-7bPXIDq4"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def compute_network_output(net_input, all_weights, all_biases):\n",
|
||||
"\n",
|
||||
" # Retrieve number of layers\n",
|
||||
" K = len(all_weights) -1\n",
|
||||
"\n",
|
||||
" # We'll store the pre-activations at each layer in a list \"all_f\"\n",
|
||||
" # and the activations in a second list[all_h]. \n",
|
||||
" all_f = [None] * (K+1)\n",
|
||||
" all_h = [None] * (K+1)\n",
|
||||
"\n",
|
||||
" #For convenience, we'll set \n",
|
||||
" # all_h[0] to be the input, and all_f[K] will be the output\n",
|
||||
" all_h[0] = net_input\n",
|
||||
"\n",
|
||||
" # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n",
|
||||
" for layer in range(K):\n",
|
||||
" # Update preactivations and activations at this layer according to eqn 7.5\n",
|
||||
" all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n",
|
||||
" all_h[layer+1] = ReLU(all_f[layer])\n",
|
||||
"\n",
|
||||
" # Compute the output from the last hidden layer\n",
|
||||
" all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n",
|
||||
"\n",
|
||||
" # Retrieve the output\n",
|
||||
" net_output = all_f[K]\n",
|
||||
"\n",
|
||||
" return net_output, all_f, all_h"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LgquJUJvJPaN"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's investigate how the size of the outputs varies as we change the initialization variance:\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "bIUrcXnOqChl"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Number of layers\n",
|
||||
"K = 5\n",
|
||||
"# Number of neurons per layer\n",
|
||||
"D = 8\n",
|
||||
" # Input layer\n",
|
||||
"D_i = 1\n",
|
||||
"# Output layer \n",
|
||||
"D_o = 1\n",
|
||||
"# Set variance of initial weights to 1\n",
|
||||
"sigma_sq_omega = 1.0\n",
|
||||
"# Initialize parameters\n",
|
||||
"all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n",
|
||||
"\n",
|
||||
"n_data = 1000\n",
|
||||
"data_in = np.random.normal(size=(1,n_data))\n",
|
||||
"net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n",
|
||||
"\n",
|
||||
"for layer in range(K):\n",
|
||||
" print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "A55z3rKBqO7M"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n",
|
||||
"# and the 1000 training examples\n",
|
||||
"\n",
|
||||
"# TO DO \n",
|
||||
"# Change this to 50 layers with 80 hidden units per layer\n",
|
||||
"\n",
|
||||
"# TO DO \n",
|
||||
"# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation explode"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "VL_SO4tar3DC"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
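The fix the TO DO above is pointing at can be sketched concretely: with ReLU activations, drawing weights with variance 2/D (He initialization) keeps the activation variance roughly constant from layer to layer, whereas variance 1 makes it explode. A minimal standalone sketch, separate from the notebook's functions (the layer count, width, and helper name here are illustrative assumptions):

```python
import numpy as np

def forward_std(K, D, sigma_sq_omega, n_data=1000, seed=0):
    # Run a deep ReLU network forward and return the std of the last hidden layer
    rng = np.random.default_rng(seed)
    h = rng.normal(size=(D, n_data))
    for _ in range(K):
        W = rng.normal(size=(D, D)) * np.sqrt(sigma_sq_omega)
        h = np.maximum(0.0, W @ h)
    return np.std(h)

# Weight variance 1.0 makes activations explode with depth;
# He scaling (2/D) keeps their magnitude roughly constant
std_naive = forward_std(K=20, D=80, sigma_sq_omega=1.0)
std_he = forward_std(K=20, D=80, sigma_sq_omega=2.0 / 80)
```

The 2/D rule follows because each ReLU zeroes about half its inputs, halving the variance that the weight matrix would otherwise multiply by D times the weight variance.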
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's define a loss function. We'll just use the least squaures loss function. We'll also write a function to compute dloss_doutput\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "SxVTKp3IcoBF"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def least_squares_loss(net_output, y):\n",
|
||||
" return np.sum((net_output-y) * (net_output-y))\n",
|
||||
"\n",
|
||||
"def d_loss_d_output(net_output, y):\n",
|
||||
" return 2*(net_output -y); "
|
||||
],
|
||||
"metadata": {
|
||||
"id": "6XqWSYWJdhQR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Here's the code for the backward pass"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "98WmyqFYWA-0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# We'll need the indicator function\n",
|
||||
"def indicator_function(x):\n",
|
||||
" x_in = np.array(x)\n",
|
||||
" x_in[x_in>=0] = 1\n",
|
||||
" x_in[x_in<0] = 0\n",
|
||||
" return x_in\n",
|
||||
"\n",
|
||||
"# Main backward pass routine\n",
|
||||
"def backward_pass(all_weights, all_biases, all_f, all_h, y):\n",
|
||||
" # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n",
|
||||
" all_dl_dweights = [None] * (K+1)\n",
|
||||
" all_dl_dbiases = [None] * (K+1)\n",
|
||||
" # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists\n",
|
||||
" all_dl_df = [None] * (K+1)\n",
|
||||
" all_dl_dh = [None] * (K+1)\n",
|
||||
" # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[k] in the net output\n",
|
||||
"\n",
|
||||
" # Compute derivatives of net output with respect to loss\n",
|
||||
" all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))\n",
|
||||
"\n",
|
||||
" # Now work backwards through the network\n",
|
||||
" for layer in range(K,-1,-1):\n",
|
||||
" # Calculate the derivatives of biases at layer from all_dl_df[K]. (eq 7.13, line 1)\n",
|
||||
" all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n",
|
||||
" # Calculate the derivatives of weight at layer from all_dl_df[K] and all_h[K] (eq 7.13, line 2)\n",
|
||||
" all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n",
|
||||
"\n",
|
||||
" # Calculate the derivatives of activations from weight and derivatives of next preactivations (eq 7.13, line 3 second part)\n",
|
||||
" all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n",
|
||||
" # Calculate the derivatives of the pre-activation f with respect to activation h (eq 7.13, line 3, first part)\n",
|
||||
" if layer > 0:\n",
|
||||
" all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n",
|
||||
"\n",
|
||||
" return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LJng7WpRPLMz"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
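A standard sanity check for a backward pass like the one above is to compare the analytic derivatives against finite differences. A self-contained sketch on a tiny one-weight ReLU model (this toy model and its names are illustrative, much smaller than the notebook's network):

```python
import numpy as np

def loss(w, x, y):
    # Tiny model: scalar weight -> ReLU -> least squares loss
    h = np.maximum(0.0, w * x)
    return (h - y) ** 2

def dloss_dw(w, x, y):
    # Analytic derivative via the chain rule (indicator function for the ReLU)
    h = np.maximum(0.0, w * x)
    return 2 * (h - y) * (1.0 if w * x >= 0 else 0.0) * x

w, x, y = 0.7, 1.3, 0.2
eps = 1e-6
# Central finite difference; should agree with dloss_dw to high precision
fd = (loss(w + eps, x, y) - loss(w - eps, x, y)) / (2 * eps)
```

The same check scales to the full `backward_pass`: perturb one weight, recompute the loss, and compare against the corresponding entry of `all_dl_dweights`.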
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's look at what happens to the magnitude of the gradients on the way back."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "phFnbthqwhFi"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Number of layers\n",
|
||||
"K = 5\n",
|
||||
"# Number of neurons per layer\n",
|
||||
"D = 8\n",
|
||||
" # Input layer\n",
|
||||
"D_i = 1\n",
|
||||
"# Output layer \n",
|
||||
"D_o = 1\n",
|
||||
"# Set variance of initial weights to 1\n",
|
||||
"sigma_sq_omega = 1.0\n",
|
||||
"# Initialize parameters\n",
|
||||
"all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n",
|
||||
"\n",
|
||||
"# For simplicity we'll just consider the gradients of the weights and biases between the first and last hidden layer\n",
|
||||
"n_data = 100\n",
|
||||
"aggregate_dl_df = [None] * (K+1)\n",
|
||||
"for layer in range(1,K):\n",
|
||||
" # These 3D arrays will store the gradients for every data point\n",
|
||||
" aggregate_dl_df[layer] = np.zeros((D,n_data))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# We'll have to compute the derivatives of the parameters for each data point separately\n",
|
||||
"for c_data in range(n_data):\n",
|
||||
" data_in = np.random.normal(size=(1,1))\n",
|
||||
" y = np.zeros((1,1))\n",
|
||||
" net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n",
|
||||
" all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n",
|
||||
" for layer in range(1,K):\n",
|
||||
" aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])\n",
|
||||
"\n",
|
||||
"for layer in range(1,K):\n",
|
||||
" print(\"Layer %d, std of dl_dh = %3.3f\"%(layer, np.std(aggregate_dl_df[layer].ravel())))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "9A9MHc4sQvbp"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n",
|
||||
"# and the 1000 training examples\n",
|
||||
"\n",
|
||||
"# TO DO \n",
|
||||
"# Change this to 50 layers with 80 hidden units per layer\n",
|
||||
"\n",
|
||||
"# TO DO \n",
|
||||
"# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "gtokc0VX0839"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
564
CM20315_Loss.ipynb
Normal file
@@ -0,0 +1,564 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyNWvWC97VuIGwu4VTE1XbO6",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Loss.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Loss functions\n",
|
||||
"\n",
|
||||
"In this practical, we'll investigate loss functions. In part 1 (this notebook), we'll investigate univariate regression (where the output data $y$ is continuous. Our formulation will be based on the normal/Gaussian distribution.\n",
|
||||
"\n",
|
||||
"We'll compute loss functions for maximum likelihood, minimum negative log likelihood, and least squares and show that they all imply that we should use the same parameter values\n",
|
||||
"\n",
|
||||
"In part II, we'll investigate binary classification (where the output data is 0 or 1). This will be based on the Bernouilli distribution\n",
|
||||
"\n",
|
||||
"In part III we'll investigate multiclass classification (where the output data is 0,1, or, 2). This will be based on the categorical distribution."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jSlFkICHwHQF"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "PYMZ1x-Pv1ht"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports math library\n",
|
||||
"import numpy as np\n",
|
||||
"# Imports plotting library\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"# Import math Library\n",
|
||||
"import math"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define the Rectified Linear Unit (ReLU) function\n",
|
||||
"def ReLU(preactivation):\n",
|
||||
" activation = preactivation.clip(0.0)\n",
|
||||
" return activation\n",
|
||||
"\n",
|
||||
"# Define a shallow neural network\n",
|
||||
"def shallow_nn(x, beta_0, omega_0, beta_1, omaga_1):\n",
|
||||
" # Make sure that input data is (1 x n_data) array\n",
|
||||
" n_data = x.size\n",
|
||||
" x = np.reshape(x,(1,n_data))\n",
|
||||
"\n",
|
||||
" # This runs the network for ALL of the inputs, x at once so we can draw graph\n",
|
||||
" h1 = ReLU(np.matmul(beta_0,np.ones((1,n_data))) + np.matmul(omega_0,x))\n",
|
||||
" y = np.matmul(beta_1,np.ones((1,n_data))) + np.matmul(omega_1,h1)\n",
|
||||
" return y"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Fv7SZR3tv7mV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Utility function for plotting data\n",
|
||||
"def plot_univariate_regression(x_model, y_model, x_data = None, y_data = None, sigma_model = None, title= None):\n",
|
||||
" # Make sure model data are 1D arrays\n",
|
||||
" x_model = np.squeeze(x_model)\n",
|
||||
" y_model = np.squeeze(y_model)\n",
|
||||
"\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
" ax.plot(x_model,y_model)\n",
|
||||
" if sigma_model is not None:\n",
|
||||
" ax.fill_between(x_model, y_model-2*sigma_model, y_model+2*sigma_model, color='lightgray')\n",
|
||||
" ax.set_xlabel('Input, $x$'); ax.set_ylabel('Output, $y$')\n",
|
||||
" ax.set_xlim([0,1]);ax.set_ylim([-1,1])\n",
|
||||
" ax.set_aspect(0.5)\n",
|
||||
" if title is not None:\n",
|
||||
" ax.set_title(title)\n",
|
||||
" if x_data is not None:\n",
|
||||
" ax.plot(x_data, y_data, 'ko')\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "NRR67ri_1TzN"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Univariate regression"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "PsgLZwsPxauP"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Get parameters for model -- we can call this function to easily reset them\n",
|
||||
"def get_parameters():\n",
|
||||
" # And we'll create a network that approximately fits it\n",
|
||||
" beta_0 = np.zeros((3,1)); # formerly theta_x0\n",
|
||||
" omega_0 = np.zeros((3,1)); # formerly theta_x1\n",
|
||||
" beta_1 = np.zeros((1,1)); # formerly phi_0\n",
|
||||
" omega_1 = np.zeros((1,3)); # formerly phi_x\n",
|
||||
"\n",
|
||||
" beta_0[0,0] = 0.3; beta_0[1,0] = -1.0; beta_0[2,0] = -0.5\n",
|
||||
" omega_0[0,0] = -1.0; omega_0[1,0] = 1.8; omega_0[2,0] = 0.65\n",
|
||||
" beta_1[0,0] = 0.1;\n",
|
||||
" omega_1[0,0] = -2.0; omega_1[0,1] = -1.0; omega_1[0,2] = 7.0\n",
|
||||
"\n",
|
||||
" return beta_0, omega_0, beta_1, omega_1"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "pUT9Ain_HRim"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's create some 1D training data\n",
|
||||
"x_train = np.array([0.09291784,0.46809093,0.93089486,0.67612654,0.73441752,0.86847339,\\\n",
|
||||
" 0.49873225,0.51083168,0.18343972,0.99380898,0.27840809,0.38028817,\\\n",
|
||||
" 0.12055708,0.56715537,0.92005746,0.77072270,0.85278176,0.05315950,\\\n",
|
||||
" 0.87168699,0.58858043])\n",
|
||||
"y_train = np.array([-0.25934537,0.18195445,0.651270150,0.13921448,0.09366691,0.30567674,\\\n",
|
||||
" 0.372291170,0.20716968,-0.08131792,0.51187806,0.16943738,0.3994327,\\\n",
|
||||
" 0.019062570,0.55820410,0.452564960,-0.1183121,0.02957665,-1.24354444, \\\n",
|
||||
" 0.248038840,0.26824970])\n",
|
||||
"\n",
|
||||
"# Get parameters for the model\n",
|
||||
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
|
||||
"sigma = 0.2\n",
|
||||
"\n",
|
||||
"# Define a range of input values\n",
|
||||
"x_model = np.arange(0,1,0.01)\n",
|
||||
"# Run the model to get values to plot and plot it.\n",
|
||||
"y_model = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
"plot_univariate_regression(x_model, y_model, x_train, y_train, sigma_model = sigma)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "VWzNOt1swFVd"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"The blue line i sthe mean prediction of the model and the gray area represents plus/minus two standardard deviations. This model fits okay, but could be improved. Let's compute the loss. We'll compute the the least squares error, the likelihood, the negative log likelihood."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MvVX6tl9AEXF"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Return probability under normal distribution for input x\n",
|
||||
"def normal_distribution(y, mu, sigma):\n",
|
||||
" # TODO-- write in the equation for the normal distribution \n",
|
||||
" # Equation 5.7 from the notes (you will need np.sqrt() and np.exp(), and math.pi)\n",
|
||||
" # Don't use the numpy version -- that's cheating!\n",
|
||||
" # Replace the line below\n",
|
||||
" prob = np.zeros_like(y)\n",
|
||||
" return prob"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YaLdRlEX0FkU"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's double check we get the right answer before proceeding\n",
|
||||
"print(\"Correct answer = %3.3f, Your answer = %3.3f\"%(0.119,normal_distribution(1,-1,2.3)))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4TSL14dqHHbV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's plot the Gaussian distribution.\n",
|
||||
"y_gauss = np.arange(-5,5,0.1)\n",
|
||||
"mu = 0; sigma = 1.0\n",
|
||||
"gauss_prob = normal_distribution(y_gauss, mu, sigma)\n",
|
||||
"fig, ax = plt.subplots()\n",
|
||||
"ax.plot(y_gauss, gauss_prob)\n",
|
||||
"ax.set_xlabel('Input, $y$'); ax.set_ylabel('Probability $Pr(y)$')\n",
|
||||
"ax.set_xlim([-5,5]);ax.set_ylim([0,1.0])\n",
|
||||
"plt.show()\n",
|
||||
"\n",
|
||||
"# TODO \n",
|
||||
"# 1. Predict what will happen if we change to mu=1 and leave sigma=1\n",
|
||||
"# Answer:\n",
|
||||
"# Now change the code above and see if you were correct.\n",
|
||||
"# 2. Predict what will happen if we leave mu = 0 and change sigma to 2.0\n",
|
||||
"# Answer:\n",
|
||||
"# 3. Predict what will happen if we leave mu = 0 and change sigma to 0.5\n",
|
||||
"# Answer:"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "A2HcmNfUMIlj"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's compute the likelihood using this function"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "R5z_0dzQMF35"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Return the likelihood of all of the data under the model\n",
|
||||
"def compute_likelihood(y_train, mu, sigma):\n",
|
||||
" # TODO -- compute the likelihood of the data -- the product of the normal probabilities for each data point\n",
|
||||
" # Top line of equation 5.3 in the notes\n",
|
||||
" # You will need np.prod() and the normal_distribution function you used above\n",
|
||||
" # Replace the line below\n",
|
||||
" likelihood = 0\n",
|
||||
" return likelihood"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "zpS7o6liCx7f"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's test this for a homoscedastic (constant sigma) model\n",
|
||||
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
|
||||
"# Use our neural network to predict the mean of the Gaussian\n",
|
||||
"mu_pred = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
"# Set the standard devation to something reasonable\n",
|
||||
"sigma = 0.2\n",
|
||||
"# Compute the likelihood\n",
|
||||
"likelihood = compute_likelihood(y_train, mu_pred, sigma)\n",
|
||||
"# Let's double check we get the right answer before proceeding\n",
|
||||
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(0.000010624,likelihood))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1hQxBLoVNlr2"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"You can see that this gives a very small answer, even for this small 1D dataset, and with the model fitting quite well. This is because it is the product of sveral probabilities, which are all quite small themselves.\n",
|
||||
"This will get out of hand pretty quickly with real datasets -- the likelihood will get so small that we can't represent it with normal finite-precision math\n",
|
||||
"\n",
|
||||
"This is why we use negative log likelihood"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "HzphKgPfOvlk"
|
||||
}
|
||||
},
|
||||
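The underflow point can be demonstrated directly: multiplying many small probabilities underflows double precision to exactly zero, while summing log probabilities stays well-behaved. A standalone illustration (the probability value 0.001 and the count 200 are arbitrary choices):

```python
import math

probs = [0.001] * 200          # 200 independent terms, each with probability 0.001
likelihood = math.prod(probs)  # 1e-600 underflows to exactly 0.0 in float64
nll = -sum(math.log(p) for p in probs)  # finite and numerically stable
```

Because the log is monotonic, minimizing the negative log likelihood finds exactly the same parameters as maximizing the likelihood.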
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Return the negative log likelihood of the data under the model\n",
|
||||
"def compute_negative_log_likelihood(y_train, mu, sigma):\n",
|
||||
" # TODO -- compute the likelihood of the data -- don't use the likelihood function above -- compute the negative sum of the log probabilities\n",
|
||||
" # Bottom line of equation 5.4 in the notes\n",
|
||||
" # You will need np.sum(), np.log()\n",
|
||||
" # Replace the line below\n",
|
||||
" nll = 0\n",
|
||||
" return nll"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "dsT0CWiKBmTV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's test this for a homoscedastic (constant sigma) model\n",
|
||||
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
|
||||
"# Use our neural network to predict the mean of the Gaussian\n",
|
||||
"mu_pred = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
"# Set the standard devation to something reasonable\n",
|
||||
"sigma = 0.2\n",
|
||||
"# Compute the log likelihood\n",
|
||||
"nll = compute_negative_log_likelihood(y_train, mu_pred, sigma)\n",
|
||||
"# Let's double check we get the right answer before proceeding\n",
|
||||
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(11.452419564,nll))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "nVxUXg9rQmwI"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"For good measure, let's compute the sum of squares as well"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "-S8bXApoWVLG"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Return the squared distance between the predicted \n",
|
||||
"def compute_sum_of_squares(y_train, y_pred):\n",
|
||||
" # TODO -- compute the sum of squared distances between the training data and the model prediction\n",
|
||||
" # Eqn 5.10 in the notes. Make sure that you understand this, and ask questions if you don't\n",
|
||||
" # Replace the line below\n",
|
||||
" sum_of_squares = 0;\n",
|
||||
" return sum_of_squares"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "I1pjFdHCF4JZ"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's test this again\n",
|
||||
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
|
||||
"# Use our neural network to predict the mean of the Gaussian\n",
|
||||
"y_pred = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
"# Compute the log likelihood\n",
|
||||
"sum_of_squares = compute_sum_of_squares(y_train, y_pred)\n",
|
||||
"# Let's double check we get the right answer before proceeding\n",
|
||||
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(2.020992572,sum_of_squares))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "2C40fskIHBx7"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's investigate finding the maximum likelihood / minimum log likelihood / least squares solution. For simplicity, we'll assume that all the parameters are correct except one and look at how the likelihood, log likelihood, and sum of squares change as we manipulate the last parameter. We'll start with overall y offset, beta_1 (formerly phi_0)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "OgcRojvPWh4V"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define a range of values for the parameter\n",
|
||||
"beta_1_vals = np.arange(0,1.0,0.01)\n",
|
||||
"# Create some arrays to store the likelihoods, negative log likehoos and sum of squares\n",
|
||||
"likelihoods = np.zeros_like(beta_1_vals)\n",
|
||||
"nlls = np.zeros_like(beta_1_vals)\n",
|
||||
"sum_squares = np.zeros_like(beta_1_vals)\n",
|
||||
"\n",
|
||||
"# Initialise the parameters\n",
|
||||
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
|
||||
"sigma = 0.2\n",
|
||||
"for count in range(len(beta_1_vals)):\n",
|
||||
" # Set the value for the parameter\n",
|
||||
" beta_1[0,0] = beta_1_vals[count]\n",
|
||||
" # Run the network with new parameters\n",
|
||||
" mu_pred = y_pred = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
" # Compute and store the three values\n",
|
||||
" likelihoods[count] = compute_likelihood(y_train, mu_pred, sigma)\n",
|
||||
" nlls[count] = compute_negative_log_likelihood(y_train, mu_pred, sigma)\n",
|
||||
" sum_squares[count] = compute_sum_of_squares(y_train, y_pred)\n",
|
||||
" # Draw the model for every 20th parameter setting\n",
|
||||
" if count % 20 == 0:\n",
|
||||
" # Run the model to get values to plot and plot it.\n",
|
||||
" y_model = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
" plot_univariate_regression(x_model, y_model, x_train, y_train, sigma_model = sigma, title=\"beta1[0]=%3.3f\"%(beta_1[0,0]))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "pFKtDaAeVU4U"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now let's plot the likelihood, negative log likelihood, and least squares as a function the value of the offset beta1\n",
|
||||
"fig, ax = plt.subplots(1,3)\n",
|
||||
"fig.set_size_inches(10.5, 3.5)\n",
|
||||
"fig.tight_layout(pad=3.0)\n",
|
||||
"ax[0].plot(beta_1_vals, likelihoods); ax[0].set_xlabel('beta_1[0]'); ax[0].set_ylabel('likelihood')\n",
|
||||
"ax[1].plot(beta_1_vals, nlls); ax[1].set_xlabel('beta_1[0]'); ax[1].set_ylabel('negative log likelihood')\n",
|
||||
"ax[2].plot(beta_1_vals, sum_squares); ax[2].set_xlabel('beta_1[0]'); ax[2].set_ylabel('sum of squares')\n",
|
||||
"plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "UHXeTa9MagO6"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Hopefully, you can see that the maximum of the likelihood fn is at the same position as the minimum negative log likelihood\n",
|
||||
"# and the least squares solutions\n",
|
||||
"# Let's check that:\n",
|
||||
"print(\"Maximum likelihood = %3.3f, at beta_1=%3.3f\"%( (likelihoods[np.argmax(likelihoods)],beta_1_vals[np.argmax(likelihoods)])))\n",
|
||||
"print(\"Minimum negative log likelihood = %3.3f, at beta_1=%3.3f\"%( (nlls[np.argmin(nlls)],beta_1_vals[np.argmin(nlls)])))\n",
|
||||
"print(\"Least squares = %3.3f, at beta_1=%3.3f\"%( (sum_squares[np.argmin(sum_squares)],beta_1_vals[np.argmin(sum_squares)])))\n",
|
||||
"\n",
|
||||
"# Plot the best model\n",
|
||||
"beta_1[0,0] = beta_1_vals[np.argmin(sum_squares)]\n",
|
||||
"y_model = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
"plot_univariate_regression(x_model, y_model, x_train, y_train, sigma_model = sigma, title=\"beta1=%3.3f\"%(beta_1[0,0]))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "aDEPhddNdN4u"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"They all give the same answer. But you can see from the three plots above that the likelihood is very small unless the parameters are almost correct. So in practice, we would work with the negative log likelihood or the least squares.<br><br>\n",
|
||||
"\n",
|
||||
"For fun, let's do the same thing with the standard deviation parameter of our network. This is not an output of the network (unless we choose to make that the case), but it still affects the likelihood.\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "771G8N1Vk5A2"
|
||||
}
|
||||
},
|
||||
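The agreement between the three criteria is no coincidence: for fixed $\sigma$, the Gaussian negative log likelihood is an affine function of the sum of squares, $\text{NLL} = \frac{N}{2}\log(2\pi\sigma^2) + \frac{1}{2\sigma^2}\sum_i (y_i-\mu_i)^2$, so the two share their minimizer. A quick numerical check of this identity (the random data and $\sigma=0.2$ here are arbitrary, not the notebook's training set):

```python
import math
import numpy as np

rng = np.random.default_rng(1)
y = rng.normal(size=20)    # arbitrary "observations"
mu = rng.normal(size=20)   # arbitrary "predictions"
sigma = 0.2

sse = np.sum((y - mu) ** 2)
# Per-point Gaussian negative log probabilities, summed
nll = np.sum(0.5 * np.log(2 * math.pi * sigma**2) + (y - mu) ** 2 / (2 * sigma**2))
# Affine function of the sum of squares: same value
affine = 20 * 0.5 * math.log(2 * math.pi * sigma**2) + sse / (2 * sigma**2)
```

Only the constant offset and positive scale depend on $\sigma$, which is why the sum-of-squares curve above is a shifted, stretched copy of the NLL curve.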
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define a range of values for the parameter\n",
|
||||
"sigma_vals = np.arange(0.1,0.5,0.005)\n",
|
||||
"# Create some arrays to store the likelihoods, negative log likehoos and sum of squares\n",
|
||||
"likelihoods = np.zeros_like(sigma_vals)\n",
|
||||
"nlls = np.zeros_like(sigma_vals)\n",
|
||||
"sum_squares = np.zeros_like(sigma_vals)\n",
|
||||
"\n",
|
||||
"# Initialise the parameters\n",
|
||||
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
|
||||
"# Might as well set to the best offset\n",
|
||||
"beta_1[0,0] = 0.27\n",
|
||||
"for count in range(len(sigma_vals)):\n",
|
||||
" # Set the value for the parameter\n",
|
||||
" sigma = sigma_vals[count]\n",
|
||||
" # Run the network with new parameters\n",
|
||||
" mu_pred = y_pred = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
" # Compute and store the three values\n",
|
||||
" likelihoods[count] = compute_likelihood(y_train, mu_pred, sigma)\n",
|
||||
" nlls[count] = compute_negative_log_likelihood(y_train, mu_pred, sigma)\n",
|
||||
" sum_squares[count] = compute_sum_of_squares(y_train, y_pred)\n",
|
||||
" # Draw the model for every 20th parameter setting\n",
|
||||
" if count % 20 == 0:\n",
|
||||
" # Run the model to get values to plot and plot it.\n",
|
||||
" y_model = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
" plot_univariate_regression(x_model, y_model, x_train, y_train, sigma_model=sigma, title=\"sigma=%3.3f\"%(sigma))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "dMNAr0R8gg82"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Now let's plot the likelihood, negative log likelihood, and least squares as a function the value of the offset beta1\n",
|
||||
"fig, ax = plt.subplots(1,3)\n",
|
||||
"fig.set_size_inches(10.5, 3.5)\n",
|
||||
"fig.tight_layout(pad=3.0)\n",
|
||||
"ax[0].plot(sigma_vals, likelihoods); ax[0].set_xlabel('$\\sigma$'); ax[0].set_ylabel('likelihood')\n",
|
||||
"ax[1].plot(sigma_vals, nlls); ax[1].set_xlabel('$\\sigma$'); ax[1].set_ylabel('negative log likelihood')\n",
|
||||
"ax[2].plot(sigma_vals, sum_squares); ax[2].set_xlabel('$\\sigma$'); ax[2].set_ylabel('sum of squares')\n",
|
||||
"plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "l9jduyHLDAZC"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Hopefully, you can see that the maximum of the likelihood fn is at the same position as the minimum negative log likelihood\n",
|
||||
"# The least squares solution does not depend on sigma, so it's just flat -- no use here.\n",
|
||||
"# Let's check that:\n",
|
||||
"print(\"Maximum likelihood = %3.3f, at beta_1=%3.3f\"%( (likelihoods[np.argmax(likelihoods)],sigma_vals[np.argmax(likelihoods)])))\n",
|
||||
"print(\"Minimum negative log likelihood = %3.3f, at beta_1=%3.3f\"%( (nlls[np.argmin(nlls)],sigma_vals[np.argmin(nlls)])))\n",
|
||||
"# Plot the best model\n",
|
||||
"sigma= sigma_vals[np.argmin(nlls)]\n",
|
||||
"y_model = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
|
||||
"plot_univariate_regression(x_model, y_model, x_train, y_train, sigma_model = sigma, title=\"beta1=%3.3f, sigma =%3.3f\"%(beta_1[0,0],sigma))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "XH7yER52Dxt5"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Obviously, to fit the full neural model we would vary all of the 10 parameters of the network in the $\\boldsymbol\\beta_{0},\\boldsymbol\\omega_{0},\\boldsymbol\\beta_{1},\\boldsymbol\\omega_{1}$ (and maybe $\\sigma$) until we find the combination that have the maximum likelihood / minimum negative log likelihood / least squares.<br><br>\n",
|
||||
"\n",
|
||||
"Here we just varied one at a time as it is easier to see what is going on. This is known as **coordinate descent**.\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "q_KeGNAHEbIt"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
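The coordinate descent idea from the closing remark can be sketched in a few lines: cycle through the parameters, minimizing the loss over one coordinate at a time (here by grid search) while holding the others fixed. The toy quadratic objective and function names below are purely illustrative:

```python
import numpy as np

def coordinate_descent(loss, params, grid, n_sweeps=5):
    # Repeatedly sweep over coordinates, line-searching each on a fixed grid
    params = np.array(params, dtype=float)
    for _ in range(n_sweeps):
        for i in range(len(params)):
            candidates = []
            for v in grid:
                trial = params.copy()
                trial[i] = v
                candidates.append((loss(trial), v))
            params[i] = min(candidates)[1]  # keep the best value for coordinate i
    return params

# Toy least-squares objective with minimum at (0.3, -0.7)
loss = lambda p: (p[0] - 0.3) ** 2 + (p[1] + 0.7) ** 2
best = coordinate_descent(loss, [0.0, 0.0], grid=np.arange(-1, 1, 0.01))
```

Gradient-based optimizers update all coordinates at once and are what we use in practice, but the one-at-a-time sweeps above match how this practical explored the loss surface.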
453
CM20315_Loss_II.ipynb
Normal file
@@ -0,0 +1,453 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyMLgMUtCcJHjIzHTTqjKVt1",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Loss_II.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Loss functions part II\n",
|
||||
"\n",
|
||||
"This practical investigates loss functions. In part I we investigated univariate regression (where the output data $y$ is continuous. Our formulation was based on the normal/Gaussian distribution.\n",
|
||||
"\n",
|
||||
"In this notebook, we investigate binary classification (where the output data is 0 or 1). This will be based on the Bernouilli distribution\n",
|
||||
"\n",
|
||||
"In part III we'll investigate multiclass classification (where the outputs data can take multiple values 1,... K.\n",
|
||||
"\n",
|
||||
"We'll compute loss functions with maximum likelihood and minimum negative log likelihood."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "jSlFkICHwHQF"
|
||||
}
|
||||
},
|
||||
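For binary classification, the real-valued model output must be squeezed into a valid probability $\lambda \in [0,1]$, typically with the logistic sigmoid; the Bernoulli negative log likelihood then becomes the familiar binary cross-entropy. A minimal sketch ahead of the notebook's own exercises (the function names and example values are illustrative assumptions):

```python
import numpy as np

def sigmoid(z):
    # Map a real-valued model output to lambda = Pr(y=1|x)
    return 1.0 / (1.0 + np.exp(-z))

def bernoulli_nll(y, lam):
    # Negative log likelihood of binary labels y under Bernoulli(lambda)
    return -np.sum(y * np.log(lam) + (1 - y) * np.log(1 - lam))

y = np.array([0, 1, 1])                      # example binary labels
lam = sigmoid(np.array([-2.0, 1.0, 3.0]))    # example model outputs -> probabilities
nll = bernoulli_nll(y, lam)
```

Note the loss penalizes confident wrong predictions heavily: as `lam` approaches 1 for a label of 0 (or vice versa), `-log` of the assigned probability grows without bound.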
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "PYMZ1x-Pv1ht"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Imports math library\n",
|
||||
"import numpy as np\n",
|
||||
"# Imports plotting library\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"# Import math Library\n",
|
||||
"import math"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Define the Rectified Linear Unit (ReLU) function\n",
|
||||
"def ReLU(preactivation):\n",
|
||||
" activation = preactivation.clip(0.0)\n",
|
||||
" return activation\n",
|
||||
"\n",
|
||||
"# Define a shallow neural network\n",
|
||||
"def shallow_nn(x, beta_0, omega_0, beta_1, omaga_1):\n",
|
||||
" # Make sure that input data is (1 x n_data) array\n",
|
||||
" n_data = x.size\n",
|
||||
" x = np.reshape(x,(1,n_data))\n",
|
||||
"\n",
|
||||
" # This runs the network for ALL of the inputs, x at once so we can draw graph\n",
|
||||
" h1 = ReLU(np.matmul(beta_0,np.ones((1,n_data))) + np.matmul(omega_0,x))\n",
|
||||
" model_out = np.matmul(beta_1,np.ones((1,n_data))) + np.matmul(omega_1,h1)\n",
|
||||
" return model_out"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Fv7SZR3tv7mV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
{
"cell_type": "code",
"source": [
"# Utility function for plotting data\n",
"def plot_binary_classification(x_model, out_model, lambda_model, x_data = None, y_data = None, title= None):\n",
"  # Make sure model data are 1D arrays\n",
"  x_model = np.squeeze(x_model)\n",
"  out_model = np.squeeze(out_model)\n",
"  lambda_model = np.squeeze(lambda_model)\n",
"\n",
"  fig, ax = plt.subplots(1,2)\n",
"  fig.set_size_inches(7.0, 3.5)\n",
"  fig.tight_layout(pad=3.0)\n",
"  ax[0].plot(x_model,out_model)\n",
"  ax[0].set_xlabel('Input, $x$'); ax[0].set_ylabel('Model output')\n",
"  ax[0].set_xlim([0,1]);ax[0].set_ylim([-4,4])\n",
"  if title is not None:\n",
"    ax[0].set_title(title)\n",
"  ax[1].plot(x_model,lambda_model)\n",
"  ax[1].set_xlabel('Input, $x$'); ax[1].set_ylabel('$\\lambda$ or Pr(y=1|x)')\n",
"  ax[1].set_xlim([0,1]);ax[1].set_ylim([-0.05,1.05])\n",
"  if title is not None:\n",
"    ax[1].set_title(title)\n",
"  if x_data is not None:\n",
"    ax[1].plot(x_data, y_data, 'ko')\n",
"  plt.show()"
],
"metadata": {
"id": "NRR67ri_1TzN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Binary classification"
],
"metadata": {
"id": "PsgLZwsPxauP"
}
},
{
"cell_type": "code",
"source": [
"# Get parameters for model -- we can call this function to easily reset them\n",
"def get_parameters():\n",
"  # And we'll create a network that approximately fits it\n",
"  beta_0 = np.zeros((3,1));  # formerly theta_x0\n",
"  omega_0 = np.zeros((3,1));  # formerly theta_x1\n",
"  beta_1 = np.zeros((1,1));  # formerly phi_0\n",
"  omega_1 = np.zeros((1,3));  # formerly phi_x\n",
"\n",
"  beta_0[0,0] = 0.3; beta_0[1,0] = -1.0; beta_0[2,0] = -0.5\n",
"  omega_0[0,0] = -1.0; omega_0[1,0] = 1.8; omega_0[2,0] = 0.65\n",
"  beta_1[0,0] = 2.6;\n",
"  omega_1[0,0] = -24.0; omega_1[0,1] = -8.0; omega_1[0,2] = 50.0\n",
"\n",
"  return beta_0, omega_0, beta_1, omega_1"
],
"metadata": {
"id": "pUT9Ain_HRim"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Sigmoid function that maps [-infty,infty] to [0,1]\n",
"def sigmoid(model_out):\n",
"  # TODO -- implement the logistic sigmoid function\n",
"  # Replace this line:\n",
"  sig_model_out = np.zeros_like(model_out)\n",
"  return sig_model_out"
],
"metadata": {
"id": "uFb8h-9IXnIe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's create some 1D training data\n",
"x_train = np.array([0.09291784,0.46809093,0.93089486,0.67612654,0.73441752,0.86847339,\\\n",
"                    0.49873225,0.51083168,0.18343972,0.99380898,0.27840809,0.38028817,\\\n",
"                    0.12055708,0.56715537,0.92005746,0.77072270,0.85278176,0.05315950,\\\n",
"                    0.87168699,0.58858043])\n",
"y_train = np.array([0,1,1,0,0,1,\\\n",
"                    1,0,0,1,0,1,\\\n",
"                    0,1,1,0,1,0,\\\n",
"                    1,1])\n",
"\n",
"# Get parameters for the model\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"\n",
"# Define a range of input values\n",
"x_model = np.arange(0,1,0.01)\n",
"# Run the model to get values to plot and plot it.\n",
"model_out = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
"lambda_model = sigmoid(model_out)\n",
"plot_binary_classification(x_model, model_out, lambda_model, x_train, y_train)\n"
],
"metadata": {
"id": "VWzNOt1swFVd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"The left plot shows the model output and the right shows the model output after the sigmoid has been applied, so it now lies in the range [0,1] and represents the probability that y=1. The black dots show the training data. We'll compute the likelihood and the negative log likelihood."
],
"metadata": {
"id": "MvVX6tl9AEXF"
}
},
{
"cell_type": "code",
"source": [
"# Return probability under Bernoulli distribution for input y\n",
"def bernoulli_distribution(y, lambda_param):\n",
"  # TODO -- write in the equation for the Bernoulli distribution\n",
"  # Equation 5.17 from the notes (you will need np.power)\n",
"  # Replace the line below\n",
"  prob = np.zeros_like(y)\n",
"  return prob"
],
"metadata": {
"id": "YaLdRlEX0FkU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's double check we get the right answer before proceeding\n",
"print(\"Correct answer = %3.3f, Your answer = %3.3f\"%(0.8,bernoulli_distribution(0,0.2)))\n",
"print(\"Correct answer = %3.3f, Your answer = %3.3f\"%(0.2,bernoulli_distribution(1,0.2)))"
],
"metadata": {
"id": "4TSL14dqHHbV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's compute the likelihood using this function"
],
"metadata": {
"id": "R5z_0dzQMF35"
}
},
{
"cell_type": "code",
"source": [
"# Return the likelihood of all of the data under the model\n",
"def compute_likelihood(y_train, lambda_param):\n",
"  # TODO -- compute the likelihood of the data -- the product of the Bernoulli probabilities for each data point\n",
"  # Top line of equation 5.3 in the notes\n",
"  # You will need np.prod() and the bernoulli_distribution function you used above\n",
"  # Replace the line below\n",
"  likelihood = 0\n",
"  return likelihood"
],
"metadata": {
"id": "sk4EJSPQ41CK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's test this\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"# Use our neural network to predict the parameter lambda of the Bernoulli distribution\n",
"model_out = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
"lambda_train = sigmoid(model_out)\n",
"# Compute the likelihood\n",
"likelihood = compute_likelihood(y_train, lambda_train)\n",
"# Let's double check we get the right answer before proceeding\n",
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(0.000070237,likelihood))"
],
"metadata": {
"id": "1hQxBLoVNlr2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"You can see that this gives a very small answer, even for this small 1D dataset, and with the model fitting quite well. This is because it is the product of several probabilities, which are all quite small themselves.\n",
"This will get out of hand pretty quickly with real datasets -- the likelihood will get so small that we can't represent it with normal finite-precision math.\n",
"\n",
"This is why we use the negative log likelihood."
],
"metadata": {
"id": "HzphKgPfOvlk"
}
},
{
"cell_type": "code",
"source": [
"# Return the negative log likelihood of the data under the model\n",
"def compute_negative_log_likelihood(y_train, lambda_param):\n",
"  # TODO -- compute the negative log likelihood of the data -- don't use the likelihood function above -- compute the negative sum of the log probabilities\n",
"  # You will need np.sum(), np.log()\n",
"  # Replace the line below\n",
"  nll = 0\n",
"  return nll"
],
"metadata": {
"id": "dsT0CWiKBmTV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's test this\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"# Use our neural network to predict the parameter lambda of the Bernoulli distribution\n",
"model_out = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
"# Pass the model outputs through the sigmoid function\n",
"lambda_train = sigmoid(model_out)\n",
"# Compute the negative log likelihood\n",
"nll = compute_negative_log_likelihood(y_train, lambda_train)\n",
"# Let's double check we get the right answer before proceeding\n",
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(9.563639387,nll))"
],
"metadata": {
"id": "nVxUXg9rQmwI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's investigate finding the maximum likelihood / minimum negative log likelihood solution. For simplicity, we'll assume that all the parameters are fixed except one and look at how the likelihood and negative log likelihood change as we manipulate the last parameter. We'll start with the overall y-offset, beta_1 (formerly phi_0)."
],
"metadata": {
"id": "OgcRojvPWh4V"
}
},
{
"cell_type": "code",
"source": [
"# Return the likelihood of all of the data under the model\n",
"def compute_likelihood(y_train, lambda_param):\n",
"  # TODO -- compute the likelihood of the data -- the product of the Bernoulli probabilities for each data point\n",
"  # Top line of equation 5.3 in the notes\n",
"  # You will need np.prod() and the bernoulli_distribution function you used above\n",
"  # Replace the line below\n",
"  likelihood = 0\n",
"  return likelihood"
],
"metadata": {
"id": "zpS7o6liCx7f"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Define a range of values for the parameter\n",
"beta_1_vals = np.arange(-2,6.0,0.1)\n",
"# Create some arrays to store the likelihoods and negative log likelihoods\n",
"likelihoods = np.zeros_like(beta_1_vals)\n",
"nlls = np.zeros_like(beta_1_vals)\n",
"\n",
"# Initialise the parameters\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"for count in range(len(beta_1_vals)):\n",
"  # Set the value for the parameter\n",
"  beta_1[0,0] = beta_1_vals[count]\n",
"  # Run the network with new parameters\n",
"  model_out = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
"  lambda_train = sigmoid(model_out)\n",
"  # Compute and store the two values\n",
"  likelihoods[count] = compute_likelihood(y_train,lambda_train)\n",
"  nlls[count] = compute_negative_log_likelihood(y_train, lambda_train)\n",
"  # Draw the model for every 20th parameter setting\n",
"  if count % 20 == 0:\n",
"    # Run the model to get values to plot and plot it.\n",
"    model_out = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
"    lambda_model = sigmoid(model_out)\n",
"    plot_binary_classification(x_model, model_out, lambda_model, x_train, y_train, title=\"beta_1[0]=%3.3f\"%(beta_1[0,0]))\n"
],
"metadata": {
"id": "pFKtDaAeVU4U"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Now let's plot the likelihood and negative log likelihood as a function of the value of the offset beta_1\n",
"fig, ax = plt.subplots(1,2)\n",
"fig.set_size_inches(10.5, 3.5)\n",
"fig.tight_layout(pad=3.0)\n",
"ax[0].plot(beta_1_vals, likelihoods); ax[0].set_xlabel('beta_1[0]'); ax[0].set_ylabel('likelihood')\n",
"ax[1].plot(beta_1_vals, nlls); ax[1].set_xlabel('beta_1[0]'); ax[1].set_ylabel('negative log likelihood')\n",
"plt.show()"
],
"metadata": {
"id": "UHXeTa9MagO6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Hopefully, you can see that the maximum of the likelihood function is at the same position\n",
"# as the minimum of the negative log likelihood\n",
"# Let's check that:\n",
"print(\"Maximum likelihood = %f, at beta_1=%3.3f\"%( (likelihoods[np.argmax(likelihoods)],beta_1_vals[np.argmax(likelihoods)])))\n",
"print(\"Minimum negative log likelihood = %f, at beta_1=%3.3f\"%( (nlls[np.argmin(nlls)],beta_1_vals[np.argmin(nlls)])))\n",
"\n",
"# Plot the best model\n",
"beta_1[0,0] = beta_1_vals[np.argmin(nlls)]\n",
"model_out = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
"lambda_model = sigmoid(model_out)\n",
"plot_binary_classification(x_model, model_out, lambda_model, x_train, y_train, title=\"beta_1[0]=%3.3f\"%(beta_1[0,0]))\n"
],
"metadata": {
"id": "aDEPhddNdN4u"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"They both give the same answer. But you can see from the plot above that the likelihood is very small unless the parameters are almost correct, so in practice we would work with the negative log likelihood.<br><br>\n",
"\n",
"Again, to fit the full neural model we would vary all 10 parameters of the network in $\\boldsymbol\\beta_{0},\\boldsymbol\\omega_{0},\\boldsymbol\\beta_{1},\\boldsymbol\\omega_{1}$ until we find the combination that has the maximum likelihood / minimum negative log likelihood.<br><br>\n",
"\n"
],
"metadata": {
"id": "771G8N1Vk5A2"
}
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "eQ4xeiDgOn0X"
},
"execution_count": null,
"outputs": []
}
]
}
447
CM20315_Loss_III.ipynb
Normal file
@@ -0,0 +1,447 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNl/KjOshENtrwKt/IdwaUO",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Loss_III.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Loss functions part III\n",
"\n",
"This practical investigates loss functions. In part I we investigated univariate regression (where the output data $y$ is continuous). Our formulation was based on the normal/Gaussian distribution.\n",
"In part II we investigated binary classification (where the output data is 0 or 1). This was based on the Bernoulli distribution.<br><br>\n",
"\n",
"Now we'll investigate multiclass classification (where the output data can take multiple values 1,...,K), which is based on the categorical distribution.\n",
"\n",
"We'll compute loss functions with maximum likelihood and minimum negative log likelihood."
],
"metadata": {
"id": "jSlFkICHwHQF"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PYMZ1x-Pv1ht"
},
"outputs": [],
"source": [
"# Imports math library\n",
"import numpy as np\n",
"# Used for repmat\n",
"import numpy.matlib\n",
"# Imports plotting library\n",
"import matplotlib.pyplot as plt\n",
"# Import math library\n",
"import math"
]
},
{
"cell_type": "code",
"source": [
"# Define the Rectified Linear Unit (ReLU) function\n",
"def ReLU(preactivation):\n",
"  activation = preactivation.clip(0.0)\n",
"  return activation\n",
"\n",
"# Define a shallow neural network\n",
"def shallow_nn(x, beta_0, omega_0, beta_1, omega_1):\n",
"  # Make sure that input data is (1 x n_data) array\n",
"  n_data = x.size\n",
"  x = np.reshape(x,(1,n_data))\n",
"\n",
"  # This runs the network for ALL of the inputs, x at once so we can draw graph\n",
"  h1 = ReLU(np.matmul(beta_0,np.ones((1,n_data))) + np.matmul(omega_0,x))\n",
"  model_out = np.matmul(beta_1,np.ones((1,n_data))) + np.matmul(omega_1,h1)\n",
"  return model_out"
],
"metadata": {
"id": "Fv7SZR3tv7mV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Utility function for plotting data\n",
"def plot_multiclass_classification(x_model, out_model, lambda_model, x_data = None, y_data = None, title= None):\n",
"  # Make sure model data are 1D arrays\n",
"  n_data = len(x_model)\n",
"  n_class = 3\n",
"  x_model = np.squeeze(x_model)\n",
"  out_model = np.reshape(out_model, (n_class,n_data))\n",
"  lambda_model = np.reshape(lambda_model, (n_class,n_data))\n",
"\n",
"  fig, ax = plt.subplots(1,2)\n",
"  fig.set_size_inches(7.0, 3.5)\n",
"  fig.tight_layout(pad=3.0)\n",
"  ax[0].plot(x_model,out_model[0,:],'r-')\n",
"  ax[0].plot(x_model,out_model[1,:],'g-')\n",
"  ax[0].plot(x_model,out_model[2,:],'b-')\n",
"  ax[0].set_xlabel('Input, $x$'); ax[0].set_ylabel('Model outputs')\n",
"  ax[0].set_xlim([0,1]);ax[0].set_ylim([-4,4])\n",
"  if title is not None:\n",
"    ax[0].set_title(title)\n",
"  ax[1].plot(x_model,lambda_model[0,:],'r-')\n",
"  ax[1].plot(x_model,lambda_model[1,:],'g-')\n",
"  ax[1].plot(x_model,lambda_model[2,:],'b-')\n",
"  ax[1].set_xlabel('Input, $x$'); ax[1].set_ylabel('$\\lambda$ or Pr(y=k|x)')\n",
"  ax[1].set_xlim([0,1]);ax[1].set_ylim([-0.1,1.05])\n",
"  if title is not None:\n",
"    ax[1].set_title(title)\n",
"  if x_data is not None:\n",
"    for i in range(len(x_data)):\n",
"      if y_data[i] ==0:\n",
"        ax[1].plot(x_data[i],-0.05, 'r.')\n",
"      if y_data[i] ==1:\n",
"        ax[1].plot(x_data[i],-0.05, 'g.')\n",
"      if y_data[i] ==2:\n",
"        ax[1].plot(x_data[i],-0.05, 'b.')\n",
"  plt.show()"
],
"metadata": {
"id": "NRR67ri_1TzN"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Multiclass classification"
],
"metadata": {
"id": "PsgLZwsPxauP"
}
},
{
"cell_type": "code",
"source": [
"# Get parameters for model -- we can call this function to easily reset them\n",
"def get_parameters():\n",
"  # And we'll create a network that approximately fits it\n",
"  beta_0 = np.zeros((3,1));  # formerly theta_x0\n",
"  omega_0 = np.zeros((3,1));  # formerly theta_x1\n",
"  beta_1 = np.zeros((3,1));  # NOTE -- there are three outputs now (one for each class, so three output biases)\n",
"  omega_1 = np.zeros((3,3));  # NOTE -- there are three outputs now (one for each class, so nine output weights, connecting 3 hidden units to 3 outputs)\n",
"\n",
"  beta_0[0,0] = 0.3; beta_0[1,0] = -1.0; beta_0[2,0] = -0.5\n",
"  omega_0[0,0] = -1.0; omega_0[1,0] = 1.8; omega_0[2,0] = 0.65\n",
"  beta_1[0,0] = 2.0; beta_1[1,0] = -2.0; beta_1[2,0] = 0.0\n",
"  omega_1[0,0] = -24.0; omega_1[0,1] = -8.0; omega_1[0,2] = 50.0\n",
"  omega_1[1,0] = -2.0; omega_1[1,1] = 8.0; omega_1[1,2] = -30.0\n",
"  omega_1[2,0] = 16.0; omega_1[2,1] = -8.0; omega_1[2,2] = -8.0\n",
"\n",
"  return beta_0, omega_0, beta_1, omega_1"
],
"metadata": {
"id": "pUT9Ain_HRim"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Softmax function that maps a vector of arbitrary values to a vector of values that are positive and sum to one.\n",
"def softmax(model_out):\n",
"  # This operation has to be done separately for every column of the input\n",
"  # Compute exponentials of all the elements\n",
"  exp_model_out = np.exp(model_out)\n",
"  # Sum down the columns\n",
"  sum_exp_model_out = np.sum(exp_model_out, axis=0)\n",
"  # Divide to normalize\n",
"  softmax_model_out = exp_model_out/np.matlib.repmat(sum_exp_model_out, exp_model_out.shape[0], 1)\n",
"  return softmax_model_out"
],
"metadata": {
"id": "uFb8h-9IXnIe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's create some 1D training data\n",
"x_train = np.array([0.09291784,0.46809093,0.93089486,0.67612654,0.73441752,0.86847339,\\\n",
"                    0.49873225,0.51083168,0.18343972,0.99380898,0.27840809,0.38028817,\\\n",
"                    0.12055708,0.56715537,0.92005746,0.77072270,0.85278176,0.05315950,\\\n",
"                    0.87168699,0.58858043])\n",
"y_train = np.array([2,0,1,2,1,0,\\\n",
"                    0,2,2,0,2,0,\\\n",
"                    2,0,1,2,1,2,\\\n",
"                    1,0])\n",
"\n",
"# Get parameters for the model\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"\n",
"# Define a range of input values\n",
"x_model = np.arange(0,1,0.01)\n",
"# Run the model to get values to plot and plot it.\n",
"model_out = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
"lambda_model = softmax(model_out)\n",
"plot_multiclass_classification(x_model, model_out, lambda_model, x_train, y_train)\n"
],
"metadata": {
"id": "VWzNOt1swFVd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"The left plot shows the model outputs and the right shows the model outputs after the softmax has been applied, so they now lie in the range [0,1] and represent the probabilities that y=0 (red), 1 (green), and 2 (blue). The dots at the bottom show the training data with the same color scheme. So we want the red curve to be high where there are red dots, the green curve to be high where there are green dots, and the blue curve to be high where there are blue dots. We'll compute the likelihood and the negative log likelihood."
],
"metadata": {
"id": "MvVX6tl9AEXF"
}
},
{
"cell_type": "code",
"source": [
"# Return probability under categorical distribution for input y\n",
"# Slightly complicated code, but it just takes the value from row k of lambda_param where y==k\n",
"def categorical_distribution(y, lambda_param):\n",
"  prob = np.zeros_like(y)\n",
"  for row_index in range(lambda_param.shape[0]):\n",
"    prob = prob + ((y==row_index).astype(int)) * lambda_param[row_index,:]\n",
"  return prob"
],
"metadata": {
"id": "YaLdRlEX0FkU"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's double check we get the right answer before proceeding\n",
"print(\"Correct answer = %3.3f, Your answer = %3.3f\"%(0.2,categorical_distribution(np.array([[0]]),np.array([[0.2],[0.5],[0.3]]))))\n",
"print(\"Correct answer = %3.3f, Your answer = %3.3f\"%(0.5,categorical_distribution(np.array([[1]]),np.array([[0.2],[0.5],[0.3]]))))\n",
"print(\"Correct answer = %3.3f, Your answer = %3.3f\"%(0.3,categorical_distribution(np.array([[2]]),np.array([[0.2],[0.5],[0.3]]))))\n",
"\n"
],
"metadata": {
"id": "4TSL14dqHHbV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's compute the likelihood using this function"
],
"metadata": {
"id": "R5z_0dzQMF35"
}
},
{
"cell_type": "code",
"source": [
"# Return the likelihood of all of the data under the model\n",
"def compute_likelihood(y_train, lambda_param):\n",
"  # TODO -- compute the likelihood of the data -- the product of the categorical probabilities for each data point\n",
"  # Top line of equation 5.3 in the notes\n",
"  # You will need np.prod() and the categorical_distribution function you used above\n",
"  # Replace the line below\n",
"  likelihood = 0\n",
"  return likelihood"
],
"metadata": {
"id": "zpS7o6liCx7f"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's test this\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"# Use our neural network to predict the parameters lambda of the categorical distribution\n",
"model_out = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
"lambda_train = softmax(model_out)\n",
"# Compute the likelihood\n",
"likelihood = compute_likelihood(y_train, lambda_train)\n",
"# Let's double check we get the right answer before proceeding\n",
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(0.000000041,likelihood))"
],
"metadata": {
"id": "1hQxBLoVNlr2"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"You can see that this gives a very small answer, even for this small 1D dataset, and with the model fitting quite well. This is because it is the product of several probabilities, which are all quite small themselves.\n",
"This will get out of hand pretty quickly with real datasets -- the likelihood will get so small that we can't represent it with normal finite-precision math.\n",
"\n",
"This is why we use the negative log likelihood."
],
"metadata": {
"id": "HzphKgPfOvlk"
}
},
{
"cell_type": "code",
"source": [
"# Return the negative log likelihood of the data under the model\n",
"def compute_negative_log_likelihood(y_train, lambda_param):\n",
"  # TODO -- compute the negative log likelihood of the data -- don't use the likelihood function above -- compute the negative sum of the log probabilities\n",
"  # You will need np.sum(), np.log()\n",
"  # Replace the line below\n",
"  nll = 0\n",
"  return nll"
],
"metadata": {
"id": "dsT0CWiKBmTV"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Let's test this\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"# Use our neural network to predict the parameters lambda of the categorical distribution\n",
"model_out = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
"# Pass the model outputs through the softmax function\n",
"lambda_train = softmax(model_out)\n",
"# Compute the negative log likelihood\n",
"nll = compute_negative_log_likelihood(y_train, lambda_train)\n",
"# Let's double check we get the right answer before proceeding\n",
"print(\"Correct answer = %9.9f, Your answer = %9.9f\"%(17.015457867,nll))"
],
"metadata": {
"id": "nVxUXg9rQmwI"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's investigate finding the maximum likelihood / minimum negative log likelihood solution. For simplicity, we'll assume that all the parameters are fixed except one and look at how the likelihood and negative log likelihood change as we manipulate the last parameter. We'll start with the overall y-offset, beta_1 (formerly phi_0)."
],
"metadata": {
"id": "OgcRojvPWh4V"
}
},
{
"cell_type": "code",
"source": [
"# Define a range of values for the parameter\n",
"beta_1_vals = np.arange(-2,6.0,0.1)\n",
"# Create some arrays to store the likelihoods and negative log likelihoods\n",
"likelihoods = np.zeros_like(beta_1_vals)\n",
"nlls = np.zeros_like(beta_1_vals)\n",
"\n",
"# Initialise the parameters\n",
"beta_0, omega_0, beta_1, omega_1 = get_parameters()\n",
"for count in range(len(beta_1_vals)):\n",
"  # Set the value for the parameter\n",
"  beta_1[0,0] = beta_1_vals[count]\n",
"  # Run the network with new parameters\n",
"  model_out = shallow_nn(x_train, beta_0, omega_0, beta_1, omega_1)\n",
"  lambda_train = softmax(model_out)\n",
"  # Compute and store the two values\n",
"  likelihoods[count] = compute_likelihood(y_train,lambda_train)\n",
"  nlls[count] = compute_negative_log_likelihood(y_train, lambda_train)\n",
"  # Draw the model for every 20th parameter setting\n",
"  if count % 20 == 0:\n",
"    # Run the model to get values to plot and plot it.\n",
"    model_out = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
"    lambda_model = softmax(model_out)\n",
"    plot_multiclass_classification(x_model, model_out, lambda_model, x_train, y_train, title=\"beta_1[0,0]=%3.3f\"%(beta_1[0,0]))\n"
],
"metadata": {
"id": "pFKtDaAeVU4U"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Now let's plot the likelihood and negative log likelihood as a function of the value of the offset beta_1\n",
"fig, ax = plt.subplots(1,2)\n",
"fig.set_size_inches(10.5, 3.5)\n",
"fig.tight_layout(pad=3.0)\n",
"ax[0].plot(beta_1_vals, likelihoods); ax[0].set_xlabel('beta_1[0,0]'); ax[0].set_ylabel('likelihood')\n",
"ax[1].plot(beta_1_vals, nlls); ax[1].set_xlabel('beta_1[0,0]'); ax[1].set_ylabel('negative log likelihood')\n",
"plt.show()"
],
"metadata": {
"id": "UHXeTa9MagO6"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Hopefully, you can see that the maximum of the likelihood function is at the same position\n",
"# as the minimum of the negative log likelihood\n",
"# Let's check that:\n",
"print(\"Maximum likelihood = %f, at beta_1=%3.3f\"%( (likelihoods[np.argmax(likelihoods)],beta_1_vals[np.argmax(likelihoods)])))\n",
"print(\"Minimum negative log likelihood = %f, at beta_1=%3.3f\"%( (nlls[np.argmin(nlls)],beta_1_vals[np.argmin(nlls)])))\n",
"\n",
"# Plot the best model\n",
"beta_1[0,0] = beta_1_vals[np.argmin(nlls)]\n",
"model_out = shallow_nn(x_model, beta_0, omega_0, beta_1, omega_1)\n",
"lambda_model = softmax(model_out)\n",
"plot_multiclass_classification(x_model, model_out, lambda_model, x_train, y_train, title=\"beta_1[0,0]=%3.3f\"%(beta_1[0,0]))\n"
],
"metadata": {
"id": "aDEPhddNdN4u"
},
"execution_count": null,
"outputs": []
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"They both give the same answer. But you can see from the likelihood above that the likelihood is very small unless the parameters are almost correct. So in practice, we would work with the negative log likelihood.<br><br>\n",
|
||||
"\n",
|
||||
"Again, to fit the full neural model we would vary all 16 parameters of the network in $\\boldsymbol\\beta_{0},\\boldsymbol\\omega_{0},\\boldsymbol\\beta_{1},\\boldsymbol\\omega_{1}$ until we find the combination that has the maximum likelihood / minimum negative log likelihood.<br><br>\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "771G8N1Vk5A2"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
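The check above (that the likelihood maximum and the negative-log-likelihood minimum coincide) can be reproduced in a tiny standalone sketch. The two-class model, inputs, labels, and parameter sweep below are made up for illustration; they are not the notebook's network:

```python
import numpy as np

def softmax(z):
    # Numerically stable softmax over the class dimension (rows)
    e = np.exp(z - np.max(z, axis=0, keepdims=True))
    return e / np.sum(e, axis=0, keepdims=True)

x = np.linspace(-1.0, 1.0, 20)           # toy inputs
y = (x > 0).astype(int)                  # toy labels: class 1 when x > 0
y[0] = 1                                 # one mislabeled point keeps the optimum finite
beta_vals = np.linspace(-6.0, 6.0, 121)  # sweep of the single toy parameter
likelihoods = np.zeros_like(beta_vals)
nlls = np.zeros_like(beta_vals)
for i, b in enumerate(beta_vals):
    # Class-0 logit depends on the swept parameter b; class-1 logit is fixed at 0
    lam = softmax(np.vstack((b * x, np.zeros_like(x))))
    p = lam[y, np.arange(len(x))]        # probability assigned to the true class
    likelihoods[i] = np.prod(p)
    nlls[i] = -np.sum(np.log(p))

print("argmax likelihood at beta=%3.3f, argmin NLL at beta=%3.3f"
      % (beta_vals[np.argmax(likelihoods)], beta_vals[np.argmin(nlls)]))
```

Because the negative log likelihood is a monotone transform of the likelihood, the two positions always agree; the sum of logs is simply better behaved numerically than the product.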
629
CM20315_Shallow.ipynb
Normal file
File diff suppressed because one or more lines are too long
427
CM20315_Training_II.ipynb
Normal file
@@ -0,0 +1,427 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyMwBvBF5E8ERRBCqN9x6Dp5",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Training_II.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Training II\n",
|
||||
"\n",
|
||||
"We now have a model and a loss function which we can use to judge how good that model is. It's time to put the \"learning\" into machine learning.\n",
|
||||
"\n",
|
||||
"Learning involves finding the parameters that minimize the loss. That might seem like it's not too hard, but modern models can have billions of parameters. There's an exponential number of possible parameter combinations, and there's no way we can make any progress with exhaustive search.\n",
|
||||
"\n",
|
||||
"In part I we considered 1D search using a bracketing approach. In this part, we'll extend this to fit the linear regression model (which has a convex loss function). Then in part III, we'll consider a non-convex loss function and implement stochastic gradient descent.\n",
|
||||
"\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "el8l05WQEO46"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "xhmIOLiZELV_"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import libraries\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from matplotlib import cm\n",
|
||||
"from matplotlib.colors import ListedColormap"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's create our training data 12 pairs {x_i, y_i}\n",
|
||||
"# We'll try to fit the straight line model to these data\n",
|
||||
"data = np.array([[0.03,0.19,0.34,0.46,0.78,0.81,1.08,1.18,1.39,1.60,1.65,1.90],\n",
|
||||
" [0.67,0.85,1.05,1.00,1.40,1.50,1.30,1.54,1.55,1.68,1.73,1.60]])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4cRkrh9MZ58Z"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's define our model\n",
|
||||
"def model(phi,x):\n",
|
||||
" y_pred = phi[0]+phi[1] * x\n",
|
||||
" return y_pred"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "WQUERmb2erAe"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Draw model\n",
|
||||
"def draw_model(data,model,phi,title=None):\n",
|
||||
" x_model = np.arange(0,2,0.01)\n",
|
||||
" y_model = model(phi,x_model)\n",
|
||||
"\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
" ax.plot(data[0,:],data[1,:],'bo')\n",
|
||||
" ax.plot(x_model,y_model,'m-')\n",
|
||||
" ax.set_xlim([0,2]);ax.set_ylim([0,2])\n",
|
||||
" ax.set_xlabel('x'); ax.set_ylabel('y')\n",
|
||||
" ax.set_aspect('equal')\n",
|
||||
" if title is not None:\n",
|
||||
" ax.set_title(title)\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "qFRe9POHF2le"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the parameters and draw the model\n",
|
||||
"phi = np.zeros((2,1))\n",
|
||||
"phi[0] = 0.6 # Intercept\n",
|
||||
"phi[1] = -0.2 # Slope\n",
|
||||
"draw_model(data,model,phi, \"Initial parameters\")\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "TXx1Tpd1Tl-I"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's compute the sum of squares loss for the training data"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "QU5mdGvpTtEG"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def compute_loss(data_x, data_y, model, phi):\n",
|
||||
" # TODO -- Write this function -- replace the line below\n",
|
||||
" # First make model predictions from data x\n",
|
||||
" # Then compute the squared difference between the predictions and true y values\n",
|
||||
" # Then sum them all and return\n",
|
||||
" loss = 0\n",
|
||||
" return loss"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "I7dqTY2Gg7CR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
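One possible completion of the exercise above (a sketch, not the official solution; the model and the training data are repeated so the snippet stands alone):

```python
import numpy as np

def model(phi, x):
    # Straight-line model from the notebook: y = phi[0] + phi[1] * x
    return phi[0] + phi[1] * x

def compute_loss(data_x, data_y, model, phi):
    # Sum-of-squares loss: predictions, squared residuals, then the sum
    pred_y = model(phi, data_x)
    return np.sum((pred_y - data_y) ** 2)

data = np.array([[0.03,0.19,0.34,0.46,0.78,0.81,1.08,1.18,1.39,1.60,1.65,1.90],
                 [0.67,0.85,1.05,1.00,1.40,1.50,1.30,1.54,1.55,1.68,1.73,1.60]])
loss = compute_loss(data[0,:], data[1,:], model, np.array([[0.6],[-0.2]]))
print('%3.3f' % loss)  # matches the check value 12.367
```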
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Let's just test that we got that right"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "eB5DQvU5hYNx"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n",
|
||||
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 12.367))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Ty05UtEEg9tc"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's plot the whole loss function"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "F3trnavPiHpH"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
|
||||
" # Define pretty colormap\n",
|
||||
" my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 
'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
|
||||
" my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
|
||||
" r = np.floor(my_colormap_vals_dec/(256*256))\n",
|
||||
" g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
|
||||
" b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
|
||||
" my_colormap = ListedColormap(np.vstack((r,g,b)).transpose()/255.0)\n",
|
||||
"\n",
|
||||
" # Make grid of intercept/slope values to plot\n",
|
||||
" intercepts_mesh, slopes_mesh = np.meshgrid(np.arange(0.0,2.0,0.02), np.arange(-1.0,1.0,0.002))\n",
|
||||
" loss_mesh = np.zeros_like(slopes_mesh)\n",
|
||||
" # Compute loss for every set of parameters\n",
|
||||
" for idslope, slope in np.ndenumerate(slopes_mesh):\n",
|
||||
" loss_mesh[idslope] = compute_loss(data[0,:], data[1,:], model, np.array([[intercepts_mesh[idslope]], [slope]]))\n",
|
||||
"\n",
|
||||
" fig,ax = plt.subplots()\n",
|
||||
" fig.set_size_inches(8,8)\n",
|
||||
" ax.contourf(intercepts_mesh,slopes_mesh,loss_mesh,256,cmap=my_colormap)\n",
|
||||
" ax.contour(intercepts_mesh,slopes_mesh,loss_mesh,40,colors=['#80808080'])\n",
|
||||
" if phi_iters is not None:\n",
|
||||
" ax.plot(phi_iters[0,:], phi_iters[1,:],'go-')\n",
|
||||
" ax.set_ylim([1,-1])\n",
|
||||
" ax.set_xlabel('Intercept $\\phi_{0}$'); ax.set_ylabel('Slope, $\\phi_{1}$')\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "K-NTHpAAHlCl"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"draw_loss_function(compute_loss, data, model)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "l8HbvIupnTME"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's compute the gradient vector for a given set of parameters:\n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
|
||||
"\\end{equation}"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "s9Duf05WqqSC"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# These are in the lecture slides and notes, but worth trying to calculate them yourself to \n",
|
||||
"# check that you get them right. Write out the expression for the sum of squares loss and take the\n",
|
||||
"# derivative with respect to phi0 and phi1\n",
|
||||
"def compute_gradient(data_x, data_y, phi):\n",
|
||||
" # TODO -- write this function, replacing the lines below\n",
|
||||
" dl_dphi0 = 0.0\n",
|
||||
" dl_dphi1 = 0.0\n",
|
||||
"\n",
|
||||
" # Return the gradient\n",
|
||||
" return np.array([[dl_dphi0],[dl_dphi1]])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "UpswmkL2qwBT"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
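For the straight-line model the loss is $L = \sum_i (\phi_0 + \phi_1 x_i - y_i)^2$, so $\partial L/\partial \phi_0 = 2\sum_i(\phi_0 + \phi_1 x_i - y_i)$ and $\partial L/\partial \phi_1 = 2\sum_i x_i(\phi_0 + \phi_1 x_i - y_i)$. One possible completion of the exercise (a sketch, not the official solution; the data are repeated so the snippet stands alone):

```python
import numpy as np

def compute_gradient(data_x, data_y, phi):
    # Residuals of the straight-line model y = phi[0] + phi[1] * x
    residual = phi[0] + phi[1] * data_x - data_y
    # Derivatives of the sum-of-squares loss with respect to each parameter
    dl_dphi0 = np.sum(2 * residual)
    dl_dphi1 = np.sum(2 * residual * data_x)
    return np.array([[dl_dphi0], [dl_dphi1]])

data = np.array([[0.03,0.19,0.34,0.46,0.78,0.81,1.08,1.18,1.39,1.60,1.65,1.90],
                 [0.67,0.85,1.05,1.00,1.40,1.50,1.30,1.54,1.55,1.68,1.73,1.60]])
gradient = compute_gradient(data[0,:], data[1,:], np.array([[0.6],[-0.2]]))
print("(%3.3f,%3.3f)" % (gradient[0,0], gradient[1,0]))
```

The finite-difference cell that follows provides an independent check of these derivatives.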
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
|
||||
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"We don't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "RS1nEcYVuEAM"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Compute the gradient using your function\n",
|
||||
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
||||
"print(\"Your gradients: (%3.3f,%3.3f)\"%(gradient[0],gradient[1]))\n",
|
||||
"# Approximate the gradients with finite differences\n",
|
||||
"delta = 0.0001\n",
|
||||
"dl_dphi0_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[delta],[0]])) - \\\n",
|
||||
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
||||
"dl_dphi1_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[0],[delta]])) - \\\n",
|
||||
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
||||
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "QuwAHN7yt-gi"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now we are ready to perform gradient descent. We'll need the line search routine from part I, which I've reproduced here along with the helper function loss_function_1D, which converts the 2D problem into a 1D problem along the search direction."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "5EIjMM9Fw2eT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def loss_function_1D(dist_prop, data, model, phi_start, gradient):\n",
|
||||
" # Return the loss after moving this far\n",
|
||||
" return compute_loss(data[0,:], data[1,:], model, phi_start+ gradient * dist_prop)\n",
|
||||
"\n",
|
||||
"def line_search(data, model, phi, gradient, thresh=.00001, max_dist = 0.1, max_iter = 15, verbose=False):\n",
|
||||
" # Initialize four points along the range we are going to search\n",
|
||||
" a = 0\n",
|
||||
" b = 0.33 * max_dist\n",
|
||||
" c = 0.66 * max_dist\n",
|
||||
" d = 1.0 * max_dist\n",
|
||||
" n_iter = 0\n",
|
||||
" \n",
|
||||
" # While we haven't found the minimum closely enough\n",
|
||||
" while np.abs(b-c) > thresh and n_iter < max_iter:\n",
|
||||
" # Increment iteration counter (just to prevent an infinite loop)\n",
|
||||
" n_iter = n_iter+1\n",
|
||||
" # Calculate all four points\n",
|
||||
" lossa = loss_function_1D(a, data, model, phi,gradient)\n",
|
||||
" lossb = loss_function_1D(b, data, model, phi,gradient)\n",
|
||||
" lossc = loss_function_1D(c, data, model, phi,gradient)\n",
|
||||
" lossd = loss_function_1D(d, data, model, phi,gradient)\n",
|
||||
"\n",
|
||||
" if verbose:\n",
|
||||
" print('Iter %d, a=%3.3f, b=%3.3f, c=%3.3f, d=%3.3f'%(n_iter, a,b,c,d))\n",
|
||||
" print('a %f, b%f, c%f, d%f'%(lossa,lossb,lossc,lossd))\n",
|
||||
"\n",
|
||||
" # Rule #1 If point A is less than points B, C, and D then halve points B,C, and D\n",
|
||||
" if np.argmin((lossa,lossb,lossc,lossd))==0:\n",
|
||||
" b = b/2\n",
|
||||
" c = c/2\n",
|
||||
" d = d/2\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # Rule #2 If point b is less than point c then\n",
|
||||
" # then point d becomes point c, and\n",
|
||||
" # point b becomes 1/3 between a and new d\n",
|
||||
" # point c becomes 2/3 between a and new d\n",
|
||||
" if lossb < lossc:\n",
|
||||
" d = c\n",
|
||||
" b = a+ (d-a)/3\n",
|
||||
" c = a+ 2*(d-a)/3\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # Rule #3 If point c is less than point b then\n",
|
||||
" # then point a becomes point b, and\n",
|
||||
" # point b becomes 1/3 between new a and d\n",
|
||||
" # point c becomes 2/3 between new a and d\n",
|
||||
" a = b\n",
|
||||
" b = a+ (d-a)/3\n",
|
||||
" c = a+ 2*(d-a)/3\n",
|
||||
" \n",
|
||||
" # Return average of two middle points\n",
|
||||
" return (b+c)/2.0"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "XrJ2gQjfw1XP"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def gradient_descent_step(phi, data, model):\n",
|
||||
" # TODO -- update Phi with the gradient descent step\n",
|
||||
" # 1. Compute the gradient\n",
|
||||
" # 2. Find the best step size alpha using line search (use minus 1 times the gradient so it searches in the right direction)\n",
|
||||
" # 3. Update the parameters phi \n",
|
||||
"\n",
|
||||
" return phi"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YVq6rmaWRD2M"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
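A minimal self-contained sketch of the step above, using a fixed step size in place of the notebook's line search (`alpha = 0.01` is an assumed value, not from the notebook, and the gradient is the least-squares gradient written out explicitly):

```python
import numpy as np

def compute_gradient(data_x, data_y, phi):
    # Gradient of the sum-of-squares loss for y = phi[0] + phi[1] * x
    residual = phi[0] + phi[1] * data_x - data_y
    return np.array([[np.sum(2 * residual)],
                     [np.sum(2 * residual * data_x)]])

def gradient_descent_step(phi, data, alpha=0.01):
    # 1. Compute the gradient; 2.-3. move downhill by a fixed amount alpha
    gradient = compute_gradient(data[0,:], data[1,:], phi)
    return phi - alpha * gradient

data = np.array([[0.03,0.19,0.34,0.46,0.78,0.81,1.08,1.18,1.39,1.60,1.65,1.90],
                 [0.67,0.85,1.05,1.00,1.40,1.50,1.30,1.54,1.55,1.68,1.73,1.60]])
phi = np.array([[1.6],[-0.5]])
for _ in range(200):
    phi = gradient_descent_step(phi, data)
loss = np.sum((phi[0] + phi[1] * data[0,:] - data[1,:]) ** 2)
print(phi.flatten(), loss)
```

A fixed step must be small enough not to diverge and is slower than a line search; it is used here only to keep the sketch short.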
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the parameters and draw the model\n",
|
||||
"n_steps = 10\n",
|
||||
"phi_all = np.zeros((2,n_steps+1))\n",
|
||||
"phi_all[0,0] = 1.6\n",
|
||||
"phi_all[1,0] = -0.5\n",
|
||||
"\n",
|
||||
"# Measure loss and draw initial model\n",
|
||||
"loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n",
|
||||
"draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n",
|
||||
"\n",
|
||||
"for c_step in range (n_steps):\n",
|
||||
" # Do gradient descent step\n",
|
||||
" phi_all[:,c_step+1:c_step+2] = gradient_descent_step(phi_all[:,c_step:c_step+1],data, model)\n",
|
||||
" # Measure loss and draw model\n",
|
||||
" loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n",
|
||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||
"\n",
|
||||
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "tOLd0gtdRLLS"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "Oi8ZlH0ptLqA"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
585
CM20315_Training_III.ipynb
Normal file
@@ -0,0 +1,585 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"collapsed_sections": [],
|
||||
"authorship_tag": "ABX9TyMzgGVp+/BUCXimg7Ip9lhp",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CM20315_Training_III.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Training III\n",
|
||||
"\n",
|
||||
"We now have a model and a loss function which we can use to judge how good that model is. It's time to put the \"learning\" into machine learning.\n",
|
||||
"\n",
|
||||
"Learning involves finding the parameters that minimize the loss. That might seem like it's not too hard, but modern models can have billions of parameters. There's an exponential number of possible parameter combinations, and there's no way we can make any progress with exhaustive search.\n",
|
||||
"\n",
|
||||
"In part I we considered 1D search using a bracketing approach. In part II we experimented with fitting a linear regression model (which has a convex loss function). In this part, we'll fit the Gabor model, which has a non-convex loss function.\n",
|
||||
"\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "el8l05WQEO46"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "xhmIOLiZELV_"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import libraries\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from matplotlib import cm\n",
|
||||
"from matplotlib.colors import ListedColormap"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's create our training data 30 pairs {x_i, y_i}\n",
|
||||
"# We'll try to fit the Gabor model to these data\n",
|
||||
"data = np.array([[-1.920e+00,-1.422e+01,1.490e+00,-1.940e+00,-2.389e+00,-5.090e+00,\n",
|
||||
" -8.861e+00,3.578e+00,-6.010e+00,-6.995e+00,3.634e+00,8.743e-01,\n",
|
||||
" -1.096e+01,4.073e-01,-9.467e+00,8.560e+00,1.062e+01,-1.729e-01,\n",
|
||||
" 1.040e+01,-1.261e+01,1.574e-01,-1.304e+01,-2.156e+00,-1.210e+01,\n",
|
||||
" -1.119e+01,2.902e+00,-8.220e+00,-1.179e+01,-8.391e+00,-4.505e+00],\n",
|
||||
" [-1.051e+00,-2.482e-02,8.896e-01,-4.943e-01,-9.371e-01,4.306e-01,\n",
|
||||
" 9.577e-03,-7.944e-02 ,1.624e-01,-2.682e-01,-3.129e-01,8.303e-01,\n",
|
||||
" -2.365e-02,5.098e-01,-2.777e-01,3.367e-01,1.927e-01,-2.222e-01,\n",
|
||||
" 6.352e-02,6.888e-03,3.224e-02,1.091e-02,-5.706e-01,-5.258e-02,\n",
|
||||
" -3.666e-02,1.709e-01,-4.805e-02,2.008e-01,-1.904e-01,5.952e-01]])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4cRkrh9MZ58Z"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's define our model\n",
|
||||
"def model(phi,x):\n",
|
||||
" sin_component = np.sin(phi[0] + 0.06 * phi[1] * x)\n",
|
||||
" gauss_component = np.exp(-(phi[0] + 0.06 * phi[1] * x) * (phi[0] + 0.06 * phi[1] * x) / 32)\n",
|
||||
" y_pred= sin_component * gauss_component\n",
|
||||
" return y_pred"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "WQUERmb2erAe"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Draw model\n",
|
||||
"def draw_model(data,model,phi,title=None):\n",
|
||||
" x_model = np.arange(-15,15,0.1)\n",
|
||||
" y_model = model(phi,x_model)\n",
|
||||
"\n",
|
||||
" fig, ax = plt.subplots()\n",
|
||||
" ax.plot(data[0,:],data[1,:],'bo')\n",
|
||||
" ax.plot(x_model,y_model,'m-')\n",
|
||||
" ax.set_xlim([-15,15]);ax.set_ylim([-1,1])\n",
|
||||
" ax.set_xlabel('x'); ax.set_ylabel('y')\n",
|
||||
" if title is not None:\n",
|
||||
" ax.set_title(title)\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "qFRe9POHF2le"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the parameters and draw the model\n",
|
||||
"phi = np.zeros((2,1))\n",
|
||||
"phi[0] = -5 # Horizontal offset\n",
|
||||
"phi[1] = 25 # Frequency\n",
|
||||
"draw_model(data,model,phi, \"Initial parameters\")\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "TXx1Tpd1Tl-I"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's compute the sum of squares loss for the training data"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "QU5mdGvpTtEG"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def compute_loss(data_x, data_y, model, phi):\n",
|
||||
" # TODO -- Write this function -- replace the line below\n",
|
||||
" # TODO -- First make model predictions from data x\n",
|
||||
" # TODO -- Then compute the squared difference between the predictions and true y values\n",
|
||||
" # TODO -- Then sum them all and return\n",
|
||||
" loss = 0\n",
|
||||
" return loss"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "I7dqTY2Gg7CR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Let's just test that we got that right"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "eB5DQvU5hYNx"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n",
|
||||
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 16.419))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Ty05UtEEg9tc"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's plot the whole loss function"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "F3trnavPiHpH"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
|
||||
" # Define pretty colormap\n",
|
||||
" my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 
'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
|
||||
" my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
|
||||
" r = np.floor(my_colormap_vals_dec/(256*256))\n",
|
||||
" g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
|
||||
" b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
|
||||
" my_colormap = ListedColormap(np.vstack((r,g,b)).transpose()/255.0)\n",
|
||||
"\n",
|
||||
" # Make grid of intercept/slope values to plot\n",
|
||||
" offsets_mesh, freqs_mesh = np.meshgrid(np.arange(-10,10.0,0.1), np.arange(2.5,22.5,0.1))\n",
|
||||
" loss_mesh = np.zeros_like(freqs_mesh)\n",
|
||||
" # Compute loss for every set of parameters\n",
|
||||
" for idslope, slope in np.ndenumerate(freqs_mesh):\n",
|
||||
" loss_mesh[idslope] = compute_loss(data[0,:], data[1,:], model, np.array([[offsets_mesh[idslope]], [slope]]))\n",
|
||||
"\n",
|
||||
" fig,ax = plt.subplots()\n",
|
||||
" fig.set_size_inches(8,8)\n",
|
||||
" ax.contourf(offsets_mesh,freqs_mesh,loss_mesh,256,cmap=my_colormap)\n",
|
||||
" ax.contour(offsets_mesh,freqs_mesh,loss_mesh,20,colors=['#80808080'])\n",
|
||||
" if phi_iters is not None:\n",
|
||||
" ax.plot(phi_iters[0,:], phi_iters[1,:],'go-')\n",
|
||||
" ax.set_ylim([2.5,22.5])\n",
|
||||
" ax.set_xlabel('Offset $\\phi_{0}$'); ax.set_ylabel('Frequency, $\\phi_{1}$')\n",
|
||||
" plt.show()"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "K-NTHpAAHlCl"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"draw_loss_function(compute_loss, data, model)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "l8HbvIupnTME"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's compute the gradient vector for a given set of parameters:\n",
|
||||
"\n",
|
||||
"\\begin{equation}\n",
|
||||
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
|
||||
"\\end{equation}"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "s9Duf05WqqSC"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# These came from writing out the expression for the sum of squares loss and taking the\n",
|
||||
"# derivative with respect to phi0 and phi1. It was a lot of hassle to get it right!\n",
|
||||
"def gabor_deriv_phi0(data_x,data_y,phi0, phi1):\n",
|
||||
" x = 0.06 * phi1 * data_x + phi0\n",
|
||||
" y = data_y \n",
|
||||
" cos_component = np.cos(x)\n",
|
||||
" sin_component = np.sin(x)\n",
|
||||
" gauss_component = np.exp(-0.5 * x *x / 16)\n",
|
||||
" deriv = cos_component * gauss_component - sin_component * gauss_component * x / 16\n",
|
||||
" deriv = 2* deriv * (sin_component * gauss_component - y)\n",
|
||||
" return np.sum(deriv)\n",
|
||||
"\n",
|
||||
"def gabor_deriv_phi1(data_x, data_y,phi0, phi1):\n",
|
||||
" x = 0.06 * phi1 * data_x + phi0\n",
|
||||
" y = data_y \n",
|
||||
" cos_component = np.cos(x)\n",
|
||||
" sin_component = np.sin(x)\n",
|
||||
" gauss_component = np.exp(-0.5 * x *x / 16)\n",
|
||||
" deriv = 0.06 * data_x * cos_component * gauss_component - 0.06 * data_x*sin_component * gauss_component * x / 16\n",
|
||||
" deriv = 2*deriv * (sin_component * gauss_component - y)\n",
|
||||
" return np.sum(deriv)\n",
|
||||
"\n",
|
||||
"def compute_gradient(data_x, data_y, phi):\n",
|
||||
" dl_dphi0 = gabor_deriv_phi0(data_x, data_y, phi[0],phi[1])\n",
|
||||
" dl_dphi1 = gabor_deriv_phi1(data_x, data_y, phi[0],phi[1])\n",
|
||||
" # Return the gradient\n",
|
||||
" return np.array([[dl_dphi0],[dl_dphi1]])"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "UpswmkL2qwBT"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
|
||||
"\n",
|
||||
"\\begin{eqnarray}\n",
|
||||
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
|
||||
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
|
||||
"\\end{eqnarray}\n",
|
||||
"\n",
|
||||
"We don't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "RS1nEcYVuEAM"
|
||||
}
|
||||
},
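As a sanity check independent of the Gabor model, here is a minimal sketch of the finite-difference idea on a simple quadratic loss whose exact gradient is known (the function names here are illustrative, not from the notebook):

```python
import numpy as np

def loss(phi0, phi1):
    # Toy loss with known gradient: dL/dphi0 = 2*phi0, dL/dphi1 = 4*phi1
    return phi0 ** 2 + 2.0 * phi1 ** 2

def finite_diff_gradient(phi0, phi1, delta=1e-5):
    # Forward differences: perturb each parameter in turn and divide by delta
    dl_dphi0 = (loss(phi0 + delta, phi1) - loss(phi0, phi1)) / delta
    dl_dphi1 = (loss(phi0, phi1 + delta) - loss(phi0, phi1)) / delta
    return np.array([dl_dphi0, dl_dphi1])

approx = finite_diff_gradient(1.0, 2.0)
print(approx)  # close to the exact gradient [2.0, 8.0]
```

The approximation error shrinks as `delta` shrinks, until floating-point round-off takes over.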
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Compute the gradient using your function\n",
|
||||
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
||||
"print(\"Your gradients: (%3.3f,%3.3f)\"%(gradient[0],gradient[1]))\n",
|
||||
"# Approximate the gradients with finite differences\n",
|
||||
"delta = 0.0001\n",
|
||||
"dl_dphi0_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[delta],[0]])) - \\\n",
|
||||
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
||||
"dl_dphi1_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[0],[delta]])) - \\\n",
|
||||
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
||||
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "QuwAHN7yt-gi"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now we are ready to perform gradient descent. We'll need to use our line search routine from part I, which I've reproduced here plus the helper function loss_function_1D that converts from a 2D problem to a 1D problem"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "5EIjMM9Fw2eT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def loss_function_1D(dist_prop, data, model, phi_start, gradient):\n",
|
||||
" # Return the loss after moving this far\n",
|
||||
" return compute_loss(data[0,:], data[1,:], model, phi_start+ gradient * dist_prop)\n",
|
||||
"\n",
|
||||
"def line_search(data, model, phi, gradient, thresh=.00001, max_dist = 0.1, max_iter = 15, verbose=False):\n",
|
||||
" # Initialize four points along the rnage we are going to search\n",
|
||||
" a = 0\n",
|
||||
" b = 0.33 * max_dist\n",
|
||||
" c = 0.66 * max_dist\n",
|
||||
" d = 1.0 * max_dist\n",
|
||||
" n_iter =0;\n",
|
||||
" \n",
|
||||
" # While we haven't found the minimum closely enough\n",
|
||||
" while np.abs(b-c) > thresh and n_iter < max_iter:\n",
|
||||
" # Increment iteration counter (just to prevent an infinite loop)\n",
|
||||
" n_iter = n_iter+1\n",
|
||||
" # Calculate all four points\n",
|
||||
" lossa = loss_function_1D(a, data, model, phi,gradient)\n",
|
||||
" lossb = loss_function_1D(b, data, model, phi,gradient)\n",
|
||||
" lossc = loss_function_1D(c, data, model, phi,gradient)\n",
|
||||
" lossd = loss_function_1D(d, data, model, phi,gradient)\n",
|
||||
"\n",
|
||||
" if verbose:\n",
|
||||
" print('Iter %d, a=%3.3f, b=%3.3f, c=%3.3f, d=%3.3f'%(n_iter, a,b,c,d))\n",
|
||||
" print('a %f, b%f, c%f, d%f'%(lossa,lossb,lossc,lossd))\n",
|
||||
"\n",
|
||||
" # Rule #1 If point A is less than points B, C, and D then halve points B,C, and D\n",
|
||||
" if np.argmin((lossa,lossb,lossc,lossd))==0:\n",
|
||||
" b = b/2\n",
|
||||
" c = c/2\n",
|
||||
" d = d/2\n",
|
||||
" continue;\n",
|
||||
"\n",
|
||||
" # Rule #2 If point b is less than point c then\n",
|
||||
" # then point d becomes point c, and\n",
|
||||
" # point b becomes 1/3 between a and new d\n",
|
||||
" # point c beocome 2/3 between a and new d \n",
|
||||
" if lossb < lossc:\n",
|
||||
" d = c\n",
|
||||
" b = a+ (d-a)/3\n",
|
||||
" c = a+ 2*(d-a)/3\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" # Rule #2 If point c is less than point b then\n",
|
||||
" # then point a becomes point b, and\n",
|
||||
" # point b becomes 1/3 between new a and d\n",
|
||||
" # point c beocome 2/3 between new a and d \n",
|
||||
" a = b\n",
|
||||
" b = a+ (d-a)/3\n",
|
||||
" c = a+ 2*(d-a)/3\n",
|
||||
" \n",
|
||||
" # Return average of two middle points\n",
|
||||
" return (b+c)/2.0"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "XrJ2gQjfw1XP"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def gradient_descent_step(phi, data, model):\n",
|
||||
" # Step 1: Compute the gradient\n",
|
||||
" gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
||||
" # Step 2: Update the parameters -- note we want to search in the negative (downhill direction)\n",
|
||||
" alpha = line_search(data, model, phi, gradient*-1, max_dist = 2.0)\n",
|
||||
" phi = phi - alpha * gradient\n",
|
||||
" return phi"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YVq6rmaWRD2M"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the parameters\n",
|
||||
"n_steps = 21\n",
|
||||
"phi_all = np.zeros((2,n_steps+1))\n",
|
||||
"phi_all[0,0] = -1.5\n",
|
||||
"phi_all[1,0] = 8.5\n",
|
||||
"\n",
|
||||
"# Measure loss and draw initial model\n",
|
||||
"loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n",
|
||||
"draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n",
|
||||
"\n",
|
||||
"for c_step in range (n_steps):\n",
|
||||
" # Do gradient descent step\n",
|
||||
" phi_all[:,c_step+1:c_step+2] = gradient_descent_step(phi_all[:,c_step:c_step+1],data, model)\n",
|
||||
" # Measure loss and draw model every 4th step\n",
|
||||
" if c_step % 4 == 0:\n",
|
||||
" loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n",
|
||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||
"\n",
|
||||
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "tOLd0gtdRLLS"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO Experiment with starting the optimization in the previous cell in different places\n",
|
||||
"# and show that it heads to a local minimum if we don't start it in the right valley"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Oi8ZlH0ptLqA"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def gradient_descent_step_fixed_learning_rate(phi, data, model, alpha):\n",
|
||||
" # TODO -- fill in this routine so that we take a fixed size step of size alpha without using line search\n",
|
||||
"\n",
|
||||
" return phi"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "4l-ueLk-oAxV"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Initialize the parameters\n",
|
||||
"n_steps = 21\n",
|
||||
"phi_all = np.zeros((2,n_steps+1))\n",
|
||||
"phi_all[0,0] = -1.5\n",
|
||||
"phi_all[1,0] = 8.5\n",
|
||||
"\n",
|
||||
"# Measure loss and draw initial model\n",
|
||||
"loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n",
|
||||
"draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n",
|
||||
"\n",
|
||||
"for c_step in range (n_steps):\n",
|
||||
" # Do gradient descent step\n",
|
||||
" phi_all[:,c_step+1:c_step+2] = gradient_descent_step_fixed_learning_rate(phi_all[:,c_step:c_step+1],data, model,alpha =0.2)\n",
|
||||
" # Measure loss and draw model every 4th step\n",
|
||||
" if c_step % 4 == 0:\n",
|
||||
" loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n",
|
||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||
"\n",
|
||||
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "oi9MX_GRpM41"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO Experiment with the learning rate, alpha. \n",
|
||||
"# What happens if you set it too large?\n",
|
||||
"# What happens if you set it too small?"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "In6sQ5YCpMqn"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def stochastic_gradient_descent_step(phi, data, model, alpha, batch_size):\n",
|
||||
" # TODO -- fill in this routine so that we take a fixed size step of size alpha but only using a subset (batch) of the data\n",
|
||||
" # at each step\n",
|
||||
" # You can use the function np.random.permutation to generate a random permutation of the n_data = data.shape[1] indices\n",
|
||||
" # and then just choose the first n=batch_size of these indices. Then select compute the gradient update\n",
|
||||
" # from just the data with these indices. Don't worry about sampling with replacement.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" return phi"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "VKTC9-1Gpm3N"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
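A minimal sketch of the batch-selection step described in the comments above (the helper name `select_batch` is illustrative, not from the notebook):

```python
import numpy as np

def select_batch(data, batch_size):
    # Permute the n_data column indices and keep the first batch_size of them
    n_data = data.shape[1]
    indices = np.random.permutation(n_data)[:batch_size]
    return data[:, indices]

np.random.seed(0)
data = np.vstack((np.arange(10.0), np.arange(10.0) ** 2))  # shape (2, 10)
batch = select_batch(data, batch_size=5)
print(batch.shape)  # (2, 5)
```

The gradient update in `stochastic_gradient_descent_step` would then be computed from `batch` instead of the full dataset.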
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Set the random number generator so you always get same numbers (disable if you don't want this)\n",
|
||||
"np.random.seed(1)\n",
|
||||
"# Initialize the parameters\n",
|
||||
"n_steps = 41\n",
|
||||
"phi_all = np.zeros((2,n_steps+1))\n",
|
||||
"phi_all[0,0] = 3.5\n",
|
||||
"phi_all[1,0] = 6.5\n",
|
||||
"\n",
|
||||
"# Measure loss and draw initial model\n",
|
||||
"loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n",
|
||||
"draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n",
|
||||
"\n",
|
||||
"for c_step in range (n_steps):\n",
|
||||
" # Do gradient descent step\n",
|
||||
" phi_all[:,c_step+1:c_step+2] = stochastic_gradient_descent_step(phi_all[:,c_step:c_step+1],data, model,alpha =0.8, batch_size=5)\n",
|
||||
" # Measure loss and draw model every 4th step\n",
|
||||
" if c_step % 8 == 0:\n",
|
||||
" loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n",
|
||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||
"\n",
|
||||
"draw_loss_function(compute_loss, data, model,phi_all)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "469OP_UHskJ4"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO -- Experiment with different learning rates, starting points, batch sizes, number of steps. Get a feel for this."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "LxE2kTa3s29p"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO -- How about adding a learning rate schedule? Reduce the learning rate by a factor of beta every M iterations"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "lw4QPOaQTh5e"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
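One common schedule is step decay, sketched below; the names `alpha0`, `beta`, and `M` follow the comment above, and the particular values are just examples:

```python
def step_decay(alpha0, beta, M, step):
    # Reduce the learning rate by a factor of beta every M iterations
    return alpha0 * (beta ** (step // M))

print(step_decay(0.8, 0.5, 10, 0))   # steps 0-9 use the initial rate
print(step_decay(0.8, 0.5, 10, 15))  # steps 10-19 use half of it
```

Inside the training loop, you would pass `step_decay(alpha0, beta, M, c_step)` as `alpha` on each iteration.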
|
||||
]
|
||||
}
|
||||
634
CX20315_Transformers.ipynb
Normal file
@@ -0,0 +1,634 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"authorship_tag": "ABX9TyMfWL40+ZshPZhweAtQ9Fn6",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/CX20315_Transformers.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Transformers\n",
|
||||
"\n",
|
||||
"This practical investigates neural decoding from transformer models. Run the next three cells as they might take a while to run (they have to download some stuff), and then read the next text box while you are waiting."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "RnIUiieJWu6e"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"!pip install transformers"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "7abjZ9pMVj3k"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from transformers import GPT2LMHeadModel, GPT2Tokenizer, set_seed\n",
|
||||
"import torch\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"import numpy as np"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "sMOyD0zem2Ef"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Load model and tokenizer\n",
|
||||
"model = GPT2LMHeadModel.from_pretrained('gpt2')\n",
|
||||
"tokenizer = GPT2Tokenizer.from_pretrained('gpt2')"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "pZgfxbzKWNSR"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Decoding from GPT2\n",
|
||||
"\n",
|
||||
"This tutorial investigates how to use GPT2 (the forerunner of GPT3) to generate text. There are a number of ways to do this that trade-off the realism of the text against the amount of variation.\n",
|
||||
"\n",
|
||||
"At every stage, GPT2 takes an input string and returns a probability for each of the possible subsequent tokens. We can choose what to do with these probability. We could always *greedily choose* the most likely next token, or we could draw a *sample* randomly according to the probabilities. There are also intermediate strategies such as *top-k sampling* and *nucleus sampling*, that have some controlled randomness.\n",
|
||||
"\n",
|
||||
"We'll also investigate *beam search* -- the idea is that rather than greedily take the next best token at each stage, we maintain a set of hypotheses (beams)as we add each subsequent token and return the most likely overall hypothesis. This is not necessarily the same result we get from greedily choosing the next token. "
|
||||
],
|
||||
"metadata": {
|
||||
"id": "TfhAGy0TXEvV"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"First, let's investigate the token themselves. The code below prints out the vocabulary size and shows 20 random tokens. "
|
||||
],
|
||||
"metadata": {
|
||||
"id": "vsmO9ptzau3_"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"np.random.seed(1)\n",
|
||||
"print(\"Number of tokens in dictionary = %d\"%(tokenizer.vocab_size))\n",
|
||||
"for i in range(20):\n",
|
||||
" index = np.random.randint(tokenizer.vocab_size)\n",
|
||||
" print(\"Token: %d \"%(index)+tokenizer.decode(torch.tensor(index), skip_special_tokens=True))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "dmmBNS5GY_yk"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Sampling\n",
|
||||
"\n",
|
||||
"Each time we run GPT2 it will take in a set of tokens, and return a probability over each of the possible next tokens. The simplest thing we could do is to just draw a sample from this probability distribution each time."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "MUM3kLEjbTso"
|
||||
}
|
||||
},
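The core sampling step can be sketched independently of GPT2 with a toy distribution; this is the `np.random.choice` call that the TODO below asks for, applied to invented probabilities:

```python
import numpy as np

np.random.seed(0)
# Toy distribution over a 5-token vocabulary (stand-in for GPT2's softmax output)
prob_over_tokens = np.array([0.1, 0.4, 0.2, 0.25, 0.05])
# Draw one token index at random, weighted by the probabilities
next_token = np.random.choice(len(prob_over_tokens), p=prob_over_tokens)
print(next_token)
```

Running it repeatedly (without re-seeding) returns token 1 most often, since it carries the most probability mass.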
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def sample_next_token(input_tokens, model, tokenizer):\n",
|
||||
" # Run model to get prediction over next output\n",
|
||||
" outputs = model(input_ids = input_tokens['input_ids'], attention_mask = input_tokens['attention_mask'])\n",
|
||||
" # Find prediction\n",
|
||||
" prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]\n",
|
||||
" # TODO: Draw a random token according to the probabilities\n",
|
||||
" # Use: https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html\n",
|
||||
" # Replace this line\n",
|
||||
" next_token = [5000]\n",
|
||||
"\n",
|
||||
" # Append token to sentence\n",
|
||||
" output_tokens = input_tokens\n",
|
||||
" output_tokens[\"input_ids\"] = torch.cat((output_tokens['input_ids'],torch.tensor([next_token])),dim=1)\n",
|
||||
" output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'],torch.tensor([[1]])),dim=1)\n",
|
||||
" output_tokens['last_token_prob'] = prob_over_tokens[next_token]\n",
|
||||
"\n",
|
||||
" return output_tokens"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "TIyNgg0FkJKO"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Expected output:\n",
|
||||
"# \"The best thing about Bath is that they don't even change or shrink anymore.\"\n",
|
||||
"\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = sample_next_token(input_tokens, model, tokenizer)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))\n",
|
||||
"\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "BHs-IWaz9MNY"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO Modify the code below by changeing the number of tokens generated and the initial sentence\n",
|
||||
"# to get a feel for how well this works. Since I didn't reset the seed, it will give a different\n",
|
||||
"# answer every time that you run it.\n",
|
||||
"\n",
|
||||
"# TODO Experiment with changing this line:\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"# TODO Experiment with changing this line:\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = sample_next_token(input_tokens, model, tokenizer)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "yN98_7WqbvIe"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Greedy token selection\n",
|
||||
"\n",
|
||||
"You probably (correctly) got the impression that the text from pure sampling of the probability model can be kind of random. How about if we choose most likely token at each step?\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "7eHFLCeZcmmg"
|
||||
}
|
||||
},
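Greedy selection is just an argmax over the same toy distribution used above:

```python
import numpy as np

prob_over_tokens = np.array([0.1, 0.4, 0.2, 0.25, 0.05])
# Greedy decoding: always pick the single most likely token
# (wrapped in a list, matching the shape the notebook's helpers expect)
next_token = [int(np.argmax(prob_over_tokens))]
print(next_token)  # [1]
```

Unlike sampling, this returns the same token every time for the same distribution.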
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_best_next_token(input_tokens, model, tokenizer):\n",
|
||||
" # Run model to get prediction over next output\n",
|
||||
" outputs = model(input_ids = input_tokens['input_ids'], attention_mask = input_tokens['attention_mask'])\n",
|
||||
" # Find prediction\n",
|
||||
" prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]\n",
|
||||
"\n",
|
||||
" # TODO -- find the token index with the maximum probability\n",
|
||||
" # It should be returns as a list (i.e., put squared brackets around it)\n",
|
||||
" # Use https://numpy.org/doc/stable/reference/generated/numpy.argmax.html\n",
|
||||
" # Replace this line\n",
|
||||
" next_token = [5000]\n",
|
||||
"\n",
|
||||
" # Append token to sentence\n",
|
||||
" output_tokens = input_tokens\n",
|
||||
" output_tokens[\"input_ids\"] = torch.cat((output_tokens['input_ids'],torch.tensor([next_token])),dim=1)\n",
|
||||
" output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'],torch.tensor([[1]])),dim=1)\n",
|
||||
" output_tokens['last_token_prob'] = prob_over_tokens[next_token]\n",
|
||||
" return output_tokens"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "OhRzynEjxpZF"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Expected output:\n",
|
||||
"# The best thing about Bath is that it's a place where you can go to\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_best_next_token(input_tokens, model, tokenizer)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "gKB1Mgndj-Hm"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO Modify the code below by changeing the number of tokens generated and the initial sentence\n",
|
||||
"# to get a feel for how well this works. \n",
|
||||
"\n",
|
||||
"# TODO Experiment with changing this line:\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"# TODO Experiment with changing this line:\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_best_next_token(input_tokens, model, tokenizer)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "L1YHKaYFfC0M"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Top-K sampling\n",
|
||||
"\n",
|
||||
"You probably noticed that the greedy strategy produces quite realistic text, but it's kind of boring. It produces generic answers. Also, if this was a chatbot, then we wouldn't necessarily want it to produce the same answer to a question each time. \n",
|
||||
"\n",
|
||||
"Top-K sampling is a compromise strategy that samples randomly from the top K most probable tokens. We could just choose them with a uniform distribution, or (as here) we could sample them according to their original probabilities."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "1ORFXYX_gBDT"
|
||||
}
|
||||
},
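The top-K filtering step described above can be sketched on the same toy distribution. Note this sketch keeps exactly `k` tokens (so the k'th largest value sits at index `k-1`); the notebook's own TODO may use a slightly different indexing convention:

```python
import numpy as np

def top_k_filter(prob_over_tokens, k):
    # Sort a copy from largest to smallest and read off the k'th largest value
    sorted_probs = np.sort(prob_over_tokens)[::-1]
    kth_prob_value = sorted_probs[k - 1]
    # Zero out everything below the k'th value, then renormalize to sum to one
    probs = np.where(prob_over_tokens < kth_prob_value, 0.0, prob_over_tokens)
    return probs / np.sum(probs)

np.random.seed(1)
probs = top_k_filter(np.array([0.1, 0.4, 0.2, 0.25, 0.05]), k=2)
next_token = np.random.choice(len(probs), p=probs)
print(probs)  # only the top two tokens keep nonzero mass
```

The final `np.random.choice` then samples according to the renormalized probabilities, as in the full routine.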
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_top_k_token(input_tokens, model, tokenizer, k=20):\n",
|
||||
" # Run model to get prediction over next output\n",
|
||||
" outputs = model(input_ids = input_tokens['input_ids'], attention_mask = input_tokens['attention_mask'])\n",
|
||||
" # Find prediction\n",
|
||||
" prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]\n",
|
||||
"\n",
|
||||
" # Draw a sample from the top K most likely tokens.\n",
|
||||
" # Take copy of the probabilities and sort from largest to smallest (use np.sort)\n",
|
||||
" # TODO -- replace this line\n",
|
||||
" sorted_prob_over_tokens = prob_over_tokens\n",
|
||||
"\n",
|
||||
" # Find the probability at the k'th position\n",
|
||||
" # TODO -- replace this line\n",
|
||||
" kth_prob_value = 0.0\n",
|
||||
"\n",
|
||||
" # Set all probabilities below this value to zero \n",
|
||||
" prob_over_tokens[prob_over_tokens<kth_prob_value] = 0\n",
|
||||
"\n",
|
||||
" # Renormalize the probabilities so that they sum to one\n",
|
||||
" # TODO -- replace this line\n",
|
||||
" prob_over_tokens = prob_over_tokens\n",
|
||||
"\n",
|
||||
" # Draw random token\n",
|
||||
" next_token = np.random.choice(len(prob_over_tokens), 1, replace=False, p=prob_over_tokens)\n",
|
||||
"\n",
|
||||
" # Append token to sentence\n",
|
||||
" output_tokens = input_tokens \n",
|
||||
" output_tokens[\"input_ids\"] = torch.cat((output_tokens['input_ids'],torch.tensor([next_token])),dim=1)\n",
|
||||
" output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'],torch.tensor([[1]])),dim=1)\n",
|
||||
" output_tokens['last_token_prob'] = prob_over_tokens[next_token]\n",
|
||||
" return output_tokens"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "7RFbn6c-0Z4v"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Expected output:\n",
|
||||
"# The best thing about Bath is that you get to see all the beautiful faces of\n",
|
||||
"\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_top_k_token(input_tokens, model, tokenizer, k=10)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "G3w1GVED4HYv"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO \n",
|
||||
"# Experiment with different values of k \n",
|
||||
"# If you set it to a lower number (say 3) the text will be less random\n",
|
||||
"# If you set it to a higher number (say 5000) the text will be more random\n",
|
||||
"\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_top_k_token(input_tokens, model, tokenizer, k=10)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "RySu2bzqpW9E"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Nucleus sampling\n",
|
||||
"\n",
|
||||
"Top-K sampling has the disadvantage that sometimes there are only a few plausible next tokens, and sometimes there are a lot. How do we adapt to this situation? One way is to sample from a fixed proportion of the probability mass. That is we order the tokens in terms of probability and cut off the possibility of sampling when the cumulative sum is greater than a threshold.\n",
|
||||
"\n",
|
||||
"This way, we adapt the number of possible tokens that we can choose."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "fOHak_QJfU-2"
|
||||
}
|
||||
},
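The nucleus (top-p) filtering step can likewise be sketched on a toy distribution; the sort, cumulative sum, and threshold lines correspond to the TODOs in the routine below:

```python
import numpy as np

def nucleus_filter(prob_over_tokens, thresh):
    # Sort the probabilities in decreasing order and take their cumulative sum
    sorted_probs = np.sort(prob_over_tokens)[::-1]
    cum_sum = np.cumsum(sorted_probs)
    # First index where the cumulative mass exceeds the threshold
    thresh_index = np.argmax(cum_sum > thresh)
    thresh_prob = sorted_probs[thresh_index]
    # Keep only tokens at or above that probability, then renormalize
    probs = np.where(prob_over_tokens < thresh_prob, 0.0, prob_over_tokens)
    return probs / np.sum(probs)

probs = nucleus_filter(np.array([0.1, 0.4, 0.2, 0.25, 0.05]), thresh=0.5)
print(np.count_nonzero(probs))  # 2
```

With `thresh=0.5`, the tokens with probabilities 0.4 and 0.25 together exceed the threshold, so only those two survive; a peakier distribution would keep fewer tokens and a flatter one more.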
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh=0.25):\n",
|
||||
" # Run model to get prediction over next output\n",
|
||||
" outputs = model(input_ids = input_tokens['input_ids'], attention_mask = input_tokens['attention_mask'])\n",
|
||||
" # Find prediction\n",
|
||||
" prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]\n",
|
||||
"\n",
|
||||
" # Find the most likely tokens that make up the first (thresh) of the probability \n",
|
||||
" # TODO -- sort the probabilities in decreasing order\n",
|
||||
" # Replace this line\n",
|
||||
" sorted_probs_decreasing = prob_over_tokens\n",
|
||||
" # TODO -- compute the cumulative sum of these probabilities\n",
|
||||
" # Replace this line\n",
|
||||
" cum_sum_probs = sorted_probs_decreasing\n",
|
||||
"\n",
|
||||
" # Find index where that the cumulative sum is greater than the threshold\n",
|
||||
" thresh_index = np.argmax(cum_sum_probs>thresh)\n",
|
||||
" print(\"Choosing from %d tokens\"%(thresh_index))\n",
|
||||
" # TODO: Find the probabilitiy value to threshold \n",
|
||||
" # Replace this line:\n",
|
||||
" thresh_prob = sorted_probs_decreasing[thresh_index]\n",
|
||||
"\n",
|
||||
" # Set any probabilities less than this to zero \n",
|
||||
" prob_over_tokens[prob_over_tokens<thresh_prob] = 0\n",
|
||||
" # Renormalize\n",
|
||||
" prob_over_tokens = prob_over_tokens / np.sum(prob_over_tokens)\n",
|
||||
" # Draw random token\n",
|
||||
" next_token = np.random.choice(len(prob_over_tokens), 1, replace=False, p=prob_over_tokens)\n",
|
||||
" \n",
|
||||
" # Append token to sentence\n",
|
||||
" output_tokens = input_tokens \n",
|
||||
" output_tokens[\"input_ids\"] = torch.cat((output_tokens['input_ids'],torch.tensor([next_token])),dim=1)\n",
|
||||
" output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'],torch.tensor([[1]])),dim=1)\n",
|
||||
" output_tokens['last_token_prob'] = prob_over_tokens[next_token]\n",
|
||||
" return output_tokens"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "PtxS4kNDyUcm"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Expected output:\n",
|
||||
"# The best thing about Bath is that it's not a city that has been around\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh = 0.2)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "K2Vk1Ly40S6c"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# TODO -- experiment with setting the threshold probability to larger or smaller values\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_nucleus_sampling_token(input_tokens, model, tokenizer, thresh = 0.2)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "eQNNHe14wDvC"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Beam search\n",
|
||||
"\n",
|
||||
"All of the methods we've seen so far choose the tokens one by one. But this isn't necessarily sensible. Even greedily choosing the best token doesn't necessarily retrieve the sequence with the highest probability. It might be that the most likely token only has very unlikely tokens following it.\n",
|
||||
"\n",
|
||||
"Beam search maintains $K$ hypotheses about the best possible continuation. It starts with the top $K$ continuations. Then for each of those, it finds the top K continuations, giving $K^2$ hypotheses. Then it retains just the top $K$ of these so that the number of hypotheses stays the same."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "WMMNeLixwlgM"
|
||||
}
|
||||
},
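A toy two-step example (with made-up probabilities) makes this concrete: greedy decoding commits to the most likely first token and ends up with a lower-probability sequence than exhaustive search over both steps would find.

```python
# Toy model: probability of the first token, then of the second token
# given the first (all numbers are invented for illustration)
p_first = {'A': 0.6, 'B': 0.4}
p_second = {'A': {'C': 0.3, 'D': 0.3, 'E': 0.4},
            'B': {'C': 0.9, 'D': 0.05, 'E': 0.05}}

# Greedy picks 'A' first (0.6 > 0.4), then its best continuation
greedy = max(p_first, key=p_first.get)
greedy_seq = (greedy, max(p_second[greedy], key=p_second[greedy].get))
greedy_prob = p_first[greedy] * p_second[greedy][greedy_seq[1]]

# Exhaustive search over both steps finds the true best sequence
best_seq = max(((f, s) for f in p_first for s in p_second[f]),
               key=lambda fs: p_first[fs[0]] * p_second[fs[0]][fs[1]])
best_prob = p_first[best_seq[0]] * p_second[best_seq[0]][best_seq[1]]

print('greedy:', greedy_seq)  # greedy: ('A', 'E')
print('best  :', best_seq)    # best  : ('B', 'C')
```

Greedy finds ('A','E') with probability 0.6 × 0.4 = 0.24, but ('B','C') has probability 0.4 × 0.9 = 0.36. Beam search with $K \ge 2$ would keep 'B' alive after the first step and recover the better sequence.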
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# This routine returns the k'th most likely next token.\n",
|
||||
"# If k=0 then it returns the most likely token, if k=1 it returns the next most likely, and so on\n",
|
||||
"# We will need this for beam search\n",
|
||||
"def get_kth_most_likely_token(input_tokens, model, tokenizer, k):\n",
|
||||
" # Run model to get prediction over next output\n",
|
||||
" outputs = model(input_ids = input_tokens['input_ids'], attention_mask = input_tokens['attention_mask'])\n",
|
||||
" # Find prediction\n",
|
||||
" prob_over_tokens = F.softmax(outputs.logits, dim=-1).detach().numpy()[0,-1]\n",
|
||||
"\n",
|
||||
" # Find the k'th most likely token \n",
|
||||
" # TODO Sort the probabilities from largest to smallest\n",
|
||||
" # Replace this line:\n",
|
||||
" sorted_prob_over_tokens = prob_over_tokens\n",
|
||||
" # TODO Find the k'th sorted probability\n",
|
||||
" # Replace this line\n",
|
||||
" kth_prob_value = prob_over_tokens[0]\n",
|
||||
"\n",
|
||||
" # Find position of this token.\n",
|
||||
" next_token = np.where(prob_over_tokens == kth_prob_value)[0]\n",
|
||||
"\n",
|
||||
" # Append token to sentence\n",
|
||||
" output_tokens = input_tokens \n",
|
||||
" output_tokens[\"input_ids\"] = torch.cat((output_tokens['input_ids'],torch.tensor([next_token])),dim=1)\n",
|
||||
" output_tokens['attention_mask'] = torch.cat((output_tokens['attention_mask'],torch.tensor([[1]])),dim=1)\n",
|
||||
" output_tokens['last_token_prob'] = prob_over_tokens[next_token]\n",
|
||||
" output_tokens['log_prob'] = output_tokens['log_prob'] + np.log(prob_over_tokens[next_token])\n",
|
||||
" return output_tokens"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "sAI2bClXCe2F"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
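The two TODOs above amount to sorting the probabilities in descending order and indexing the k'th entry. One standard way to do this with numpy, shown on a toy array (illustrative, not necessarily the notebook's intended solution):

```python
import numpy as np

prob_over_tokens = np.array([0.1, 0.5, 0.05, 0.3, 0.05])

def kth_prob(probs, k):
    # Sort from largest to smallest, then take the k'th entry
    sorted_probs = np.sort(probs)[::-1]
    return sorted_probs[k]

print(kth_prob(prob_over_tokens, 0))  # 0.5 (most likely)
print(kth_prob(prob_over_tokens, 1))  # 0.3 (second most likely)
# Recover the token index the same way the notebook does
print(np.where(prob_over_tokens == kth_prob(prob_over_tokens, 1))[0])  # [3]
```

Note that recovering the index via `np.where` assumes probabilities are distinct; with exact ties it would return multiple indices.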
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# We can test this code and see that if we choose the 2nd most likely (K=1) token each time\n",
|
||||
"# then we get much better generation results than if we choose the 2001st most likely token\n",
|
||||
"\n",
|
||||
"# Expected output:\n",
|
||||
"# The best thing about Bath is the way you get the most bang outta the\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"input_tokens['log_prob'] = 0.0\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_kth_most_likely_token(input_tokens, model, tokenizer, k=1)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))\n",
|
||||
"\n",
|
||||
"# Expected output:\n",
|
||||
"# The best thing about Bath is mixed profits partnerships» buy generic+ Honda throttlecont\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"input_tokens['log_prob'] = 0.0\n",
|
||||
"for i in range(10):\n",
|
||||
" input_tokens = get_kth_most_likely_token(input_tokens, model, tokenizer, k=2000)\n",
|
||||
" print(tokenizer.decode(input_tokens[\"input_ids\"][0], skip_special_tokens=True))\n",
|
||||
"\n",
|
||||
"# TODO -- play around with different values of K"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "6kSc0WrTELMd"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Print out each beam plus the log probability\n",
|
||||
"def print_beams(beams):\n",
|
||||
" for index,beam in enumerate(beams):\n",
|
||||
" print(\"Beam %d, Prob %3.3f: \"%(index,beam['log_prob'])+tokenizer.decode(beam[\"input_ids\"][0], skip_special_tokens=True))\n",
|
||||
" print('---')\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# TODO: Read this code carefully!\n",
|
||||
"def do_beam_search(input_tokens_in, model, tokenizer, n_beam=5, beam_length=10): \n",
|
||||
" # Store beams in a list\n",
|
||||
"  input_tokens_in['log_prob'] = 0.0\n",
|
||||
"\n",
|
||||
" # Initialize with n_beam most likely continuations\n",
|
||||
" beams = [None] * n_beam\n",
|
||||
" for c_k in range(n_beam):\n",
|
||||
" beams[c_k] = dict(input_tokens_in)\n",
|
||||
" beams[c_k] = get_kth_most_likely_token(beams[c_k], model, tokenizer, c_k)\n",
|
||||
" \n",
|
||||
" print_beams(beams)\n",
|
||||
" \n",
|
||||
" # For each token in the sequence we will add\n",
|
||||
" for c_pos in range(beam_length-1):\n",
|
||||
"    # Now for each beam, we continue it in the n_beam most likely ways, giving n_beam*n_beam hypotheses\n",
|
||||
" beams_all = [None] * (n_beam*n_beam)\n",
|
||||
" log_probs_all = np.zeros(n_beam*n_beam)\n",
|
||||
" # For each current hypothesis\n",
|
||||
" for c_beam in range(n_beam):\n",
|
||||
" # For each continuation\n",
|
||||
" for c_k in range(n_beam):\n",
|
||||
" # Store the continuation and the probability\n",
|
||||
" beams_all[c_beam * n_beam + c_k] = dict(get_kth_most_likely_token(beams[c_beam], model, tokenizer, c_k))\n",
|
||||
" log_probs_all[c_beam * n_beam + c_k] = beams_all[c_beam * n_beam + c_k]['log_prob']\n",
|
||||
" \n",
|
||||
" # Keep the best n_beams sequences with the highest probabilities\n",
|
||||
" sorted_index = np.argsort(np.array(log_probs_all)*-1)\n",
|
||||
" for c_k in range(n_beam):\n",
|
||||
" beams[c_k] = dict(beams_all[sorted_index[c_k]])\n",
|
||||
"\n",
|
||||
" # Print the beams\n",
|
||||
" print_beams(beams)\n",
|
||||
"\n",
|
||||
" return beams[0]"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "Y4hFfwPFFxka"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
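`do_beam_search` ranks hypotheses by `log_prob`, the running sum of log token probabilities, rather than by the product of raw probabilities. This is a numerical necessity, as a toy example shows:

```python
import numpy as np

# Multiplying many small probabilities underflows to zero...
probs = np.full(500, 0.01)
print(np.prod(probs))         # 0.0 (underflow)

# ...but summing log probabilities stays well-behaved, and since log is
# monotonic, it preserves the ranking of the hypotheses
print(np.sum(np.log(probs)))  # about -2302.6
```

This is why the beams accumulate `np.log(prob)` at each step instead of multiplying probabilities directly.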
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Expected output:\n",
|
||||
"# The best thing about Bath is that it's a place where you don't have to\n",
|
||||
"\n",
|
||||
"set_seed(0)\n",
|
||||
"input_txt = \"The best thing about Bath is\"\n",
|
||||
"input_tokens = tokenizer(input_txt, return_tensors='pt')\n",
|
||||
"\n",
|
||||
"# Now let's call the beam search\n",
|
||||
"# It takes a while as it has to run the model multiple times to add a token\n",
|
||||
"n_beams = 5\n",
|
||||
"best_beam = do_beam_search(input_tokens, model, tokenizer, n_beam=n_beams)\n",
|
||||
"print(\"Beam search result:\")\n",
|
||||
"print(tokenizer.decode(best_beam[\"input_ids\"][0], skip_special_tokens=True))\n",
|
||||
"\n",
|
||||
"# You should see that the best answer is not the same as the greedy result we found above"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "0YWKwZmz4NXb"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"You can read about more decoding strategies in this blog (which uses a recursive neural network, not a transformer, but the principles are the same).\n",
|
||||
"\n",
|
||||
"https://www.borealisai.com/research-blogs/tutorial-6-neural-natural-language-generation-decoding-algorithms/\n",
|
||||
"\n",
|
||||
"You can also look at other possible language models via hugging face:\n",
|
||||
"\n",
|
||||
"https://huggingface.co/docs/transformers/v4.25.1/en/model_summary#decoders-or-autoregressive-models\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "-SXpjZPYsMhv"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
BIN
Slides/CM20315_01_Intro_01.pptx
Normal file
BIN
Slides/CM20315_01_Intro_01.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_01_Intro_02.pptx
Normal file
BIN
Slides/CM20315_01_Intro_02.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_02_Supervised.pptx
Normal file
BIN
Slides/CM20315_02_Supervised.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_03_Shallow.pptx
Normal file
BIN
Slides/CM20315_03_Shallow.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_04_Deep.pptx
Normal file
BIN
Slides/CM20315_04_Deep.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_05_Loss.pptx
Normal file
BIN
Slides/CM20315_05_Loss.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_05a_Catchup.pptx
Normal file
BIN
Slides/CM20315_05a_Catchup.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_06_Fitting.pptx
Normal file
BIN
Slides/CM20315_06_Fitting.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_07_Gradients.pptx
Normal file
BIN
Slides/CM20315_07_Gradients.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_08_Performance.pptx
Normal file
BIN
Slides/CM20315_08_Performance.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_09_Regularization.pptx
Normal file
BIN
Slides/CM20315_09_Regularization.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_10_Convolutional.pptx
Normal file
BIN
Slides/CM20315_10_Convolutional.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_10_Convolutional_2.pptx
Normal file
BIN
Slides/CM20315_10_Convolutional_2.pptx
Normal file
Binary file not shown.
BIN
Slides/CM20315_13_Unsupervised.pptx
Normal file
BIN
Slides/CM20315_13_Unsupervised.pptx
Normal file
Binary file not shown.
190
Training_I.ipynb
Normal file
190
Training_I.ipynb
Normal file
@@ -0,0 +1,190 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"authorship_tag": "ABX9TyMUqLjI8VIQXHOYx0I37OmR",
|
||||
"include_colab_link": true
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Training_I.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"# Training\n",
|
||||
"\n",
|
||||
"We now have a model and a loss function which we can use to judge how good that model is. It's time to put the \"learning\" into machine learning.\n",
|
||||
"\n",
|
||||
"Learning involves finding the parameters that minimize the loss. That might seem like it's not too hard, but modern models might have billions of parameters. There's an exponential number of possible parameter combinations, and there's no way we can make any progress with exhaustive search.\n",
|
||||
"\n",
|
||||
"We'll build this up in stages. In this practical, we'll just consider 1D search using a bracketing approach. In part II, we'll extend to fitting the linear regression model (which has a convex loss function). Then in part III, we'll consider non-convex loss functions.\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "el8l05WQEO46"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "xhmIOLiZELV_"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import libraries\n",
|
||||
"import numpy as np\n",
|
||||
"import matplotlib.pyplot as plt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Let's create a simple 1D function\n",
|
||||
"def loss_function(phi):\n",
|
||||
" return 1- 0.5 * np.exp(-(phi-0.65)*(phi-0.65)/0.1) - 0.45 *np.exp(-(phi-0.35)*(phi-0.35)/0.02)\n",
|
||||
"\n",
|
||||
"def draw_function(loss_function,a=None, b=None, c=None, d=None):\n",
|
||||
" # Plot the function \n",
|
||||
" phi_plot = np.arange(0,1,0.01);\n",
|
||||
" fig,ax = plt.subplots()\n",
|
||||
" ax.plot(phi_plot,loss_function(phi_plot),'r-')\n",
|
||||
" ax.set_xlim(0,1); ax.set_ylim(0,1)\n",
|
||||
" ax.set_xlabel('$\\phi$'); ax.set_ylabel('$L[\\phi]$')\n",
|
||||
" if a is not None and b is not None and c is not None and d is not None:\n",
|
||||
" plt.axvspan(a, d, facecolor='k', alpha=0.2)\n",
|
||||
" ax.plot([a,a],[0,1],'b-')\n",
|
||||
" ax.plot([b,b],[0,1],'b-')\n",
|
||||
" ax.plot([c,c],[0,1],'b-')\n",
|
||||
" ax.plot([d,d],[0,1],'b-')\n",
|
||||
" plt.show()\n"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "qFRe9POHF2le"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"# Draw this function\n",
|
||||
"draw_function(loss_function)"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "TXx1Tpd1Tl-I"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Now let's create a line search procedure to find the minimum in the range [0,1]."
|
||||
],
|
||||
"metadata": {
|
||||
"id": "QU5mdGvpTtEG"
|
||||
}
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"def line_search(loss_function, thresh=.0001, max_iter = 10, draw_flag = False):\n",
|
||||
"\n",
|
||||
"  # Initialize four points along the range we are going to search\n",
|
||||
" a = 0\n",
|
||||
" b = 0.33\n",
|
||||
" c = 0.66\n",
|
||||
" d = 1.0\n",
|
||||
" n_iter =0;\n",
|
||||
" \n",
|
||||
" # While we haven't found the minimum closely enough\n",
|
||||
" while np.abs(b-c) > thresh and n_iter < max_iter:\n",
|
||||
" # Increment iteration counter (just to prevent an infinite loop)\n",
|
||||
" n_iter = n_iter+1\n",
|
||||
" # Calculate all four points\n",
|
||||
" lossa = loss_function(a)\n",
|
||||
" lossb = loss_function(b)\n",
|
||||
" lossc = loss_function(c)\n",
|
||||
" lossd = loss_function(d)\n",
|
||||
"\n",
|
||||
" if draw_flag:\n",
|
||||
" draw_function(loss_function, a,b,c,d)\n",
|
||||
"\n",
|
||||
" print('Iter %d, a=%3.3f, b=%3.3f, c=%3.3f, d=%3.3f'%(n_iter, a,b,c,d))\n",
|
||||
"\n",
|
||||
" # Rule #1 If point A is less than points B, C, and D then halve values of B,C, and D\n",
|
||||
" # i.e. bring them closer to the original point\n",
|
||||
" # TODO REPLACE THE BLOCK OF CODE BELOW WITH THIS RULE\n",
|
||||
" if (0):\n",
|
||||
" continue;\n",
|
||||
"\n",
|
||||
" # Rule #2 If point b is less than point c then\n",
|
||||
" # then point d becomes point c, and\n",
|
||||
" # point b becomes 1/3 between a and new d\n",
|
||||
"    #     point c becomes 2/3 between a and new d\n",
|
||||
" # TODO REPLACE THE BLOCK OF CODE BELOW WITH THIS RULE\n",
|
||||
" if (0):\n",
|
||||
" continue;\n",
|
||||
"\n",
|
||||
" # Rule #3 If point c is less than point b then\n",
|
||||
" # then point a becomes point b, and\n",
|
||||
" # point b becomes 1/3 between new a and d\n",
|
||||
"    #     point c becomes 2/3 between new a and d\n",
|
||||
" # TODO REPLACE THE BLOCK OF CODE BELOW WITH THIS RULE\n",
|
||||
" if(0):\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" # TODO -- FINAL SOLUTION IS AVERAGE OF B and C\n",
|
||||
" # REPLACE THIS LINE\n",
|
||||
" soln = 1\n",
|
||||
" \n",
|
||||
" return soln"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "K-NTHpAAHlCl"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
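For reference, the bracketing idea can be sketched end-to-end on a toy convex function. This is a minimal sketch under simplifying assumptions (a made-up quadratic loss, and it omits the practical's Rule #1); it is not the practical's intended solution.

```python
def toy_loss(phi):
    # A simple convex function with its minimum at phi = 0.4
    return (phi - 0.4) ** 2

def bracket_search(loss, thresh=1e-4, max_iter=100):
    # Four points spanning the search range [0, 1]
    a, b, c, d = 0.0, 0.33, 0.66, 1.0
    n_iter = 0
    while abs(b - c) > thresh and n_iter < max_iter:
        n_iter += 1
        if loss(b) < loss(c):
            # Minimum lies in [a, c]: shrink the bracket from the right
            d = c
        else:
            # Minimum lies in [b, d]: shrink the bracket from the left
            a = b
        # Re-place b and c at one and two thirds of the new interval
        b = a + (d - a) / 3
        c = a + 2 * (d - a) / 3
    # Final solution is the average of b and c
    return (b + c) / 2

print(bracket_search(toy_loss))  # close to 0.4
```

Each iteration shrinks the bracket by a factor of 2/3, so the interval width decays geometrically and roughly twenty iterations suffice for the default threshold.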
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"soln = line_search(loss_function, draw_flag=True)\n",
|
||||
"print('Soln = %3.3f, loss = %3.3f'%(soln,loss_function(soln)))"
|
||||
],
|
||||
"metadata": {
|
||||
"id": "YVq6rmaWRD2M"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "tOLd0gtdRLLS"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
129
index.html
129
index.html
@@ -1,51 +1,78 @@
|
||||
<h1>Understanding Deep Learning</h1>
|
||||
by Simon J.D. Prince
|
||||
<br>
|
||||
To be published by MIT Press.
|
||||
|
||||
|
||||
<h2>Key links</h2>
|
||||
<ul>
|
||||
<li> <a href="https://github.com/udlbook/udlbook/releases/download/v0.2.0/UnderstandingDeepLearning_01_10_22_C.pdf">Draft PDF Chapters 2-13</a> 2022-10-01. CC-BY-NC-ND license
|
||||
<li> Draft PDF Chapters 1,14-19 (coming Jan 2nd, 2023)
|
||||
<li> Jupyter notebooks (coming Spring 2023)
|
||||
<li> Report errata via <a href="https://github.com/udlbook/udlbook/issues">github</a> or contact me directly at udlbookmail@gmail.com
|
||||
<li> Follow me on <a href="https://twitter.com/SimonPrinceAI">Twitter</a> or <a href="https://www.linkedin.com/in/simon-prince-615bb9165/">LinkedIn</a> for updates.
|
||||
</ul>
|
||||
|
||||
<h2>Table of contents</h2>
|
||||
<ul>
|
||||
<li> Chapter 1 - Introduction
|
||||
<li> Chapter 2 - Supervised learning <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap2PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap2.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 3 - Shallow neural networks <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap3PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap3.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 4 - Deep neural networks <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap4PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap4.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 5 - Loss functions <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap5PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap5.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 6 - Training models <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap6PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap6.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 7 - Gradients and initialization <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap7PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap7.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 8 - Measuring performance <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap8PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap8.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 9 - Regularization <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap9PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap9.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 10 - Convolutional nets <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap10PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap10.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 11 - Residual networks <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap11PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap11.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 12 - Transformers <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap12PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap12.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 13 - Graph neural networks <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap13PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap13.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 14 - Variational auto-encoders
|
||||
<li> Chapter 15 - Normalizing flows
|
||||
<li> Chapter 16 - Generative adversarial networks
|
||||
<li> Chapter 17 - Diffusion models
|
||||
<li> Chapter 18 - Deep reinforcement learning
|
||||
<li> Chapter 19 - Why does deep learning work?
|
||||
</ul>
|
||||
|
||||
<br>
|
||||
Citation:
|
||||
<pre><code>
|
||||
@book{prince2022understanding,
|
||||
author = "Simon J.D. Prince",
|
||||
title = "Understanding Deep Learning",
|
||||
publisher = "MIT Press",
|
||||
year = 2022,
|
||||
url = "https://udlbook.github.io/udlbook/"
|
||||
}
|
||||
</code></pre>
|
||||
|
||||
|
||||
<h1>Understanding Deep Learning</h1>
|
||||
by Simon J.D. Prince
|
||||
<br>
|
||||
To be published by MIT Press.
|
||||
|
||||
<h2> Download draft PDF </h2>
|
||||
|
||||
<a href="https://github.com/udlbook/udlbook/releases/download/v1.0.1/UnderstandingDeepLearning_26_04_23_C.pdf">Draft PDF Chapters 1-21</a><br> 2023-04-26. CC-BY-NC-ND license
|
||||
<br>
|
||||
<img src="https://img.shields.io/github/downloads/udlbook/udlbook/total" alt="download stats shield">
|
||||
<br>
|
||||
<ul>
|
||||
<li> Appendices and notebooks coming soon
|
||||
<li> Report errata via <a href="https://github.com/udlbook/udlbook/issues">github</a> or contact me directly at udlbookmail@gmail.com
|
||||
<li> Follow me on <a href="https://twitter.com/SimonPrinceAI">Twitter</a> or <a href="https://www.linkedin.com/in/simon-prince-615bb9165/">LinkedIn</a> for updates.
|
||||
</ul>
|
||||
|
||||
<h2>Table of contents</h2>
|
||||
<ul>
|
||||
<li> Chapter 1 - Introduction
|
||||
<li> Chapter 2 - Supervised learning
|
||||
<li> Chapter 3 - Shallow neural networks
|
||||
<li> Chapter 4 - Deep neural networks
|
||||
<li> Chapter 5 - Loss functions
|
||||
<li> Chapter 6 - Training models
|
||||
<li> Chapter 7 - Gradients and initialization
|
||||
<li> Chapter 8 - Measuring performance
|
||||
<li> Chapter 9 - Regularization
|
||||
<li> Chapter 10 - Convolutional networks
|
||||
<li> Chapter 11 - Residual networks
|
||||
<li> Chapter 12 - Transformers
|
||||
<li> Chapter 13 - Graph neural networks
|
||||
<li> Chapter 14 - Unsupervised learning
|
||||
<li> Chapter 15 - Generative adversarial networks
|
||||
<li> Chapter 16 - Normalizing flows
|
||||
<li> Chapter 17 - Variational autoencoders
|
||||
<li> Chapter 18 - Diffusion models
|
||||
<li> Chapter 19 - Deep reinforcement learning
|
||||
<li> Chapter 20 - Why does deep learning work?
|
||||
<li> Chapter 21 - Deep learning and ethics
|
||||
</ul>
|
||||
|
||||
<br>
|
||||
Citation:
|
||||
<pre><code>
|
||||
@book{prince2023understanding,
|
||||
author = "Simon J.D. Prince",
|
||||
title = "Understanding Deep Learning",
|
||||
publisher = "MIT Press",
|
||||
year = 2023,
|
||||
url = "https://udlbook.github.io/udlbook/"
|
||||
}
|
||||
</code></pre>
|
||||
|
||||
<h2>Resources for instructors </h2>
|
||||
<ul>
|
||||
<li> Chapter 1 - Introduction
|
||||
<li> Chapter 2 - Supervised learning: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap2PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap2.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 3 - Shallow neural networks: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap3PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap3.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 4 - Deep neural networks: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap4PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap4.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 5 - Loss functions: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap5PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap5.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 6 - Training models: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap6PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap6.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 7 - Gradients and initialization: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap7PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap7.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 8 - Measuring performance: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap8PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap8.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 9 - Regularization: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap9PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap9.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 10 - Convolutional networks: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap10PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap10.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 11 - Residual networks: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap11PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap11.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 12 - Transformers: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap12PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap12.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 13 - Graph neural networks: Slides / Notebooks / <a href="https://github.com/udlbook/udlbook/raw/main/PDFFigures/UDLChap13PDF.zip">PDF Figures</a> / <a href="https://github.com/udlbook/udlbook/raw/main/Slides/UDLChap13.pptx">PowerPoint Figures</a>
|
||||
<li> Chapter 14 - Unsupervised learning: Slides / Notebooks / PDF Figures / Powerpoint Figures
|
||||
<li> Chapter 15 - Generative adversarial networks: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
<li> Chapter 16 - Normalizing flows: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
<li> Chapter 17 - Variational autoencoders: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
<li> Chapter 18 - Diffusion models: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
<li> Chapter 19 - Deep reinforcement learning: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
<li> Chapter 20 - Why does deep learning work?: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
<li> Chapter 21 - Deep learning and ethics: Slides / Notebooks / PDF Figures / PowerPoint Figures
|
||||
</ul>
|
||||
|
||||
BIN
practicals/test_data_x.npy
Normal file
BIN
practicals/test_data_x.npy
Normal file
Binary file not shown.
BIN
practicals/test_data_y.npy
Normal file
BIN
practicals/test_data_y.npy
Normal file
Binary file not shown.
40
practicals/train_data_x.csv
Normal file
40
practicals/train_data_x.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
practicals/train_data_x.npy
Normal file
BIN
practicals/train_data_x.npy
Normal file
Binary file not shown.
4000
practicals/train_data_y.csv
Normal file
4000
practicals/train_data_y.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
practicals/train_data_y.npy
Normal file
BIN
practicals/train_data_y.npy
Normal file
Binary file not shown.
40
practicals/val_data_x.csv
Normal file
40
practicals/val_data_x.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
practicals/val_data_x.npy
Normal file
BIN
practicals/val_data_x.npy
Normal file
Binary file not shown.
2000
practicals/val_data_y.csv
Normal file
2000
practicals/val_data_y.csv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
practicals/val_data_y.npy
Normal file
BIN
practicals/val_data_y.npy
Normal file
Binary file not shown.