From 22fd25fcc1eafbc180b527cacc306e441b868e1c Mon Sep 17 00:00:00 2001 From: udlbook <110402648+udlbook@users.noreply.github.com> Date: Mon, 14 Nov 2022 17:02:26 +0000 Subject: [PATCH] Created using Colaboratory --- CM20315_Gradients_III.ipynb | 351 ++++++++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 CM20315_Gradients_III.ipynb diff --git a/CM20315_Gradients_III.ipynb b/CM20315_Gradients_III.ipynb new file mode 100644 index 0000000..123292c --- /dev/null +++ b/CM20315_Gradients_III.ipynb @@ -0,0 +1,351 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyPr1jNETAJLP27xFPVEC09J", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Initialization\n", + "\n", + "In this practical, we'll investigate the what happens to the activations and the forward pass if we don't initialize the parameters sensibly." + ], + "metadata": { + "id": "L6chybAVFJW2" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LdIDglk1FFcG" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "source": [ + "First let's define a neural network. 
We'll just choose the weights and biases randomly for now
\n", + " all_f = [None] * (K+1)\n", + " all_h = [None] * (K+1)\n", + "\n", + " #For convenience, we'll set \n", + " # all_h[0] to be the input, and all_f[K] will be the output\n", + " all_h[0] = net_input\n", + "\n", + " # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n", + " for layer in range(K):\n", + " # Update preactivations and activations at this layer according to eqn 7.5\n", + " all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n", + " all_h[layer+1] = ReLU(all_f[layer])\n", + "\n", + " # Compute the output from the last hidden layer\n", + " all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n", + "\n", + " # Retrieve the output\n", + " net_output = all_f[K]\n", + "\n", + " return net_output, all_f, all_h" + ], + "metadata": { + "id": "LgquJUJvJPaN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's investigate how this the size of the outputs vary as we change the initialization variance:\n" + ], + "metadata": { + "id": "bIUrcXnOqChl" + } + }, + { + "cell_type": "code", + "source": [ + "# Number of layers\n", + "K = 5\n", + "# Number of neurons per layer\n", + "D = 8\n", + " # Input layer\n", + "D_i = 1\n", + "# Output layer \n", + "D_o = 1\n", + "# Set variance of initial weights to 1\n", + "sigma_sq_omega = 1.0\n", + "# Initialize parameters\n", + "all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", + "\n", + "n_data = 1000\n", + "data_in = np.random.normal(size=(1,n_data))\n", + "net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", + "\n", + "for layer in range(K):\n", + " print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))" + ], + "metadata": { + "id": "A55z3rKBqO7M" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# You can see that the values of the hidden units are increasing on average (the variance is across 
all hidden units at the layer \n", + "# and the 1000 training examples\n", + "\n", + "# TO DO \n", + "# Change this to 50 layers with 80 hidden units per layer\n", + "\n", + "# TO DO \n", + "# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation explode" + ], + "metadata": { + "id": "VL_SO4tar3DC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's define a loss function. We'll just use the least squaures loss function. We'll also write a function to compute dloss_doutput\n" + ], + "metadata": { + "id": "SxVTKp3IcoBF" + } + }, + { + "cell_type": "code", + "source": [ + "def least_squares_loss(net_output, y):\n", + " return np.sum((net_output-y) * (net_output-y))\n", + "\n", + "def d_loss_d_output(net_output, y):\n", + " return 2*(net_output -y); " + ], + "metadata": { + "id": "6XqWSYWJdhQR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Here's the code for the backward pass" + ], + "metadata": { + "id": "98WmyqFYWA-0" + } + }, + { + "cell_type": "code", + "source": [ + "# We'll need the indicator function\n", + "def indicator_function(x):\n", + " x_in = np.array(x)\n", + " x_in[x_in>=0] = 1\n", + " x_in[x_in<0] = 0\n", + " return x_in\n", + "\n", + "# Main backward pass routine\n", + "def backward_pass(all_weights, all_biases, all_f, all_h, y):\n", + " # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n", + " all_dl_dweights = [None] * (K+1)\n", + " all_dl_dbiases = [None] * (K+1)\n", + " # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists\n", + " all_dl_df = [None] * (K+1)\n", + " all_dl_dh = [None] * (K+1)\n", + " # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[k] in the net output\n", + "\n", + " # Compute derivatives of net output with respect to loss\n", + " all_dl_df[K] = 
np.array(d_loss_d_output(all_f[K],y))\n", + "\n", + " # Now work backwards through the network\n", + " for layer in range(K,-1,-1):\n", + " # Calculate the derivatives of biases at layer from all_dl_df[K]. (eq 7.13, line 1)\n", + " all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n", + " # Calculate the derivatives of weight at layer from all_dl_df[K] and all_h[K] (eq 7.13, line 2)\n", + " all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n", + "\n", + " # Calculate the derivatives of activations from weight and derivatives of next preactivations (eq 7.13, line 3 second part)\n", + " all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n", + " # Calculate the derivatives of the pre-activation f with respect to activation h (eq 7.13, line 3, first part)\n", + " if layer > 0:\n", + " all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n", + "\n", + " return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df" + ], + "metadata": { + "id": "LJng7WpRPLMz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's look at what happens to the magnitude of the gradients on the way back." 
+ ], + "metadata": { + "id": "phFnbthqwhFi" + } + }, + { + "cell_type": "code", + "source": [ + "# Number of layers\n", + "K = 5\n", + "# Number of neurons per layer\n", + "D = 8\n", + " # Input layer\n", + "D_i = 1\n", + "# Output layer \n", + "D_o = 1\n", + "# Set variance of initial weights to 1\n", + "sigma_sq_omega = 1.0\n", + "# Initialize parameters\n", + "all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", + "\n", + "# For simplicity we'll just consider the gradients of the weights and biases between the first and last hidden layer\n", + "n_data = 100\n", + "aggregate_dl_df = [None] * (K+1)\n", + "for layer in range(1,K):\n", + " # These 3D arrays will store the gradients for every data point\n", + " aggregate_dl_df[layer] = np.zeros((D,n_data))\n", + "\n", + "\n", + "# We'll have to compute the derivatives of the parameters for each data point separately\n", + "for c_data in range(n_data):\n", + " data_in = np.random.normal(size=(1,1))\n", + " y = np.zeros((1,1))\n", + " net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", + " all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n", + " for layer in range(1,K):\n", + " aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])\n", + "\n", + "for layer in range(1,K):\n", + " print(\"Layer %d, std of dl_dh = %3.3f\"%(layer, np.std(aggregate_dl_df[layer].ravel())))\n" + ], + "metadata": { + "id": "9A9MHc4sQvbp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n", + "# and the 1000 training examples\n", + "\n", + "# TO DO \n", + "# Change this to 50 layers with 80 hidden units per layer\n", + "\n", + "# TO DO \n", + "# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding" + ], + "metadata": 
{ + "id": "gtokc0VX0839" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file