{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [], "authorship_tag": "ABX9TyPr1jNETAJLP27xFPVEC09J", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] },
{ "cell_type": "markdown", "source": [ "# Initialization\n", "\n", "In this practical, we'll investigate what happens to the activations in the forward pass if we don't initialize the parameters sensibly." ], "metadata": { "id": "L6chybAVFJW2" } },
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "LdIDglk1FFcG" }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt" ] },
{ "cell_type": "markdown", "source": [ "First let's define a neural network. We'll just choose the weights and biases randomly for now." ], "metadata": { "id": "nnUoI0m6GyjC" } },
{ "cell_type": "code", "source": [ "def init_params(K, D, sigma_sq_omega):\n", "  # Set seed so we always get the same random numbers\n", "  np.random.seed(0)\n", "\n", "  # Input layer\n", "  D_i = 1\n", "  # Output layer\n", "  D_o = 1\n", "\n", "  # Make empty lists\n", "  all_weights = [None] * (K+1)\n", "  all_biases = [None] * (K+1)\n", "\n", "  # Create input and output layers\n", "  all_weights[0] = np.random.normal(size=(D, D_i)) * np.sqrt(sigma_sq_omega)\n", "  all_weights[-1] = np.random.normal(size=(D_o, D)) * np.sqrt(sigma_sq_omega)\n", "  all_biases[0] = np.zeros((D,1))\n", "  all_biases[-1] = np.zeros((D_o,1))\n", "\n", "  # Create intermediate layers\n", "  for layer in range(1,K):\n", "    all_weights[layer] = np.random.normal(size=(D,D)) * np.sqrt(sigma_sq_omega)\n", "    all_biases[layer] = np.zeros((D,1))\n", "\n", "  return all_weights, all_biases" ], "metadata": { "id": "WVM4Tc_jGI0Q" }, "execution_count": null, "outputs": [] },
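{ "cell_type": "markdown", "source": [ "For example, here is a quick check of the shapes that init_params returns (a minimal sketch; the layer sizes below are arbitrary choices):" ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Illustrative shape check for init_params (the sizes here are arbitrary)\n", "all_weights_check, all_biases_check = init_params(K=3, D=4, sigma_sq_omega=1.0)\n", "for layer in range(len(all_weights_check)):\n", "  print(\"Layer %d, weights shape = %s, biases shape = %s\" % (layer, all_weights_check[layer].shape, all_biases_check[layer].shape))" ], "metadata": {}, "execution_count": null, "outputs": [] },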
{ "cell_type": "code", "source": [ "# Define the Rectified Linear Unit (ReLU) function\n", "def ReLU(preactivation):\n", "  activation = preactivation.clip(0.0)\n", "  return activation" ], "metadata": { "id": "jZh-7bPXIDq4" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "def compute_network_output(net_input, all_weights, all_biases):\n", "\n", "  # Retrieve number of layers\n", "  K = len(all_weights) - 1\n", "\n", "  # We'll store the pre-activations at each layer in a list \"all_f\"\n", "  # and the activations in a second list \"all_h\".\n", "  all_f = [None] * (K+1)\n", "  all_h = [None] * (K+1)\n", "\n", "  # For convenience, we'll set\n", "  # all_h[0] to be the input, and all_f[K] will be the output\n", "  all_h[0] = net_input\n", "\n", "  # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n", "  for layer in range(K):\n", "    # Update preactivations and activations at this layer according to eqn 7.5\n", "    all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n", "    all_h[layer+1] = ReLU(all_f[layer])\n", "\n", "  # Compute the output from the last hidden layer\n", "  all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n", "\n", "  # Retrieve the output\n", "  net_output = all_f[K]\n", "\n", "  return net_output, all_f, all_h" ], "metadata": { "id": "LgquJUJvJPaN" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "Now let's investigate how the size of the hidden-unit activations varies as we change the initialization variance:\n" ], "metadata": { "id": "bIUrcXnOqChl" } },
{ "cell_type": "code", "source": [ "# Number of layers\n", "K = 5\n", "# Number of neurons per layer\n", "D = 8\n", "# Input layer\n", "D_i = 1\n", "# Output layer\n", "D_o = 1\n", "# Set variance of initial weights to 1\n", "sigma_sq_omega = 1.0\n", "# Initialize parameters\n", "all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", "\n", "n_data = 1000\n", "data_in = np.random.normal(size=(1,n_data))\n", "net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", "\n", "for layer in range(K):\n", "  print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))" ], "metadata": { "id": "A55z3rKBqO7M" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer\n", "# and the 1000 training examples)\n", "\n", "# TO DO\n", "# Change this to 50 layers with 80 hidden units per layer\n", "\n", "# TO DO\n", "# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation exploding" ], "metadata": { "id": "VL_SO4tar3DC" }, "execution_count": null, "outputs": [] },
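{ "cell_type": "markdown", "source": [ "As a sketch of the kind of experiment the TO DO above suggests: He initialization scales the weight variance with the layer width, setting sigma_sq_omega = 2/D, which keeps the activation variance roughly constant from layer to layer in a ReLU network. The depth and width below are just illustrative choices." ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Sketch: repeat the forward-pass experiment with He initialization (sigma_sq_omega = 2/D)\n", "# The depth and width here are illustrative choices\n", "K = 50\n", "D = 80\n", "sigma_sq_omega = 2.0 / D\n", "all_weights, all_biases = init_params(K, D, sigma_sq_omega)\n", "\n", "n_data = 1000\n", "data_in = np.random.normal(size=(1, n_data))\n", "net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", "\n", "# Print every tenth layer; the standard deviation should now stay roughly constant\n", "for layer in range(0, K, 10):\n", "  print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))" ], "metadata": {}, "execution_count": null, "outputs": [] },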
{ "cell_type": "markdown", "source": [ "Now let's define a loss function. We'll just use the least squares loss function. We'll also write a function to compute dloss_doutput.\n" ], "metadata": { "id": "SxVTKp3IcoBF" } },
{ "cell_type": "code", "source": [ "def least_squares_loss(net_output, y):\n", "  return np.sum((net_output-y) * (net_output-y))\n", "\n", "def d_loss_d_output(net_output, y):\n", "  return 2*(net_output - y)" ], "metadata": { "id": "6XqWSYWJdhQR" }, "execution_count": null, "outputs": [] },
{ "cell_type": "markdown", "source": [ "Here's the code for the backward pass." ], "metadata": { "id": "98WmyqFYWA-0" } },
{ "cell_type": "code", "source": [ "# We'll need the indicator function\n", "def indicator_function(x):\n", "  x_in = np.array(x)\n", "  x_in[x_in>=0] = 1\n", "  x_in[x_in<0] = 0\n", "  return x_in\n", "\n", "# Main backward pass routine\n", "def backward_pass(all_weights, all_biases, all_f, all_h, y):\n", "  # Retrieve number of layers\n", "  K = len(all_weights) - 1\n", "\n", "  # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n", "  all_dl_dweights = [None] * (K+1)\n", "  all_dl_dbiases = [None] * (K+1)\n", "  # And we'll store the derivatives of the loss with respect to the activations and preactivations in lists\n", "  all_dl_df = [None] * (K+1)\n", "  all_dl_dh = [None] * (K+1)\n", "  # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[K] is the net output\n", "\n", "  # Compute derivatives of the loss with respect to the net output\n", "  all_dl_df[K] = np.array(d_loss_d_output(all_f[K],y))\n", "\n", "  # Now work backwards through the network\n", "  for layer in range(K,-1,-1):\n", "    # Calculate the derivatives of the biases at this layer from all_dl_df[layer] (eq 7.13, line 1)\n", "    all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n", "    # Calculate the derivatives of the weights at this layer from all_dl_df[layer] and all_h[layer] (eq 7.13, line 2)\n", "    all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n", "\n", "    # Calculate the derivatives of the activations from the weights and the derivatives of the next preactivations (eq 7.13, line 3, second part)\n", "    all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n", "    # Calculate the derivatives of the preactivations f from the derivatives of the activations h (eq 7.13, line 3, first part)\n", "    if layer > 0:\n", "      all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n", "\n", "  return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df" ], "metadata": { "id": "LJng7WpRPLMz" }, "execution_count": null, "outputs": [] },
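{ "cell_type": "markdown", "source": [ "Before we use it, let's sanity-check the backward pass against finite differences (a minimal sketch; the network size, the layer being checked and the step size are arbitrary choices):" ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Sketch: check backward_pass against finite differences (sizes, layer and step are arbitrary)\n", "K = 3\n", "D = 4\n", "all_weights, all_biases = init_params(K, D, 1.0)\n", "data_in = np.random.normal(size=(1,1))\n", "y = np.zeros((1,1))\n", "\n", "net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", "all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n", "\n", "# Compare against central differences for every weight in one layer\n", "layer = 1\n", "delta = 1e-5\n", "numerical = np.zeros_like(all_weights[layer])\n", "for row in range(numerical.shape[0]):\n", "  for col in range(numerical.shape[1]):\n", "    all_weights[layer][row, col] += delta\n", "    out_plus, _, _ = compute_network_output(data_in, all_weights, all_biases)\n", "    all_weights[layer][row, col] -= 2 * delta\n", "    out_minus, _, _ = compute_network_output(data_in, all_weights, all_biases)\n", "    all_weights[layer][row, col] += delta\n", "    numerical[row, col] = (least_squares_loss(out_plus, y) - least_squares_loss(out_minus, y)) / (2 * delta)\n", "\n", "print(\"Max difference between analytic and finite-difference gradients = %g\" % np.max(np.abs(numerical - all_dl_dweights[layer])))" ], "metadata": {}, "execution_count": null, "outputs": [] },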
{ "cell_type": "markdown", "source": [ "Now let's look at what happens to the magnitude of the gradients on the way back." ], "metadata": { "id": "phFnbthqwhFi" } },
{ "cell_type": "code", "source": [ "# Number of layers\n", "K = 5\n", "# Number of neurons per layer\n", "D = 8\n", "# Input layer\n", "D_i = 1\n", "# Output layer\n", "D_o = 1\n", "# Set variance of initial weights to 1\n", "sigma_sq_omega = 1.0\n", "# Initialize parameters\n", "all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", "\n", "# For simplicity we'll just consider the gradients of the preactivations at the hidden layers\n", "n_data = 100\n", "aggregate_dl_df = [None] * (K+1)\n", "for layer in range(1,K):\n", "  # These 2D arrays will store the gradients for every data point\n", "  aggregate_dl_df[layer] = np.zeros((D,n_data))\n", "\n", "\n", "# We'll have to compute the derivatives of the parameters for each data point separately\n", "for c_data in range(n_data):\n", "  data_in = np.random.normal(size=(1,1))\n", "  y = np.zeros((1,1))\n", "  net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", "  all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n", "  for layer in range(1,K):\n", "    aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])\n", "\n", "for layer in range(1,K):\n", "  print(\"Layer %d, std of dl_df = %3.3f\"%(layer, np.std(aggregate_dl_df[layer].ravel())))\n" ], "metadata": { "id": "9A9MHc4sQvbp" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "# You can see that the magnitudes of the gradients are increasing on average as we move backwards through\n", "# the network (the variance is across all hidden units at the layer and the 100 training examples)\n", "\n", "# TO DO\n", "# Change this to 50 layers with 80 hidden units per layer\n", "\n", "# TO DO\n", "# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding" ], "metadata": { "id": "gtokc0VX0839" }, "execution_count": null, "outputs": [] } ] }