From 22fd25fcc1eafbc180b527cacc306e441b868e1c Mon Sep 17 00:00:00 2001 From: udlbook <110402648+udlbook@users.noreply.github.com> Date: Mon, 14 Nov 2022 17:02:26 +0000 Subject: [PATCH] Created using Colaboratory --- CM20315_Gradients_III.ipynb | 351 ++++++++++++++++++++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 CM20315_Gradients_III.ipynb diff --git a/CM20315_Gradients_III.ipynb b/CM20315_Gradients_III.ipynb new file mode 100644 index 0000000..123292c --- /dev/null +++ b/CM20315_Gradients_III.ipynb @@ -0,0 +1,351 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [], + "authorship_tag": "ABX9TyPr1jNETAJLP27xFPVEC09J", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Initialization\n", + "\n", + "In this practical, we'll investigate the what happens to the activations and the forward pass if we don't initialize the parameters sensibly." + ], + "metadata": { + "id": "L6chybAVFJW2" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LdIDglk1FFcG" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "source": [ + "First let's define a neural network. 
We'll just choose the weights and biases randomly for now
\n", + " all_f = [None] * (K+1)\n", + " all_h = [None] * (K+1)\n", + "\n", + " #For convenience, we'll set \n", + " # all_h[0] to be the input, and all_f[K] will be the output\n", + " all_h[0] = net_input\n", + "\n", + " # Run through the layers, calculating all_f[0...K-1] and all_h[1...K]\n", + " for layer in range(K):\n", + " # Update preactivations and activations at this layer according to eqn 7.5\n", + " all_f[layer] = all_biases[layer] + np.matmul(all_weights[layer], all_h[layer])\n", + " all_h[layer+1] = ReLU(all_f[layer])\n", + "\n", + " # Compute the output from the last hidden layer\n", + " all_f[K] = all_biases[K] + np.matmul(all_weights[K], all_h[K])\n", + "\n", + " # Retrieve the output\n", + " net_output = all_f[K]\n", + "\n", + " return net_output, all_f, all_h" + ], + "metadata": { + "id": "LgquJUJvJPaN" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's investigate how this the size of the outputs vary as we change the initialization variance:\n" + ], + "metadata": { + "id": "bIUrcXnOqChl" + } + }, + { + "cell_type": "code", + "source": [ + "# Number of layers\n", + "K = 5\n", + "# Number of neurons per layer\n", + "D = 8\n", + " # Input layer\n", + "D_i = 1\n", + "# Output layer \n", + "D_o = 1\n", + "# Set variance of initial weights to 1\n", + "sigma_sq_omega = 1.0\n", + "# Initialize parameters\n", + "all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", + "\n", + "n_data = 1000\n", + "data_in = np.random.normal(size=(1,n_data))\n", + "net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", + "\n", + "for layer in range(K):\n", + " print(\"Layer %d, std of hidden units = %3.3f\"%(layer, np.std(all_h[layer])))" + ], + "metadata": { + "id": "A55z3rKBqO7M" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# You can see that the values of the hidden units are increasing on average (the variance is across 
all hidden units at the layer \n", + "# and the 1000 training examples\n", + "\n", + "# TO DO \n", + "# Change this to 50 layers with 80 hidden units per layer\n", + "\n", + "# TO DO \n", + "# Now experiment with sigma_sq_omega to try to stop the variance of the forward computation explode" + ], + "metadata": { + "id": "VL_SO4tar3DC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's define a loss function. We'll just use the least squaures loss function. We'll also write a function to compute dloss_doutput\n" + ], + "metadata": { + "id": "SxVTKp3IcoBF" + } + }, + { + "cell_type": "code", + "source": [ + "def least_squares_loss(net_output, y):\n", + " return np.sum((net_output-y) * (net_output-y))\n", + "\n", + "def d_loss_d_output(net_output, y):\n", + " return 2*(net_output -y); " + ], + "metadata": { + "id": "6XqWSYWJdhQR" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Here's the code for the backward pass" + ], + "metadata": { + "id": "98WmyqFYWA-0" + } + }, + { + "cell_type": "code", + "source": [ + "# We'll need the indicator function\n", + "def indicator_function(x):\n", + " x_in = np.array(x)\n", + " x_in[x_in>=0] = 1\n", + " x_in[x_in<0] = 0\n", + " return x_in\n", + "\n", + "# Main backward pass routine\n", + "def backward_pass(all_weights, all_biases, all_f, all_h, y):\n", + " # We'll store the derivatives dl_dweights and dl_dbiases in lists as well\n", + " all_dl_dweights = [None] * (K+1)\n", + " all_dl_dbiases = [None] * (K+1)\n", + " # And we'll store the derivatives of the loss with respect to the activation and preactivations in lists\n", + " all_dl_df = [None] * (K+1)\n", + " all_dl_dh = [None] * (K+1)\n", + " # Again for convenience we'll stick with the convention that all_h[0] is the net input and all_f[k] in the net output\n", + "\n", + " # Compute derivatives of net output with respect to loss\n", + " all_dl_df[K] = 
np.array(d_loss_d_output(all_f[K],y))\n", + "\n", + " # Now work backwards through the network\n", + " for layer in range(K,-1,-1):\n", + " # Calculate the derivatives of biases at layer from all_dl_df[K]. (eq 7.13, line 1)\n", + " all_dl_dbiases[layer] = np.array(all_dl_df[layer])\n", + " # Calculate the derivatives of weight at layer from all_dl_df[K] and all_h[K] (eq 7.13, line 2)\n", + " all_dl_dweights[layer] = np.matmul(all_dl_df[layer], all_h[layer].transpose())\n", + "\n", + " # Calculate the derivatives of activations from weight and derivatives of next preactivations (eq 7.13, line 3 second part)\n", + " all_dl_dh[layer] = np.matmul(all_weights[layer].transpose(), all_dl_df[layer])\n", + " # Calculate the derivatives of the pre-activation f with respect to activation h (eq 7.13, line 3, first part)\n", + " if layer > 0:\n", + " all_dl_df[layer-1] = indicator_function(all_f[layer-1]) * all_dl_dh[layer]\n", + "\n", + " return all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df" + ], + "metadata": { + "id": "LJng7WpRPLMz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now let's look at what happens to the magnitude of the gradients on the way back." 
+ ], + "metadata": { + "id": "phFnbthqwhFi" + } + }, + { + "cell_type": "code", + "source": [ + "# Number of layers\n", + "K = 5\n", + "# Number of neurons per layer\n", + "D = 8\n", + " # Input layer\n", + "D_i = 1\n", + "# Output layer \n", + "D_o = 1\n", + "# Set variance of initial weights to 1\n", + "sigma_sq_omega = 1.0\n", + "# Initialize parameters\n", + "all_weights, all_biases = init_params(K,D,sigma_sq_omega)\n", + "\n", + "# For simplicity we'll just consider the gradients of the weights and biases between the first and last hidden layer\n", + "n_data = 100\n", + "aggregate_dl_df = [None] * (K+1)\n", + "for layer in range(1,K):\n", + " # These 3D arrays will store the gradients for every data point\n", + " aggregate_dl_df[layer] = np.zeros((D,n_data))\n", + "\n", + "\n", + "# We'll have to compute the derivatives of the parameters for each data point separately\n", + "for c_data in range(n_data):\n", + " data_in = np.random.normal(size=(1,1))\n", + " y = np.zeros((1,1))\n", + " net_output, all_f, all_h = compute_network_output(data_in, all_weights, all_biases)\n", + " all_dl_dweights, all_dl_dbiases, all_dl_dh, all_dl_df = backward_pass(all_weights, all_biases, all_f, all_h, y)\n", + " for layer in range(1,K):\n", + " aggregate_dl_df[layer][:,c_data] = np.squeeze(all_dl_df[layer])\n", + "\n", + "for layer in range(1,K):\n", + " print(\"Layer %d, std of dl_dh = %3.3f\"%(layer, np.std(aggregate_dl_df[layer].ravel())))\n" + ], + "metadata": { + "id": "9A9MHc4sQvbp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# You can see that the values of the hidden units are increasing on average (the variance is across all hidden units at the layer \n", + "# and the 1000 training examples\n", + "\n", + "# TO DO \n", + "# Change this to 50 layers with 80 hidden units per layer\n", + "\n", + "# TO DO \n", + "# Now experiment with sigma_sq_omega to try to stop the variance of the gradients exploding" + ], + "metadata": 
{ + "id": "gtokc0VX0839" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file