Files
udlbook/Notebooks/Chap06/6_2_Gradient_Descent.ipynb
2023-10-30 17:52:25 +01:00

421 lines
18 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyM/FIXDTd6tZYs6WRzK00hB",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap06/6_2_Gradient_Descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# **Notebook 6.2 Gradient descent**\n",
"\n",
"This notebook recreates the gradient descent algorithm as shown in figure 6.1.\n",
"\n",
"Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
"\n",
"Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions.\n",
"\n"
],
"metadata": {
"id": "el8l05WQEO46"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xhmIOLiZELV_"
},
"outputs": [],
"source": [
"# import libraries\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import cm\n",
"from matplotlib.colors import ListedColormap"
]
},
{
"cell_type": "code",
"source": [
"# Training set: 12 pairs {x_i, y_i} that the straight-line model will be fitted to.\n",
"# Row 0 of `data` holds the inputs x_i and row 1 holds the matching targets y_i.\n",
"data = np.vstack((\n",
"    [0.03, 0.19, 0.34, 0.46, 0.78, 0.81, 1.08, 1.18, 1.39, 1.60, 1.65, 1.90],  # x_i\n",
"    [0.67, 0.85, 1.05, 1.00, 1.40, 1.50, 1.30, 1.54, 1.55, 1.68, 1.73, 1.60],  # y_i\n",
"))"
],
"metadata": {
"id": "4cRkrh9MZ58Z"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# The model is a straight line: phi[0] is the intercept and phi[1] is the slope\n",
"def model(phi,x):\n",
"    \"\"\"Return the line's prediction phi[0] + phi[1] * x for the input(s) x.\"\"\"\n",
"    intercept, slope = phi[0], phi[1]\n",
"    return intercept + slope * x"
],
"metadata": {
"id": "WQUERmb2erAe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Plot the training data together with the line described by phi\n",
"def draw_model(data,model,phi,title=None):\n",
"    \"\"\"Show the data points with the model's prediction drawn over x in [0, 2).\n",
"\n",
"    data  -- array whose row 0 holds the x values and row 1 the y values\n",
"    model -- callable model(phi, x) returning the predictions for x\n",
"    phi   -- parameters handed to `model`\n",
"    title -- optional figure title; omitted when None\n",
"    \"\"\"\n",
"    # Sample the model on a fine grid so it can be drawn as a line\n",
"    x_line = np.arange(0,2,0.01)\n",
"    y_line = model(phi,x_line)\n",
"\n",
"    fig, ax = plt.subplots()\n",
"    ax.plot(data[0,:],data[1,:],'bo')  # blue dots: training data\n",
"    ax.plot(x_line,y_line,'m-')        # magenta line: model\n",
"    ax.set_xlim([0,2])\n",
"    ax.set_ylim([0,2])\n",
"    ax.set_xlabel('x')\n",
"    ax.set_ylabel('y')\n",
"    ax.set_aspect('equal')\n",
"    if title is not None:\n",
"        ax.set_title(title)\n",
"    plt.show()"
],
"metadata": {
"id": "qFRe9POHF2le"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Start from an arbitrary guess for the parameters (a 2x1 column) and draw the resulting line\n",
"phi = np.array([[0.6],    # Intercept\n",
"                [-0.2]])  # Slope\n",
"draw_model(data,model,phi, \"Initial parameters\")\n"
],
"metadata": {
"id": "TXx1Tpd1Tl-I"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's compute the sum of squares loss for the training data."
],
"metadata": {
"id": "QU5mdGvpTtEG"
}
},
{
"cell_type": "code",
"source": [
"def compute_loss(data_x, data_y, model, phi):\n",
" \"\"\"Sum-of-squares loss of `model` with parameters `phi` on the training data.\n",
"\n",
" Exercise stub: as written it ignores its inputs and returns 0. The check cell that\n",
" follows expects 12.367 for phi = [[0.6],[-0.2]] once the function is completed.\n",
"\n",
" data_x -- training inputs\n",
" data_y -- training targets paired with data_x\n",
" model  -- callable model(phi, x) producing predictions\n",
" phi    -- model parameters\n",
" \"\"\"\n",
" # TODO -- Write this function -- replace the line below\n",
" # First make model predictions from data x\n",
" # Then compute the squared difference between the predictions and true y values\n",
" # Then sum them all and return\n",
" pred_y = 0\n",
" loss = 0\n",
"\n",
" return loss"
],
"metadata": {
"id": "I7dqTY2Gg7CR"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Let's just test that we got that right"
],
"metadata": {
"id": "eB5DQvU5hYNx"
}
},
{
"cell_type": "code",
"source": [
"# Evaluate the loss at intercept 0.6, slope -0.2 and print it next to the reference value\n",
"loss = compute_loss(data[0,:], data[1,:], model,\n",
"                    np.array([[0.6],\n",
"                              [-0.2]]))\n",
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 12.367))"
],
"metadata": {
"id": "Ty05UtEEg9tc"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's plot the whole loss function"
],
"metadata": {
"id": "F3trnavPiHpH"
}
},
{
"cell_type": "code",
"source": [
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
"    \"\"\"Draw the loss over a grid of (intercept, slope) values as a filled contour plot.\n",
"\n",
"    compute_loss -- callable compute_loss(data_x, data_y, model, phi) returning the loss\n",
"    data         -- array whose row 0 holds the x values and row 1 the y values\n",
"    model        -- model callable, passed straight through to compute_loss\n",
"    phi_iters    -- optional array of parameter iterates (row 0 intercepts, row 1 slopes);\n",
"                    when given, the path is drawn on top of the contours\n",
"    \"\"\"\n",
"    # Define pretty colormap: one 'rrggbb' hex string per colour\n",
"    my_colormap_vals_hex =('2a0902', '2b0a03', '2c0b04', '2d0c05', '2e0c06', '2f0d07', '300d08', '310e09', '320f0a', '330f0b', '34100b', '35110c', '36110d', '37120e', '38120f', '39130f', '3a1410', '3b1411', '3c1511', '3d1612', '3e1613', '3f1713', '401714', '411814', '421915', '431915', '451a16', '461b16', '471b17', '481c17', '491d18', '4a1d18', '4b1e19', '4c1f19', '4d1f1a', '4e201b', '50211b', '51211c', '52221c', '53231d', '54231d', '55241e', '56251e', '57261f', '58261f', '592720', '5b2821', '5c2821', '5d2922', '5e2a22', '5f2b23', '602b23', '612c24', '622d25', '632e25', '652e26', '662f26', '673027', '683027', '693128', '6a3229', '6b3329', '6c342a', '6d342a', '6f352b', '70362c', '71372c', '72372d', '73382e', '74392e', '753a2f', '763a2f', '773b30', '783c31', '7a3d31', '7b3e32', '7c3e33', '7d3f33', '7e4034', '7f4134', '804235', '814236', '824336', '834437', '854538', '864638', '874739', '88473a', '89483a', '8a493b', '8b4a3c', '8c4b3c', '8d4c3d', '8e4c3e', '8f4d3f', '904e3f', '924f40', '935041', '945141', '955242', '965343', '975343', '985444', '995545', '9a5646', '9b5746', '9c5847', '9d5948', '9e5a49', '9f5a49', 'a05b4a', 'a15c4b', 'a35d4b', 'a45e4c', 'a55f4d', 'a6604e', 'a7614e', 'a8624f', 'a96350', 'aa6451', 'ab6552', 'ac6552', 'ad6653', 'ae6754', 'af6855', 'b06955', 'b16a56', 'b26b57', 'b36c58', 'b46d59', 'b56e59', 'b66f5a', 'b7705b', 'b8715c', 'b9725d', 'ba735d', 'bb745e', 'bc755f', 'bd7660', 'be7761', 'bf7862', 'c07962', 'c17a63', 'c27b64', 'c27c65', 'c37d66', 'c47e67', 'c57f68', 'c68068', 'c78169', 'c8826a', 'c9836b', 'ca846c', 'cb856d', 'cc866e', 'cd876f', 'ce886f', 'ce8970', 'cf8a71', 'd08b72', 'd18c73', 'd28d74', 'd38e75', 'd48f76', 'd59077', 'd59178', 'd69279', 'd7937a', 'd8957b', 'd9967b', 'da977c', 'da987d', 'db997e', 'dc9a7f', 'dd9b80', 'de9c81', 'de9d82', 'df9e83', 'e09f84', 'e1a185', 'e2a286', 'e2a387', 'e3a488', 'e4a589', 'e5a68a', 'e5a78b', 'e6a88c', 'e7aa8d', 'e7ab8e', 'e8ac8f', 'e9ad90', 'eaae91', 'eaaf92', 'ebb093', 'ecb295', 'ecb396', 'edb497', 'eeb598', 'eeb699', 'efb79a', 'efb99b', 'f0ba9c', 'f1bb9d', 'f1bc9e', 'f2bd9f', 'f2bfa1', 'f3c0a2', 'f3c1a3', 'f4c2a4', 'f5c3a5', 'f5c5a6', 'f6c6a7', 'f6c7a8', 'f7c8aa', 'f7c9ab', 'f8cbac', 'f8ccad', 'f8cdae', 'f9ceb0', 'f9d0b1', 'fad1b2', 'fad2b3', 'fbd3b4', 'fbd5b6', 'fbd6b7', 'fcd7b8', 'fcd8b9', 'fcdaba', 'fddbbc', 'fddcbd', 'fddebe', 'fddfbf', 'fee0c1', 'fee1c2', 'fee3c3', 'fee4c5', 'ffe5c6', 'ffe7c7', 'ffe8c9', 'ffe9ca', 'ffebcb', 'ffeccd', 'ffedce', 'ffefcf', 'fff0d1', 'fff2d2', 'fff3d3', 'fff4d5', 'fff6d6', 'fff7d8', 'fff8d9', 'fffada', 'fffbdc', 'fffcdd', 'fffedf', 'ffffe0')\n",
"    my_colormap_vals_dec = np.array([int(element,base=16) for element in my_colormap_vals_hex])\n",
"    # Split each 24-bit colour value into its red, green and blue bytes\n",
"    r = np.floor(my_colormap_vals_dec/(256*256))\n",
"    g = np.floor((my_colormap_vals_dec - r *256 *256)/256)\n",
"    b = np.floor(my_colormap_vals_dec - r * 256 *256 - g * 256)\n",
"    # ListedColormap wants one RGB row per colour with channels scaled to [0, 1]\n",
"    my_colormap = ListedColormap(np.vstack((r,g,b)).transpose()/255.0)\n",
"\n",
"    # Make grid of intercept/slope values to plot\n",
"    # NOTE(review): the slope step (0.002) is ten times finer than the intercept step (0.02),\n",
"    # so compute_loss is called on the order of 100,000 times -- confirm this is intended\n",
"    intercepts_mesh, slopes_mesh = np.meshgrid(np.arange(0.0,2.0,0.02), np.arange(-1.0,1.0,0.002))\n",
"    loss_mesh = np.zeros_like(slopes_mesh)\n",
"    # Compute loss for every set of parameters\n",
"    for idslope, slope in np.ndenumerate(slopes_mesh):\n",
"        loss_mesh[idslope] = compute_loss(data[0,:], data[1,:], model, np.array([[intercepts_mesh[idslope]], [slope]]))\n",
"\n",
"    fig,ax = plt.subplots()\n",
"    fig.set_size_inches(8,8)\n",
"    ax.contourf(intercepts_mesh,slopes_mesh,loss_mesh,256,cmap=my_colormap)\n",
"    ax.contour(intercepts_mesh,slopes_mesh,loss_mesh,40,colors=['#80808080'])\n",
"    if phi_iters is not None:\n",
"        # Overlay the optimisation path, one marker per iterate\n",
"        ax.plot(phi_iters[0,:], phi_iters[1,:],'go-')\n",
"    # Limits are given as [1,-1], so the slope axis is drawn with +1 at the bottom\n",
"    ax.set_ylim([1,-1])\n",
"    # Raw strings stop Python from treating the LaTeX '\\p' as an (invalid) escape sequence\n",
"    ax.set_xlabel(r'Intercept $\\phi_{0}$')\n",
"    ax.set_ylabel(r'Slope, $\\phi_{1}$')\n",
"    plt.show()"
],
"metadata": {
"id": "K-NTHpAAHlCl"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Show the loss surface on its own; no optimisation path is overlaid\n",
"draw_loss_function(compute_loss, data, model, phi_iters=None)"
],
"metadata": {
"id": "l8HbvIupnTME"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now let's compute the gradient vector for a given set of parameters:\n",
"\n",
"\\begin{equation}\n",
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
"\\end{equation}"
],
"metadata": {
"id": "s9Duf05WqqSC"
}
},
{
"cell_type": "code",
"source": [
"# These are in the lecture slides and notes, but worth trying to calculate them yourself to\n",
"# check that you get them right. Write out the expression for the sum of squares loss and take the\n",
"# derivative with respect to phi0 and phi1\n",
"def compute_gradient(data_x, data_y, phi):\n",
" \"\"\"Gradient of the sum-of-squares loss with respect to phi, returned as a 2x1 column.\n",
"\n",
" Exercise stub: as written both derivatives are fixed at 0.0.\n",
"\n",
" data_x -- training inputs\n",
" data_y -- training targets paired with data_x\n",
" phi    -- parameters at which the gradient is wanted\n",
"\n",
" Returns np.array([[dl_dphi0],[dl_dphi1]]).\n",
" \"\"\"\n",
" # TODO -- write this function, replacing the lines below\n",
" dl_dphi0 = 0.0\n",
" dl_dphi1 = 0.0\n",
"\n",
" # Return the gradient\n",
" return np.array([[dl_dphi0],[dl_dphi1]])"
],
"metadata": {
"id": "UpswmkL2qwBT"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
"\n",
"\\begin{eqnarray}\n",
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
"\\end{eqnarray}\n",
"\n",
"We can't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
],
"metadata": {
"id": "RS1nEcYVuEAM"
}
},
{
"cell_type": "code",
"source": [
"# Compute the gradient using your function (phi is the 2x1 column initialized earlier)\n",
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
"# Flatten the 2x1 gradient so that %f receives plain scalars; indexing gradient[0] would\n",
"# give a 1-element array, and NumPy has deprecated converting such arrays to floats\n",
"print(\"Your gradients: (%3.3f,%3.3f)\"%tuple(np.ravel(gradient)))\n",
"# Approximate the gradients with finite differences: nudge one parameter by delta,\n",
"# measure the change in loss, and divide by delta\n",
"delta = 0.0001\n",
"dl_dphi0_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[delta],[0]])) -\n",
"                compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
"dl_dphi1_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[0],[delta]])) -\n",
"                compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n",
"# There might be small differences in the last significant figure because finite differences are an approximation\n"
],
"metadata": {
"id": "QuwAHN7yt-gi"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Now we are ready to perform gradient descent. We'll need the line search routine from part I, which is reproduced here together with the helper function loss_function_1D that converts the 2D problem into a 1D problem."
],
"metadata": {
"id": "5EIjMM9Fw2eT"
}
},
{
"cell_type": "code",
"source": [
"def loss_function_1D(dist_prop, data, model, phi_start, gradient):\n",
"    \"\"\"Loss reached after moving a distance `dist_prop` along `gradient` from `phi_start`.\n",
"\n",
"    Turns the 2D search over phi into a 1D function of the distance moved. `gradient`\n",
"    is simply the direction of travel, whatever vector the caller supplies.\n",
"    \"\"\"\n",
"    phi_moved = phi_start + gradient * dist_prop\n",
"    return compute_loss(data[0,:], data[1,:], model, phi_moved)\n",
"\n",
"def line_search(data, model, phi, gradient, thresh=.00001, max_dist = 0.1, max_iter = 15, verbose=False):\n",
"    \"\"\"Bracketing search for the distance along `gradient` that gives the lowest loss.\n",
"\n",
"    data     -- training data, passed on to loss_function_1D\n",
"    model    -- model callable, passed on to loss_function_1D\n",
"    phi      -- starting parameters\n",
"    gradient -- direction to search along\n",
"    thresh   -- stop once the two interior points are within this distance of each other\n",
"    max_dist -- upper end of the initial search range\n",
"    max_iter -- cap on the number of iterations (guards against an infinite loop)\n",
"    verbose  -- print the four points and their losses on every iteration\n",
"\n",
"    Returns the midpoint of the two interior points when the search stops.\n",
"    \"\"\"\n",
"    # Four probe points spanning the search range: a <= b <= c <= d\n",
"    a, b, c, d = 0, 0.33 * max_dist, 0.66 * max_dist, 1.0 * max_dist\n",
"    n_iter = 0\n",
"\n",
"    # Keep narrowing until the interior points nearly coincide or the iteration cap is hit\n",
"    while np.abs(b-c) > thresh and n_iter < max_iter:\n",
"        n_iter += 1\n",
"        # Evaluate the loss at all four points\n",
"        lossa, lossb, lossc, lossd = (\n",
"            loss_function_1D(dist, data, model, phi, gradient) for dist in (a, b, c, d)\n",
"        )\n",
"\n",
"        if verbose:\n",
"            print('Iter %d, a=%3.3f, b=%3.3f, c=%3.3f, d=%3.3f'%(n_iter, a,b,c,d))\n",
"            print('a %f, b%f, c%f, d%f'%(lossa,lossb,lossc,lossd))\n",
"\n",
"        if np.argmin((lossa,lossb,lossc,lossd))==0:\n",
"            # Rule #1: point a has the lowest loss, so halve b, c and d\n",
"            b, c, d = b/2, c/2, d/2\n",
"        elif lossb < lossc:\n",
"            # Rule #2: b beats c, so d moves to c and b, c are re-spaced\n",
"            # at 1/3 and 2/3 of the way between a and the new d\n",
"            d = c\n",
"            b = a+ (d-a)/3\n",
"            c = a+ 2*(d-a)/3\n",
"        else:\n",
"            # Rule #3: otherwise a moves to b and b, c are re-spaced\n",
"            # at 1/3 and 2/3 of the way between the new a and d\n",
"            a = b\n",
"            b = a+ (d-a)/3\n",
"            c = a+ 2*(d-a)/3\n",
"\n",
"    # Return average of two middle points\n",
"    return (b+c)/2.0"
],
"metadata": {
"id": "XrJ2gQjfw1XP"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def gradient_descent_step(phi, data, model):\n",
" \"\"\"Take one gradient descent step from `phi` and return the updated parameters.\n",
"\n",
" Exercise stub: as written it returns `phi` unchanged.\n",
"\n",
" phi   -- current parameters\n",
" data  -- training data\n",
" model -- model callable\n",
" \"\"\"\n",
" # TODO -- update Phi with the gradient descent step (equation 6.3)\n",
" # 1. Compute the gradient\n",
" # 2. Find the best step size alpha (use negative gradient as going downhill)\n",
" # 3. Update the parameters phi\n",
"\n",
" return phi"
],
"metadata": {
"id": "YVq6rmaWRD2M"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Initialize the parameters; column k of phi_all stores the parameters after k steps\n",
"n_steps = 10\n",
"phi_all = np.zeros((2,n_steps+1))\n",
"phi_all[0,0] = 1.6\n",
"phi_all[1,0] = -0.5\n",
"\n",
"# Measure loss and draw initial model\n",
"loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,0:1])\n",
"draw_model(data,model,phi_all[:,0:1], \"Initial parameters, Loss = %f\"%(loss))\n",
"\n",
"# Repeatedly take gradient descent steps\n",
"for c_step in range(n_steps):\n",
"    # Do gradient descent step from the current column and store the result in the next one\n",
"    phi_all[:,c_step+1:c_step+2] = gradient_descent_step(phi_all[:,c_step:c_step+1],data, model)\n",
"    # Measure loss and draw model; the parameters are always passed as a 2x1 column slice,\n",
"    # the same shape that is used for the initial model above\n",
"    loss = compute_loss(data[0,:], data[1,:], model, phi_all[:,c_step+1:c_step+2])\n",
"    draw_model(data,model,phi_all[:,c_step+1:c_step+2], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
"\n",
"# Draw the trajectory on the loss function\n",
"draw_loss_function(compute_loss, data, model,phi_all)\n"
],
"metadata": {
"id": "tOLd0gtdRLLS"
},
"execution_count": null,
"outputs": []
}
]
}