Add files via upload
This commit is contained in:
@@ -1,32 +1,22 @@
|
|||||||
{
|
{
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"provenance": [],
|
|
||||||
"include_colab_link": true
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"name": "python3",
|
|
||||||
"display_name": "Python 3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "view-in-github",
|
"colab_type": "text",
|
||||||
"colab_type": "text"
|
"id": "view-in-github"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap06/6_2_Gradient_Descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap06/6_2_Gradient_Descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "el8l05WQEO46"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# **Notebook 6.2 Gradient descent**\n",
|
"# **Notebook 6.2 Gradient descent**\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -36,10 +26,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions.\n",
|
"Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions.\n",
|
||||||
"\n"
|
"\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "el8l05WQEO46"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
@@ -58,34 +45,39 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "4cRkrh9MZ58Z"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Let's create our training data 12 pairs {x_i, y_i}\n",
|
"# Let's create our training data 12 pairs {x_i, y_i}\n",
|
||||||
"# We'll try to fit the straight line model to these data\n",
|
"# We'll try to fit the straight line model to these data\n",
|
||||||
"data = np.array([[0.03,0.19,0.34,0.46,0.78,0.81,1.08,1.18,1.39,1.60,1.65,1.90],\n",
|
"data = np.array([[0.03,0.19,0.34,0.46,0.78,0.81,1.08,1.18,1.39,1.60,1.65,1.90],\n",
|
||||||
" [0.67,0.85,1.05,1.00,1.40,1.50,1.30,1.54,1.55,1.68,1.73,1.60]])"
|
" [0.67,0.85,1.05,1.00,1.40,1.50,1.30,1.54,1.55,1.68,1.73,1.60]])"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "4cRkrh9MZ58Z"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "WQUERmb2erAe"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Let's define our model -- just a straight line with intercept phi[0] and slope phi[1]\n",
|
"# Let's define our model -- just a straight line with intercept phi[0] and slope phi[1]\n",
|
||||||
"def model(phi,x):\n",
|
"def model(phi,x):\n",
|
||||||
" y_pred = phi[0]+phi[1] * x\n",
|
" y_pred = phi[0]+phi[1] * x\n",
|
||||||
" return y_pred"
|
" return y_pred"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "WQUERmb2erAe"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "qFRe9POHF2le"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Draw model\n",
|
"# Draw model\n",
|
||||||
"def draw_model(data,model,phi,title=None):\n",
|
"def draw_model(data,model,phi,title=None):\n",
|
||||||
@@ -101,39 +93,40 @@
|
|||||||
" if title is not None:\n",
|
" if title is not None:\n",
|
||||||
" ax.set_title(title)\n",
|
" ax.set_title(title)\n",
|
||||||
" plt.show()"
|
" plt.show()"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "qFRe9POHF2le"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "TXx1Tpd1Tl-I"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Initialize the parameters to some arbitrary values and draw the model\n",
|
"# Initialize the parameters to some arbitrary values and draw the model\n",
|
||||||
"phi = np.zeros((2,1))\n",
|
"phi = np.zeros((2,1))\n",
|
||||||
"phi[0] = 0.6 # Intercept\n",
|
"phi[0] = 0.6 # Intercept\n",
|
||||||
"phi[1] = -0.2 # Slope\n",
|
"phi[1] = -0.2 # Slope\n",
|
||||||
"draw_model(data,model,phi, \"Initial parameters\")\n"
|
"draw_model(data,model,phi, \"Initial parameters\")\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "TXx1Tpd1Tl-I"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Now lets create compute the sum of squares loss for the training data"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "QU5mdGvpTtEG"
|
"id": "QU5mdGvpTtEG"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Now lets create compute the sum of squares loss for the training data"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "I7dqTY2Gg7CR"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def compute_loss(data_x, data_y, model, phi):\n",
|
"def compute_loss(data_x, data_y, model, phi):\n",
|
||||||
" # TODO -- Write this function -- replace the line below\n",
|
" # TODO -- Write this function -- replace the line below\n",
|
||||||
@@ -144,45 +137,47 @@
|
|||||||
" loss = 0\n",
|
" loss = 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return loss"
|
" return loss"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "I7dqTY2Gg7CR"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Let's just test that we got that right"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "eB5DQvU5hYNx"
|
"id": "eB5DQvU5hYNx"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Let's just test that we got that right"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n",
|
|
||||||
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 12.367))"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "Ty05UtEEg9tc"
|
"id": "Ty05UtEEg9tc"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n",
|
||||||
|
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 12.367))"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Now let's plot the whole loss function"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "F3trnavPiHpH"
|
"id": "F3trnavPiHpH"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Now let's plot the whole loss function"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "K-NTHpAAHlCl"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
|
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
|
||||||
" # Define pretty colormap\n",
|
" # Define pretty colormap\n",
|
||||||
@@ -209,39 +204,40 @@
|
|||||||
" ax.set_ylim([1,-1])\n",
|
" ax.set_ylim([1,-1])\n",
|
||||||
" ax.set_xlabel('Intercept $\\phi_{0}$'); ax.set_ylabel('Slope, $\\phi_{1}$')\n",
|
" ax.set_xlabel('Intercept $\\phi_{0}$'); ax.set_ylabel('Slope, $\\phi_{1}$')\n",
|
||||||
" plt.show()"
|
" plt.show()"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "K-NTHpAAHlCl"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"draw_loss_function(compute_loss, data, model)"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "l8HbvIupnTME"
|
"id": "l8HbvIupnTME"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"draw_loss_function(compute_loss, data, model)"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "s9Duf05WqqSC"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"Now let's compute the gradient vector for a given set of parameters:\n",
|
"Now let's compute the gradient vector for a given set of parameters:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\\begin{equation}\n",
|
"\\begin{equation}\n",
|
||||||
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
|
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
|
||||||
"\\end{equation}"
|
"\\end{equation}"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "s9Duf05WqqSC"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "UpswmkL2qwBT"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# These are in the lecture slides and notes, but worth trying to calculate them yourself to\n",
|
"# These are in the lecture slides and notes, but worth trying to calculate them yourself to\n",
|
||||||
"# check that you get them right. Write out the expression for the sum of squares loss and take the\n",
|
"# check that you get them right. Write out the expression for the sum of squares loss and take the\n",
|
||||||
@@ -253,31 +249,32 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" # Return the gradient\n",
|
" # Return the gradient\n",
|
||||||
" return np.array([[dl_dphi0],[dl_dphi1]])"
|
" return np.array([[dl_dphi0],[dl_dphi1]])"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "UpswmkL2qwBT"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "RS1nEcYVuEAM"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
|
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\\begin{eqnarray}\n",
|
"\\begin{align}\n",
|
||||||
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
|
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
|
||||||
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
|
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
|
||||||
"\\end{eqnarray}\n",
|
"\\end{align}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"We can't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
|
"We can't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "RS1nEcYVuEAM"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "QuwAHN7yt-gi"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Compute the gradient using your function\n",
|
"# Compute the gradient using your function\n",
|
||||||
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
||||||
@@ -290,24 +287,25 @@
|
|||||||
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
||||||
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n",
|
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n",
|
||||||
"# There might be small differences in the last significant figure because finite gradients is an approximation\n"
|
"# There might be small differences in the last significant figure because finite gradients is an approximation\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "QuwAHN7yt-gi"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Now we are ready to perform gradient descent. We'll need to use our line search routine from notebook 6.1, which I've reproduced here plus the helper function loss_function_1D that maps the search along the negative gradient direction in 2D space to a 1D problem (distance along this direction)"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "5EIjMM9Fw2eT"
|
"id": "5EIjMM9Fw2eT"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Now we are ready to perform gradient descent. We'll need to use our line search routine from notebook 6.1, which I've reproduced here plus the helper function loss_function_1D that maps the search along the negative gradient direction in 2D space to a 1D problem (distance along this direction)"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "XrJ2gQjfw1XP"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def loss_function_1D(dist_prop, data, model, phi_start, search_direction):\n",
|
"def loss_function_1D(dist_prop, data, model, phi_start, search_direction):\n",
|
||||||
" # Return the loss after moving this far\n",
|
" # Return the loss after moving this far\n",
|
||||||
@@ -362,15 +360,15 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" # Return average of two middle points\n",
|
" # Return average of two middle points\n",
|
||||||
" return (b+c)/2.0"
|
" return (b+c)/2.0"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "XrJ2gQjfw1XP"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "YVq6rmaWRD2M"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def gradient_descent_step(phi, data, model):\n",
|
"def gradient_descent_step(phi, data, model):\n",
|
||||||
" # TODO -- update Phi with the gradient descent step (equation 6.3)\n",
|
" # TODO -- update Phi with the gradient descent step (equation 6.3)\n",
|
||||||
@@ -379,15 +377,15 @@
|
|||||||
" # 3. Update the parameters phi based on the gradient and the step size alpha.\n",
|
" # 3. Update the parameters phi based on the gradient and the step size alpha.\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return phi"
|
" return phi"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "YVq6rmaWRD2M"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "tOLd0gtdRLLS"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Initialize the parameters and draw the model\n",
|
"# Initialize the parameters and draw the model\n",
|
||||||
"n_steps = 10\n",
|
"n_steps = 10\n",
|
||||||
@@ -409,12 +407,22 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# Draw the trajectory on the loss function\n",
|
"# Draw the trajectory on the loss function\n",
|
||||||
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "tOLd0gtdRLLS"
|
"colab": {
|
||||||
|
"include_colab_link": true,
|
||||||
|
"provenance": []
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"kernelspec": {
|
||||||
"outputs": []
|
"display_name": "Python 3",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
}
|
}
|
||||||
]
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,33 +1,22 @@
|
|||||||
{
|
{
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 0,
|
|
||||||
"metadata": {
|
|
||||||
"colab": {
|
|
||||||
"provenance": [],
|
|
||||||
"authorship_tag": "ABX9TyNk5FN4qlw3pk8BwDVWw1jN",
|
|
||||||
"include_colab_link": true
|
|
||||||
},
|
|
||||||
"kernelspec": {
|
|
||||||
"name": "python3",
|
|
||||||
"display_name": "Python 3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"name": "python"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "view-in-github",
|
"colab_type": "text",
|
||||||
"colab_type": "text"
|
"id": "view-in-github"
|
||||||
},
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap06/6_3_Stochastic_Gradient_Descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
"<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap06/6_3_Stochastic_Gradient_Descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "el8l05WQEO46"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"# **Notebook 6.3: Stochastic gradient descent**\n",
|
"# **Notebook 6.3: Stochastic gradient descent**\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -39,10 +28,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n"
|
"\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "el8l05WQEO46"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
@@ -61,6 +47,11 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "4cRkrh9MZ58Z"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Let's create our training data 30 pairs {x_i, y_i}\n",
|
"# Let's create our training data 30 pairs {x_i, y_i}\n",
|
||||||
"# We'll try to fit the Gabor model to these data\n",
|
"# We'll try to fit the Gabor model to these data\n",
|
||||||
@@ -74,15 +65,15 @@
|
|||||||
" -2.365e-02,5.098e-01,-2.777e-01,3.367e-01,1.927e-01,-2.222e-01,\n",
|
" -2.365e-02,5.098e-01,-2.777e-01,3.367e-01,1.927e-01,-2.222e-01,\n",
|
||||||
" 6.352e-02,6.888e-03,3.224e-02,1.091e-02,-5.706e-01,-5.258e-02,\n",
|
" 6.352e-02,6.888e-03,3.224e-02,1.091e-02,-5.706e-01,-5.258e-02,\n",
|
||||||
" -3.666e-02,1.709e-01,-4.805e-02,2.008e-01,-1.904e-01,5.952e-01]])"
|
" -3.666e-02,1.709e-01,-4.805e-02,2.008e-01,-1.904e-01,5.952e-01]])"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "4cRkrh9MZ58Z"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "WQUERmb2erAe"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Let's define our model\n",
|
"# Let's define our model\n",
|
||||||
"def model(phi,x):\n",
|
"def model(phi,x):\n",
|
||||||
@@ -90,15 +81,15 @@
|
|||||||
" gauss_component = np.exp(-(phi[0] + 0.06 * phi[1] * x) * (phi[0] + 0.06 * phi[1] * x) / 32)\n",
|
" gauss_component = np.exp(-(phi[0] + 0.06 * phi[1] * x) * (phi[0] + 0.06 * phi[1] * x) / 32)\n",
|
||||||
" y_pred= sin_component * gauss_component\n",
|
" y_pred= sin_component * gauss_component\n",
|
||||||
" return y_pred"
|
" return y_pred"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "WQUERmb2erAe"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "qFRe9POHF2le"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Draw model\n",
|
"# Draw model\n",
|
||||||
"def draw_model(data,model,phi,title=None):\n",
|
"def draw_model(data,model,phi,title=None):\n",
|
||||||
@@ -113,39 +104,40 @@
|
|||||||
" if title is not None:\n",
|
" if title is not None:\n",
|
||||||
" ax.set_title(title)\n",
|
" ax.set_title(title)\n",
|
||||||
" plt.show()"
|
" plt.show()"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "qFRe9POHF2le"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "TXx1Tpd1Tl-I"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Initialize the parameters and draw the model\n",
|
"# Initialize the parameters and draw the model\n",
|
||||||
"phi = np.zeros((2,1))\n",
|
"phi = np.zeros((2,1))\n",
|
||||||
"phi[0] = -5 # Horizontal offset\n",
|
"phi[0] = -5 # Horizontal offset\n",
|
||||||
"phi[1] = 25 # Frequency\n",
|
"phi[1] = 25 # Frequency\n",
|
||||||
"draw_model(data,model,phi, \"Initial parameters\")\n"
|
"draw_model(data,model,phi, \"Initial parameters\")\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "TXx1Tpd1Tl-I"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Now lets create compute the sum of squares loss for the training data"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "QU5mdGvpTtEG"
|
"id": "QU5mdGvpTtEG"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Now lets create compute the sum of squares loss for the training data"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "I7dqTY2Gg7CR"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def compute_loss(data_x, data_y, model, phi):\n",
|
"def compute_loss(data_x, data_y, model, phi):\n",
|
||||||
" # TODO -- Write this function -- replace the line below\n",
|
" # TODO -- Write this function -- replace the line below\n",
|
||||||
@@ -155,45 +147,47 @@
|
|||||||
" loss = 0\n",
|
" loss = 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return loss"
|
" return loss"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "I7dqTY2Gg7CR"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Let's just test that we got that right"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "eB5DQvU5hYNx"
|
"id": "eB5DQvU5hYNx"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Let's just test that we got that right"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n",
|
|
||||||
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 16.419))"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "Ty05UtEEg9tc"
|
"id": "Ty05UtEEg9tc"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"loss = compute_loss(data[0,:],data[1,:],model,np.array([[0.6],[-0.2]]))\n",
|
||||||
|
"print('Your loss = %3.3f, Correct loss = %3.3f'%(loss, 16.419))"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Now let's plot the whole loss function"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "F3trnavPiHpH"
|
"id": "F3trnavPiHpH"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Now let's plot the whole loss function"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "K-NTHpAAHlCl"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
|
"def draw_loss_function(compute_loss, data, model, phi_iters = None):\n",
|
||||||
" # Define pretty colormap\n",
|
" # Define pretty colormap\n",
|
||||||
@@ -220,39 +214,40 @@
|
|||||||
" ax.set_ylim([2.5,22.5])\n",
|
" ax.set_ylim([2.5,22.5])\n",
|
||||||
" ax.set_xlabel('Offset $\\phi_{0}$'); ax.set_ylabel('Frequency, $\\phi_{1}$')\n",
|
" ax.set_xlabel('Offset $\\phi_{0}$'); ax.set_ylabel('Frequency, $\\phi_{1}$')\n",
|
||||||
" plt.show()"
|
" plt.show()"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "K-NTHpAAHlCl"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"draw_loss_function(compute_loss, data, model)"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "l8HbvIupnTME"
|
"id": "l8HbvIupnTME"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"draw_loss_function(compute_loss, data, model)"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "s9Duf05WqqSC"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"Now let's compute the gradient vector for a given set of parameters:\n",
|
"Now let's compute the gradient vector for a given set of parameters:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\\begin{equation}\n",
|
"\\begin{equation}\n",
|
||||||
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
|
"\\frac{\\partial L}{\\partial \\boldsymbol\\phi} = \\begin{bmatrix}\\frac{\\partial L}{\\partial \\phi_0} \\\\\\frac{\\partial L}{\\partial \\phi_1} \\end{bmatrix}.\n",
|
||||||
"\\end{equation}"
|
"\\end{equation}"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "s9Duf05WqqSC"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "UpswmkL2qwBT"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# These came from writing out the expression for the sum of squares loss and taking the\n",
|
"# These came from writing out the expression for the sum of squares loss and taking the\n",
|
||||||
"# derivative with respect to phi0 and phi1. It was a lot of hassle to get it right!\n",
|
"# derivative with respect to phi0 and phi1. It was a lot of hassle to get it right!\n",
|
||||||
@@ -281,31 +276,32 @@
|
|||||||
" dl_dphi1 = gabor_deriv_phi1(data_x, data_y, phi[0],phi[1])\n",
|
" dl_dphi1 = gabor_deriv_phi1(data_x, data_y, phi[0],phi[1])\n",
|
||||||
" # Return the gradient\n",
|
" # Return the gradient\n",
|
||||||
" return np.array([[dl_dphi0],[dl_dphi1]])"
|
" return np.array([[dl_dphi0],[dl_dphi1]])"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "UpswmkL2qwBT"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "RS1nEcYVuEAM"
|
||||||
|
},
|
||||||
"source": [
|
"source": [
|
||||||
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
|
"We can check we got this right using a trick known as **finite differences**. If we evaluate the function and then change one of the parameters by a very small amount and normalize by that amount, we get an approximation to the gradient, so:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\\begin{eqnarray}\n",
|
"\\begin{align}\n",
|
||||||
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
|
"\\frac{\\partial L}{\\partial \\phi_{0}}&\\approx & \\frac{L[\\phi_0+\\delta, \\phi_1]-L[\\phi_0, \\phi_1]}{\\delta}\\\\\n",
|
||||||
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
|
"\\frac{\\partial L}{\\partial \\phi_{1}}&\\approx & \\frac{L[\\phi_0, \\phi_1+\\delta]-L[\\phi_0, \\phi_1]}{\\delta}\n",
|
||||||
"\\end{eqnarray}\n",
|
"\\end{align}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"We can't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
|
"We can't do this when there are many parameters; for a million parameters, we would have to evaluate the loss function two million times, and usually computing the gradients directly is much more efficient."
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "RS1nEcYVuEAM"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "QuwAHN7yt-gi"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Compute the gradient using your function\n",
|
"# Compute the gradient using your function\n",
|
||||||
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
"gradient = compute_gradient(data[0,:],data[1,:], phi)\n",
|
||||||
@@ -317,24 +313,25 @@
|
|||||||
"dl_dphi1_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[0],[delta]])) - \\\n",
|
"dl_dphi1_est = (compute_loss(data[0,:],data[1,:],model,phi+np.array([[0],[delta]])) - \\\n",
|
||||||
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
" compute_loss(data[0,:],data[1,:],model,phi))/delta\n",
|
||||||
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n"
|
"print(\"Approx gradients: (%3.3f,%3.3f)\"%(dl_dphi0_est,dl_dphi1_est))\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "QuwAHN7yt-gi"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
"attachments": {},
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"source": [
|
|
||||||
"Now we are ready to perform gradient descent. We'll need to use our line search routine from Notebook 6.1, which I've reproduced here plus the helper function loss_function_1D that converts from a 2D problem to a 1D problem"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "5EIjMM9Fw2eT"
|
"id": "5EIjMM9Fw2eT"
|
||||||
}
|
},
|
||||||
|
"source": [
|
||||||
|
"Now we are ready to perform gradient descent. We'll need our line search routine from Notebook 6.1, which I've reproduced here, plus the helper function `loss_function_1D`, which converts the 2D problem into a 1D problem along the search direction."
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "XrJ2gQjfw1XP"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def loss_function_1D(dist_prop, data, model, phi_start, gradient):\n",
|
"def loss_function_1D(dist_prop, data, model, phi_start, gradient):\n",
|
||||||
" # Return the loss after moving this far\n",
|
" # Return the loss after moving this far\n",
|
||||||
@@ -389,15 +386,15 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" # Return average of two middle points\n",
|
" # Return average of two middle points\n",
|
||||||
" return (b+c)/2.0"
|
" return (b+c)/2.0"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "XrJ2gQjfw1XP"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "YVq6rmaWRD2M"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def gradient_descent_step(phi, data, model):\n",
|
"def gradient_descent_step(phi, data, model):\n",
|
||||||
" # Step 1: Compute the gradient\n",
|
" # Step 1: Compute the gradient\n",
|
||||||
@@ -406,15 +403,15 @@
|
|||||||
" alpha = line_search(data, model, phi, gradient*-1, max_dist = 2.0)\n",
|
" alpha = line_search(data, model, phi, gradient*-1, max_dist = 2.0)\n",
|
||||||
" phi = phi - alpha * gradient\n",
|
" phi = phi - alpha * gradient\n",
|
||||||
" return phi"
|
" return phi"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "YVq6rmaWRD2M"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "tOLd0gtdRLLS"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Initialize the parameters\n",
|
"# Initialize the parameters\n",
|
||||||
"n_steps = 21\n",
|
"n_steps = 21\n",
|
||||||
@@ -435,41 +432,41 @@
|
|||||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "tOLd0gtdRLLS"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"# TODO Experiment with starting the optimization in the previous cell in different places\n",
|
|
||||||
"# and show that it heads to a local minimum if we don't start it in the right valley"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "Oi8ZlH0ptLqA"
|
"id": "Oi8ZlH0ptLqA"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"# TODO Experiment with starting the optimization in the previous cell in different places\n",
|
||||||
|
"# and show that it heads to a local minimum if we don't start it in the right valley"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "4l-ueLk-oAxV"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def gradient_descent_step_fixed_learning_rate(phi, data, alpha):\n",
|
"def gradient_descent_step_fixed_learning_rate(phi, data, alpha):\n",
|
||||||
" # TODO -- fill in this routine so that we take a fixed size step of size alpha without using line search\n",
|
" # TODO -- fill in this routine so that we take a fixed size step of size alpha without using line search\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return phi"
|
" return phi"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "4l-ueLk-oAxV"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "oi9MX_GRpM41"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Initialize the parameters\n",
|
"# Initialize the parameters\n",
|
||||||
"n_steps = 21\n",
|
"n_steps = 21\n",
|
||||||
@@ -490,28 +487,28 @@
|
|||||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
"draw_loss_function(compute_loss, data, model,phi_all)\n"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "oi9MX_GRpM41"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "In6sQ5YCpMqn"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# TODO Experiment with the learning rate, alpha.\n",
|
"# TODO Experiment with the learning rate, alpha.\n",
|
||||||
"# What happens if you set it too large?\n",
|
"# What happens if you set it too large?\n",
|
||||||
"# What happens if you set it too small?"
|
"# What happens if you set it too small?"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "In6sQ5YCpMqn"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "VKTC9-1Gpm3N"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def stochastic_gradient_descent_step(phi, data, alpha, batch_size):\n",
|
"def stochastic_gradient_descent_step(phi, data, alpha, batch_size):\n",
|
||||||
" # TODO -- fill in this routine so that we take a fixed size step of size alpha but only using a subset (batch) of the data\n",
|
" # TODO -- fill in this routine so that we take a fixed size step of size alpha but only using a subset (batch) of the data\n",
|
||||||
@@ -522,15 +519,15 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return phi"
|
" return phi"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "VKTC9-1Gpm3N"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "469OP_UHskJ4"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Set the random number generator so you always get same numbers (disable if you don't want this)\n",
|
"# Set the random number generator so you always get same numbers (disable if you don't want this)\n",
|
||||||
"np.random.seed(1)\n",
|
"np.random.seed(1)\n",
|
||||||
@@ -553,34 +550,45 @@
|
|||||||
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
" draw_model(data,model,phi_all[:,c_step+1], \"Iteration %d, loss = %f\"%(c_step+1,loss))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"draw_loss_function(compute_loss, data, model,phi_all)"
|
"draw_loss_function(compute_loss, data, model,phi_all)"
|
||||||
],
|
]
|
||||||
"metadata": {
|
|
||||||
"id": "469OP_UHskJ4"
|
|
||||||
},
|
|
||||||
"execution_count": null,
|
|
||||||
"outputs": []
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"# TODO -- Experiment with different learning rates, starting points, batch sizes, number of steps. Get a feel for this."
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "LxE2kTa3s29p"
|
"id": "LxE2kTa3s29p"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
|
"# TODO -- Experiment with different learning rates, starting points, batch sizes, number of steps. Get a feel for this."
|
||||||
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"source": [
|
"execution_count": null,
|
||||||
"# TODO -- Add a learning rate schedule. Reduce the learning rate by a factor of beta every M iterations"
|
|
||||||
],
|
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"id": "lw4QPOaQTh5e"
|
"id": "lw4QPOaQTh5e"
|
||||||
},
|
},
|
||||||
"execution_count": null,
|
"outputs": [],
|
||||||
"outputs": []
|
"source": [
|
||||||
}
|
"# TODO -- Add a learning rate schedule. Reduce the learning rate by a factor of beta every M iterations"
|
||||||
]
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"authorship_tag": "ABX9TyNk5FN4qlw3pk8BwDVWw1jN",
|
||||||
|
"include_colab_link": true,
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user