udlbook/Notebooks/Chap08/8_3_Double_Descent.ipynb

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/udlbook/udlbook/blob/main/Notebooks/Chap08/8_3_Double_Descent.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# **Notebook 8.3: Double Descent**\n",
        "\n",
        "This notebook investigates double descent as described in section 8.4 of the book.\n",
        "\n",
        "It uses the MNIST-1D database which can be found at https://github.com/greydanus/mnist1d\n",
        "\n",
        "Work through the cells below, running each cell in turn. In various places you will see the words \"TO DO\". Follow the instructions at these places and make predictions about what is going to happen or write code to complete the functions.\n",
        "\n",
        "Contact me at udlbookmail@gmail.com if you find any mistakes or have any suggestions."
      ],
      "metadata": {
        "id": "L6chybAVFJW2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Run this if you're in a Colab to install MNIST 1D repository\n",
        "!pip install git+https://github.com/greydanus/mnist1d"
      ],
      "metadata": {
        "id": "fn9BP5N5TguP"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import torch, torch.nn as nn\n",
        "from torch.utils.data import TensorDataset, DataLoader\n",
        "from torch.optim.lr_scheduler import StepLR\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import mnist1d\n",
        "import random\n",
        "random.seed(0)\n",
        "\n",
        "# Try attaching to GPU -- Use \"Change Runtime Type to change to GPUT\"\n",
        "DEVICE = str(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))\n",
        "print('Using:', DEVICE)"
      ],
      "metadata": {
        "id": "hFxuHpRqTgri"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "args = mnist1d.data.get_dataset_args()\n",
        "args.num_samples = 8000\n",
        "args.train_split = 0.5\n",
        "args.corr_noise_scale = 0.25\n",
        "args.iid_noise_scale=2e-2\n",
        "data = mnist1d.data.get_dataset(args, path='./mnist1d_data.pkl', download=False, regenerate=True)\n",
        "\n",
        "# Add 15% noise to training labels\n",
        "for c_y in range(len(data['y'])):\n",
        "    random_number = random.random()\n",
        "    if random_number < 0.15 :\n",
        "        random_int = int(random.random() * 10)\n",
        "        data['y'][c_y] = random_int\n",
        "\n",
        "# The training and test input and outputs are in\n",
        "# data['x'], data['y'], data['x_test'], and data['y_test']\n",
        "print(\"Examples in training set: {}\".format(len(data['y'])))\n",
        "print(\"Examples in test set: {}\".format(len(data['y_test'])))\n",
        "print(\"Dimensionality of each example: {}\".format(data['x'].shape[-1]))"
      ],
      "metadata": {
        "id": "PW2gyXL5UkLU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Initialize the parameters with He initialization\n",
        "def weights_init(layer_in):\n",
        "  if isinstance(layer_in, nn.Linear):\n",
        "    nn.init.kaiming_uniform_(layer_in.weight)\n",
        "    layer_in.bias.data.fill_(0.0)\n",
        "\n",
        "# Return an initialized model with two hidden layers and n_hidden hidden units at each\n",
        "def get_model(n_hidden):\n",
        "\n",
        "  D_i = 40    # Input dimensions\n",
        "  D_k = n_hidden   # Hidden dimensions\n",
        "  D_o = 10    # Output dimensions\n",
        "\n",
        "  # Define a model with two hidden layers\n",
        "  # And ReLU activations between them\n",
        "  model = nn.Sequential(\n",
        "  nn.Linear(D_i, D_k),\n",
        "  nn.ReLU(),\n",
        "  nn.Linear(D_k, D_k),\n",
        "  nn.ReLU(),\n",
        "  nn.Linear(D_k, D_o))\n",
        "\n",
        "  # Call the function you just defined\n",
        "  model.apply(weights_init)\n",
        "\n",
        "  # Return the model\n",
        "  return model ;"
      ],
      "metadata": {
        "id": "hAIvZOAlTnk9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def fit_model(model, data, n_epoch):\n",
        "\n",
        "  # choose cross entropy loss function (equation 5.24)\n",
        "  loss_function = torch.nn.CrossEntropyLoss()\n",
        "  # construct SGD optimizer and initialize learning rate and momentum\n",
        "  # optimizer = torch.optim.Adam(model.parameters(), lr=0.01)\n",
        "  optimizer = torch.optim.SGD(model.parameters(), lr = 0.01, momentum=0.9)\n",
        "\n",
        "\n",
        "  x_train = torch.tensor(data['x'].astype('float32'))\n",
        "  y_train = torch.tensor(data['y'].transpose().astype('long'))\n",
        "  x_test= torch.tensor(data['x_test'].astype('float32'))\n",
        "  y_test = torch.tensor(data['y_test'].astype('long'))\n",
        "\n",
        "  # load the data into a class that creates the batches\n",
        "  data_loader = DataLoader(TensorDataset(x_train,y_train), batch_size=100, shuffle=True, worker_init_fn=np.random.seed(1))\n",
        "\n",
        "  for epoch in range(n_epoch):\n",
        "    # loop over batches\n",
        "    for i, batch in enumerate(data_loader):\n",
        "      # retrieve inputs and labels for this batch\n",
        "      x_batch, y_batch = batch\n",
        "      # zero the parameter gradients\n",
        "      optimizer.zero_grad()\n",
        "      # forward pass -- calculate model output\n",
        "      pred = model(x_batch)\n",
        "      # compute the loss\n",
        "      loss = loss_function(pred, y_batch)\n",
        "      # backward pass\n",
        "      loss.backward()\n",
        "      # SGD update\n",
        "      optimizer.step()\n",
        "\n",
        "    # Run whole dataset to get statistics -- normally wouldn't do this\n",
        "    pred_train = model(x_train)\n",
        "    pred_test = model(x_test)\n",
        "    _, predicted_train_class = torch.max(pred_train.data, 1)\n",
        "    _, predicted_test_class = torch.max(pred_test.data, 1)\n",
        "    errors_train = 100 - 100 * (predicted_train_class == y_train).float().sum() / len(y_train)\n",
        "    errors_test= 100 - 100 * (predicted_test_class == y_test).float().sum() / len(y_test)\n",
        "    losses_train = loss_function(pred_train, y_train).item()\n",
        "    losses_test= loss_function(pred_test, y_test).item()\n",
        "    if epoch%100 ==0 :\n",
        "      print(f'Epoch {epoch:5d}, train loss {losses_train:.6f}, train error {errors_train:3.2f},  test loss {losses_test:.6f}, test error {errors_test:3.2f}')\n",
        "\n",
        "  return errors_train, errors_test\n"
      ],
      "metadata": {
        "id": "AazlQhheWmHk"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def count_parameters(model):\n",
        "    return sum(p.numel() for p in model.parameters() if p.requires_grad)"
      ],
      "metadata": {
        "id": "AQNCmFNV6JpV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The following code produces the double descent curve by training the model with different numbers of hidden units and plotting the test error.\n",
        "\n",
        "TO DO:\n",
        "\n",
        "*Before* you run the code, and considering that there are 4000 training examples predict:<br>\n",
        "\n",
        "1.    At what capacity do you think the training error will become zero?\n",
        "2.   At what capacity do you expect the first minima of the double descent curve to appear?\n",
        "3. At what capacity do you expect the maximum of the double descent curve to appear?"
      ],
      "metadata": {
        "id": "IcP4UPMudxPS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# This code will take a while (~30 mins on GPU) to run!  Go and make a cup of coffee!\n",
        "\n",
        "hidden_variables = np.array([2,4,6,8,10,14,18,22,26,30,35,40,45,50,55,60,70,80,90,100,120,140,160,180,200,250,300,400]) ;\n",
        "\n",
        "errors_train_all = np.zeros_like(hidden_variables)\n",
        "errors_test_all = np.zeros_like(hidden_variables)\n",
        "total_weights_all = np.zeros_like(hidden_variables)\n",
        "\n",
        "# loop over the dataset n_epoch times\n",
        "n_epoch = 1000\n",
        "\n",
        "# For each hidden variable size\n",
        "for c_hidden in range(len(hidden_variables)):\n",
        "    print(f'Training model with {hidden_variables[c_hidden]:3d} hidden variables')\n",
        "    # Get a model\n",
        "    model = get_model(hidden_variables[c_hidden]) ;\n",
        "    # Count and store number of weights\n",
        "    total_weights_all[c_hidden] = count_parameters(model)\n",
        "    # Train the model\n",
        "    errors_train, errors_test = fit_model(model, data, n_epoch)\n",
        "    # Store the results\n",
        "    errors_train_all[c_hidden] = errors_train\n",
        "    errors_test_all[c_hidden]= errors_test\n",
        "\n"
      ],
      "metadata": {
        "id": "K4OmBZGHWXpk"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "\n",
        "# Assuming data['y'] is available and contains the training examples\n",
        "num_training_examples = len(data['y'])\n",
        "\n",
        "# Find the index where total_weights_all is closest to num_training_examples\n",
        "closest_index = np.argmin(np.abs(np.array(total_weights_all) - num_training_examples))\n",
        "\n",
        "# Get the corresponding value of hidden variables\n",
        "hidden_variable_at_num_training_examples = hidden_variables[closest_index]\n",
        "\n",
        "# Plot the results\n",
        "fig, ax = plt.subplots()\n",
        "ax.plot(hidden_variables, errors_train_all, 'r-', label='train')\n",
        "ax.plot(hidden_variables, errors_test_all, 'b-', label='test')\n",
        "\n",
        "# Add a vertical line at the point where total weights equal the number of training examples\n",
        "ax.axvline(x=hidden_variable_at_num_training_examples, color='g', linestyle='--', label='N(weights) = N(train)')\n",
        "\n",
        "ax.set_ylim(0, 100)\n",
        "ax.set_xlabel('No. hidden variables')\n",
        "ax.set_ylabel('Error')\n",
        "ax.legend()\n",
        "plt.show()\n"
      ],
      "metadata": {
        "id": "Rw-iRboTXbck"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "KT4X8_hE5NFb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "iGKZSfVF2r4z"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}