{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "64b72553",
   "metadata": {},
   "source": [
    "# K-fold Cross Validation Demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81344524",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Import libraries and generate the synthetic dataset\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(0)  # fixed seed so every Restart-&-Run-All reproduces the same numbers\n",
    "\n",
    "# Synthetic dataset: 1000-by-50 feature matrix X, 1000-element labels vector y\n",
    "X = np.random.random((1000, 50))\n",
    "y = np.random.random((1000,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8446a12e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Define helper functions: `fit` and `predict`\n",
    "def fit(Xin, Yin, lbda):\n",
    "    \"\"\"Fit a ridge-regression linear model.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    Xin : (n, d) feature matrix.\n",
    "    Yin : (n,) target vector.\n",
    "    lbda : non-negative ridge regularization strength.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    w : (d,) weight vector.\n",
    "    b : float intercept.\n",
    "    \"\"\"\n",
    "    # Center the features so the intercept has a closed form.\n",
    "    mu = np.mean(Xin, axis=0)\n",
    "    Xc = Xin - mu\n",
    "    # Ridge normal equations: (XᵀX + λI) w = Xᵀy.\n",
    "    # FIX: regularize the *diagonal* with lbda * I.  The previous code added the\n",
    "    # scalar lbda to every entry of the Gram matrix (λ·11ᵀ), which penalizes\n",
    "    # (Σ wᵢ)² rather than ‖w‖² and is not ridge regression.\n",
    "    d = Xin.shape[1]\n",
    "    w = np.linalg.solve(Xc.T @ Xc + lbda * np.eye(d), Xc.T @ Yin)\n",
    "    # Intercept so predictions on *uncentered* inputs are unbiased:\n",
    "    # Xw + b = (X - mu)w + mean(y).\n",
    "    b = np.mean(Yin) - np.dot(w, mu)\n",
    "    return w, b\n",
    "\n",
    "def predict(w, b, Xin):\n",
    "    \"\"\"Return predictions X·w + b for the (uncentered) input data.\"\"\"\n",
    "    return np.dot(Xin, w) + b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ad22a10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Split dataset into training/validation/test indices\n",
    "N_SAMPLES = X.shape[0]\n",
    "idx = np.random.permutation(N_SAMPLES)\n",
    "K_FOLD = 5  # number of folds for cross-validation\n",
    "\n",
    "# First 90% of the shuffled indices are used for cross-validation (NON_TEST),\n",
    "# the remaining 10% are held out as TEST.\n",
    "NON_TEST = idx[0 : 9 * N_SAMPLES // 10]\n",
    "N_PER_FOLD = len(NON_TEST) // K_FOLD\n",
    "TEST = idx[9 * N_SAMPLES // 10 :]\n",
    "\n",
    "# Candidate λ values and per-λ cross-validation error accumulators\n",
    "lbdas = [0.1, 0.2, 0.3]\n",
    "err = np.zeros(len(lbdas))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a0873bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. K-fold cross-validation loop: average validation MSE for each λ\n",
    "for lbda_idx, lbda in enumerate(lbdas):\n",
    "    for i in range(K_FOLD):\n",
    "        # Validation indices for the i-th fold\n",
    "        VAL = NON_TEST[i * N_PER_FOLD : (i + 1) * N_PER_FOLD]\n",
    "        # Remaining NON_TEST indices form the TRAIN set\n",
    "        TRAIN = np.concatenate((NON_TEST[: i * N_PER_FOLD], NON_TEST[(i + 1) * N_PER_FOLD :]))\n",
    "\n",
    "        # Extract data subsets\n",
    "        ytrain, Xtrain = y[TRAIN], X[TRAIN]\n",
    "        yval, Xval = y[VAL], X[VAL]\n",
    "\n",
    "        # Fit on the training folds, evaluate on the held-out fold\n",
    "        w, b = fit(Xtrain, ytrain, lbda)\n",
    "        yval_hat = predict(w, b, Xval)\n",
    "\n",
    "        # Accumulate MSE for this fold\n",
    "        err[lbda_idx] += np.mean((yval_hat - yval) ** 2)\n",
    "\n",
    "    # Average over the K folds\n",
    "    err[lbda_idx] /= K_FOLD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a4b20fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. Select the λ with the lowest CV error and retrain on all NON_TEST data\n",
    "lbda_best = lbdas[int(np.argmin(err))]\n",
    "print('Best choice of lambda =', lbda_best)\n",
    "\n",
    "# Aggregate all NON_TEST samples (TRAIN + VAL from every fold)\n",
    "Xtot = X[NON_TEST]\n",
    "ytot = y[NON_TEST]\n",
    "w, b = fit(Xtot, ytot, lbda_best)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6472256",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Evaluate on the training aggregate and the held-out test set\n",
    "ytest, Xtest = y[TEST], X[TEST]\n",
    "\n",
    "train_error = np.mean((predict(w, b, Xtot) - ytot) ** 2)\n",
    "test_error = np.mean((predict(w, b, Xtest) - ytest) ** 2)\n",
    "print('Train error =', train_error)\n",
    "print('Test error =', test_error)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}