{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "64b72553",
   "metadata": {},
   "source": [
    "# K-fold Cross Validation Demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81344524",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Import libraries and generate the synthetic dataset\n",
    "import numpy as np\n",
    "\n",
    "np.random.seed(0)  # fixed seed so every Restart-&-Run-All reproduces the same numbers\n",
    "\n",
    "# Synthetic dataset: 1000-by-50 feature matrix X, 1000-element labels vector y\n",
    "X = np.random.random((1000, 50))\n",
    "y = np.random.random((1000,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8446a12e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Define helper functions: `fit` and `predict`\n",
    "def fit(Xin, Yin, lbda):\n",
    "    \"\"\"Fit a ridge-regression linear model.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    Xin : (n, d) feature matrix.\n",
    "    Yin : (n,) target vector.\n",
    "    lbda : non-negative ridge regularization strength.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    w : (d,) weight vector.\n",
    "    b : float intercept.\n",
    "    \"\"\"\n",
    "    # Center the features so the intercept has a closed form.\n",
    "    mu = np.mean(Xin, axis=0)\n",
    "    Xc = Xin - mu\n",
    "    # Ridge normal equations: (XᵀX + λI) w = Xᵀy.\n",
    "    # FIX: regularize the *diagonal* with lbda * I.  The previous code added the\n",
    "    # scalar lbda to every entry of the Gram matrix (λ·11ᵀ), which penalizes\n",
    "    # (Σ wᵢ)² rather than ‖w‖² and is not ridge regression.\n",
    "    d = Xin.shape[1]\n",
    "    w = np.linalg.solve(Xc.T @ Xc + lbda * np.eye(d), Xc.T @ Yin)\n",
    "    # Intercept so predictions on *uncentered* inputs are unbiased:\n",
    "    # Xw + b = (X - mu)w + mean(y).\n",
    "    b = np.mean(Yin) - np.dot(w, mu)\n",
    "    return w, b\n",
    "\n",
    "def predict(w, b, Xin):\n",
    "    \"\"\"Return predictions X·w + b for the (uncentered) input data.\"\"\"\n",
    "    return np.dot(Xin, w) + b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ad22a10",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Split dataset into training/validation/test indices\n",
    "N_SAMPLES = X.shape[0]\n",
    "idx = np.random.permutation(N_SAMPLES)\n",
    "K_FOLD = 5  # number of folds for cross-validation\n",
    "\n",
    "# First 90% of the shuffled indices are used for cross-validation (NON_TEST),\n",
    "# the remaining 10% are held out as TEST.\n",
    "NON_TEST = idx[0 : 9 * N_SAMPLES // 10]\n",
    "N_PER_FOLD = len(NON_TEST) // K_FOLD\n",
    "TEST = idx[9 * N_SAMPLES // 10 :]\n",
    "\n",
    "# Candidate λ values and per-λ cross-validation error accumulators\n",
    "lbdas = [0.1, 0.2, 0.3]\n",
    "err = np.zeros(len(lbdas))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a0873bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. K-fold cross-validation loop: average validation MSE for each λ\n",
    "for lbda_idx, lbda in enumerate(lbdas):\n",
    "    for i in range(K_FOLD):\n",
    "        # Validation indices for the i-th fold\n",
    "        VAL = NON_TEST[i * N_PER_FOLD : (i + 1) * N_PER_FOLD]\n",
    "        # Remaining NON_TEST indices form the TRAIN set\n",
    "        TRAIN = np.concatenate((NON_TEST[: i * N_PER_FOLD], NON_TEST[(i + 1) * N_PER_FOLD :]))\n",
    "\n",
    "        # Extract data subsets\n",
    "        ytrain, Xtrain = y[TRAIN], X[TRAIN]\n",
    "        yval, Xval = y[VAL], X[VAL]\n",
    "\n",
    "        # Fit on the training folds, evaluate on the held-out fold\n",
    "        w, b = fit(Xtrain, ytrain, lbda)\n",
    "        yval_hat = predict(w, b, Xval)\n",
    "\n",
    "        # Accumulate MSE for this fold\n",
    "        err[lbda_idx] += np.mean((yval_hat - yval) ** 2)\n",
    "\n",
    "    # Average over the K folds\n",
    "    err[lbda_idx] /= K_FOLD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a4b20fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. Select the λ with the lowest CV error and retrain on all NON_TEST data\n",
    "lbda_best = lbdas[int(np.argmin(err))]\n",
    "print('Best choice of lambda =', lbda_best)\n",
    "\n",
    "# Aggregate all NON_TEST samples (TRAIN + VAL from every fold)\n",
    "Xtot = X[NON_TEST]\n",
    "ytot = y[NON_TEST]\n",
    "w, b = fit(Xtot, ytot, lbda_best)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6472256",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Evaluate on the training aggregate and the held-out test set\n",
    "ytest, Xtest = y[TEST], X[TEST]\n",
    "\n",
    "train_error = np.mean((predict(w, b, Xtot) - ytot) ** 2)\n",
    "test_error = np.mean((predict(w, b, Xtest) - ytest) ** 2)\n",
    "print('Train error =', train_error)\n",
    "print('Test error =', test_error)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}