# K-fold Cross Validation Demo

In [None]:
# 1. Import libraries and generate the synthetic dataset
import numpy as np

# Synthetic data: a 1000-by-50 feature matrix X and a length-1000 label
# vector y (1-D, one label per row of X), all entries uniform on [0, 1).
X = np.random.random(size=(1000, 50))
y = np.random.random(size=X.shape[0])

In [None]:
# 2. Define helper functions: `fit` and `predict`
def fit(Xin, Yin, lbda):
    """Fit a ridge-regularized linear model with an unpenalized intercept.

    The features are centered column-wise and the weights are obtained from
    the regularized normal equations ``(Xc^T Xc + lbda * I) w = Xc^T Yin``,
    where ``Xc`` is the centered ``Xin``. The penalty is placed on the
    diagonal only; adding the bare scalar to the Gram matrix would broadcast
    it onto every entry, which is not ridge regression and does not repair a
    rank-deficient Gram matrix.

    Args:
        Xin: 2-D array-like of shape (n_samples, n_features).
        Yin: 1-D array-like of n_samples targets.
        lbda: Scalar regularization strength applied to the weights.

    Returns:
        Tuple ``(w, b)``: ``w`` is the weight vector of length n_features and
        ``b`` is the scalar intercept, chosen so that the model predicts
        ``mean(Yin)`` at the feature-wise mean of ``Xin``.

    Raises:
        numpy.linalg.LinAlgError: If the regularized Gram matrix is singular
            (for example ``lbda == 0`` with collinear features).
    """
    # Feature-wise mean used for centering; centering keeps the intercept
    # out of the penalized system.
    mu = np.mean(Xin, axis=0)
    centered = Xin - mu

    # Regularized Gram matrix: lbda goes on the diagonal only.
    gram = np.dot(centered.T, centered)
    regularized = gram + lbda * np.eye(gram.shape[0])

    # Solve (Xc^T Xc + lbda * I) w = Xc^T y for w.
    w = np.linalg.solve(regularized, np.dot(centered.T, Yin))

    # Undo the centering: shift so predictions on raw inputs are unbiased.
    b = np.mean(Yin) - np.dot(w, mu)
    return w, b

def predict(w, b, Xin):
    """Evaluate the linear model ``Xin . w + b`` on the given inputs.

    Args:
        w: Weight vector as returned by ``fit``.
        b: Intercept as returned by ``fit``.
        Xin: Input rows to score.

    Returns:
        The dot product of ``Xin`` with ``w``, shifted by ``b``.
    """
    linear_term = np.dot(Xin, w)
    return linear_term + b

In [None]:
# 3. Split dataset into training/validation/test indices
# Shuffle the row indices once; every subset below is a slice of this
# permutation, so the subsets never overlap.
N_SAMPLES = len(X)
idx = np.random.permutation(N_SAMPLES)

# First 90% of the shuffled indices form the cross-validation pool
# (NON_TEST); the last 10% are held out as TEST.
NON_TEST, TEST = idx[: 9 * N_SAMPLES // 10], idx[9 * N_SAMPLES // 10 :]

# Cross-validation layout: number of folds and samples per fold.
K_FOLD = 5
N_PER_FOLD = NON_TEST.size // K_FOLD

# Candidate regularization strengths and one error slot per candidate.
lbdas = [0.1, 0.2, 0.3]
err = np.zeros(len(lbdas))

In [None]:
# 4. 5‑fold cross‑validation loop (follows the script exactly)
# K-fold cross-validation: for each candidate lambda, fit on K_FOLD - 1 folds
# of NON_TEST, score on the remaining fold, and store the mean validation MSE.
for lbda_idx, lbda in enumerate(lbdas):
    # Accumulate in a local so that re-running this cell starts from zero
    # instead of adding onto the average already stored in `err`.
    fold_mse_sum = 0.0

    for i in range(K_FOLD):
        # Validation indices for the i-th fold
        VAL = NON_TEST[i * N_PER_FOLD : (i + 1) * N_PER_FOLD]
        # Everything in NON_TEST outside that slice forms the TRAIN set
        TRAIN = np.concatenate(
            (NON_TEST[: i * N_PER_FOLD], NON_TEST[(i + 1) * N_PER_FOLD :])
        )

        # Extract data subsets
        ytrain, Xtrain = y[TRAIN], X[TRAIN]
        yval, Xval = y[VAL], X[VAL]

        # Fit on the training folds, predict on the validation fold
        w, b = fit(Xtrain, ytrain, lbda)
        yval_hat = predict(w, b, Xval)

        # Add this fold's validation MSE
        fold_mse_sum += np.mean((yval_hat - yval) ** 2)

    # Average over the K folds, once per lambda after all folds have run
    err[lbda_idx] = fold_mse_sum / K_FOLD

In [None]:
# 5. Select best λ and retrain on full cross‑validation data
# Pick the candidate whose cross-validated error is smallest.
lbda_best = lbdas[int(err.argmin())]
print('Best choice of lambda =', lbda_best)

# Refit with the chosen lambda on every NON_TEST sample, i.e. the union of
# the TRAIN and VAL indices used during cross-validation.
Xtot, ytot = X[NON_TEST], y[NON_TEST]
w, b = fit(Xtot, ytot, lbda_best)

In [None]:
# 6. Evaluate on training aggregate and held‑out test set
# Held-out rows that were never used for fitting or for choosing lambda.
Xtest = X[TEST]
ytest = y[TEST]

# Mean squared error of the final model on the data it was fitted on and on
# the held-out test set.
train_error = ((predict(w, b, Xtot) - ytot) ** 2).mean()
test_error = ((predict(w, b, Xtest) - ytest) ** 2).mean()
print('Train error =', train_error)
print('Test error =', test_error)