# Code to estimate bias/variance for varying model complexities
from __future__ import division

import numpy as np
import matplotlib.pyplot as plt

WEIGHTS = np.array([10, 0, -1, 0, 0.01])
SIGMA = 12
BOUNDS = 12
NUM_DATASETS = 12


# Expands a vector of x's using our polynomial basis
def expand_poly_basis(xs, order):
    X = np.ones((xs.size, order + 1))
    for i in range(order):
        X[:, i + 1] = X[:, i] * xs
    return X


# Evaluates a polynomial on all points in the vector 'xs'.
# Can evaluate multiple polynomials if 'weights' has multiple columns
def eval_poly(xs, weights):
    order = weights.shape[0] - 1
    X = expand_poly_basis(xs, order)
    return X.dot(weights)


# Generates 'num' sets of target data using Gaussian noise
def gen_random_targets(xs, weights, sigma, num):
    noise = np.random.normal(0, sigma, (num, xs.size))
    return (eval_poly(xs, weights) + noise).T


# Plots a polynomial given its coefficients
def plot_poly(weights, bounds, color='k--', lw=1, label=None):
    xs = np.linspace(-bounds, bounds)
    ys = eval_poly(xs, weights)
    plt.plot(xs, ys, color, lw=lw, label=label)


# Solves linear regression problems for a number of data/target pairs.
# Assumes multiple sets of target data for one set of 'xs'
def solve(xs, Y, order):
    expanded = expand_poly_basis(xs, order)
    weights, _, _, _ = np.linalg.lstsq(expanded, Y, rcond=None)
    return weights


if __name__ == '__main__':
    # Default 'macosx' backend is really annoying
    plt.switch_backend('tkagg')

    xs_train = np.linspace(-BOUNDS, BOUNDS, BOUNDS + 1)
    xs_test = np.linspace(-BOUNDS + 1, BOUNDS - 1, BOUNDS)

    # First, illustrate the process: draw a few random datasets and fit a
    # polynomial to each one, showing each fit in its own figure
    Y_train = gen_random_targets(xs_train, WEIGHTS, SIGMA, 4)
    W = solve(xs_train, Y_train, 2)
    colors = ['b', 'g', 'c', 'm']
    for i in range(4):
        plot_poly(WEIGHTS, BOUNDS + 1, 'k', 2, 'Ground Truth')
        plot_poly(W[:, i], BOUNDS + 1, colors[i], 2, 'Learned Function')
        plt.scatter(xs_train, Y_train[:, i], c=colors[i], s=40)
        plt.legend()
        plt.xlim(-BOUNDS - 1, BOUNDS + 1)
        plt.ylim(-50, 250)
        plt.get_current_fig_manager().full_screen_toggle()
        plt.show()

    # Average the learned weights together to get w_bar; the bias/variance
    # will be computed at the test points marked with dashed vertical lines
    w_bar = W.mean(axis=1)
    plot_poly(WEIGHTS, BOUNDS + 1, 'k', 2, 'Ground Truth')
    for x in xs_test:
        plt.axvline(x=x, ls='--', c='k')
    for i in range(4):
        plot_poly(W[:, i], BOUNDS + 1, colors[i],
                  label='Learned Function' if i == 0 else None)
        plt.scatter(xs_train, Y_train[:, i], c=colors[i], s=40)
    plot_poly(w_bar, BOUNDS + 2, 'r', 4, 'Average Function')
    plt.legend()
    plt.xlim(-BOUNDS - 1, BOUNDS + 1)
    plt.ylim(-50, 250)
    plt.get_current_fig_manager().full_screen_toggle()
    plt.show()

    # Now repeat the process for increasing model complexity, using many
    # random datasets (no longer distinguished by color)
    Y_train = gen_random_targets(xs_train, WEIGHTS, SIGMA, NUM_DATASETS)
    y_true = eval_poly(xs_test, WEIGHTS)
    orders = np.arange(1, BOUNDS)
    biases = np.empty(orders.size)
    variances = np.empty(orders.size)
    for i in range(orders.size):
        # Solve the linear regression for each dataset
        W = solve(xs_train, Y_train, orders[i])
        w_bar = W.mean(axis=1)

        # Compute the squared bias and the variance, averaged over the test points
        Y_test = eval_poly(xs_test, W)
        y_bar = eval_poly(xs_test, w_bar)
        resid = y_true - y_bar
        biases[i] = np.mean(resid**2)
        resid = Y_test - y_bar[:, np.newaxis]
        variances[i] = np.mean(resid**2)

        # Plot the learned functions and their average
        plt.figure(figsize=(10, 6))
        plt.subplot(1, 2, 1)
        plt.title("Order = {}".format(orders[i]))
        plot_poly(WEIGHTS, BOUNDS + 1, 'k', 4, 'Ground Truth')
        plot_poly(w_bar, BOUNDS + 1, 'r', 4, 'Average')
        for j in range(NUM_DATASETS):
            plt.scatter(xs_train, Y_train[:, j], c='k', s=40)
            plot_poly(W[:, j], BOUNDS + 1)
        plt.legend()
        plt.xlim(-BOUNDS - 1, BOUNDS + 1)
        plt.ylim(-50, 250)

        # Plot the bias/variance estimates computed so far
        plt.subplot(1, 2, 2)
        plt.title("Bias/Variance")
        plt.plot(orders[:i + 1], biases[:i + 1], 'b-', lw=2, label='Bias')
        plt.plot(orders[:i + 1], variances[:i + 1], 'r-', lw=2, label='Variance')
        plt.scatter(orders[:i + 1], biases[:i + 1], c='b', s=30)
        plt.scatter(orders[:i + 1], variances[:i + 1], c='r', s=30)
        plt.legend()
        plt.xlim(0, orders[-1] + 1)
        plt.ylim(0, 600)
        plt.get_current_fig_manager().full_screen_toggle()
        plt.show()
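
    # Optional sanity check (a sketch, assuming the loop variables above are
    # still in scope for the last order fit): because eval_poly is linear in
    # the weights, the average prediction y_bar is exactly the mean of the
    # individual predictions, so squared bias plus variance should match the
    # mean squared error of the individual fits against the true function
    # (up to floating-point precision).
    total_err = np.mean((Y_test - y_true[:, np.newaxis])**2)
    print("bias^2 + variance = {:.2f}".format(biases[-1] + variances[-1]))
    print("mean squared error vs. truth = {:.2f}".format(total_err))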