We consider a degree-5 polynomial model (the ground truth) with additive Gaussian noise.
# generate training data
import numpy as np

def f_true(u):
    """Ground-truth degree-5 polynomial (roots at .99, .4, .25, -.6, -.8)."""
    return (u - .99) * (u - .4) * (u - .25) * (u + .6) * (u + .8)

n = 40  # sample size
x = np.random.uniform(-1, 1, n)  # sample input data
x = np.sort(x)
# pin the extremes so the sample spans the whole interval [-1, 1]
x[0] = -1
x[n - 1] = 1
# ground truth is a 5th-order polynomial; add Gaussian noise (std 0.03)
y = f_true(x) + .03 * np.random.randn(n)
# generate test data
n_ = 100
x_ = np.random.uniform(-1, 1, n_)
y_ = f_true(x_) + .03 * np.random.randn(n_)
# plot the samples and ground truth
t = np.linspace(-1, 1, 100)  # dense grid for plotting fitted curves
y0 = f_true(t)
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(9, 8), dpi=80, facecolor='w', edgecolor='k')
plt.plot(x, y, 'o', label='train dataset')
plt.plot(x_, y_, 'ro', label='test dataset')
plt.plot(t, y0, 'k-')
plt.legend()
plt.show()
In typical scenarios, we only have one set of samples, which we split into a training set $S_{\rm train}$ and a test set $S_{\rm test}$.
However, in order to understand how the test error behaves (theoretically), we consider the expected test error, and call it true error: i.e. ${\cal L}_{\rm true} = {\mathbb E}[{\cal L}_{\rm test}]$
In order to compute the true error, we simulate a process where we get many fresh samples, and train a new predictor each time with the fresh set of samples. It is important to understand that the resulting predictor $f_{S_{\rm train}}(\cdot)$ is a random function, where the randomness comes from the fresh random training data set $S_{\rm train}$. We will draw many such random functions, plot them, and see how the test, train, and true errors behave.
num_runs = 100

# per-run predictions on the dense grid t, one row per run
yhat_simple = np.zeros((num_runs, len(t)))      # degree 3 (too simple)
yhat_complex = np.zeros((num_runs, len(t)))     # degree 15 (too complex)
yhat_justright = np.zeros((num_runs, len(t)))   # degree 5 (matches the truth)

def _design(u, degree):
    """Polynomial design matrix [1, u, u^2, ..., u^degree], columns low -> high."""
    return np.vander(u, degree + 1, increasing=True)

def _ls_fit(x_tr, y_tr, degree):
    """Least-squares polynomial fit; returns coefficients, constant term first.

    Uses lstsq instead of the original inv(X.T X) X.T y: same minimizer,
    but numerically stable even when the design matrix is ill-conditioned
    (which it is for degree 15 on 40 points).
    """
    w, *_ = np.linalg.lstsq(_design(x_tr, degree), y_tr, rcond=None)
    return w

for run in range(num_runs):
    # fresh training sample each run, same recipe as the first cell
    n = 40  # sample size
    x = np.sort(np.random.uniform(-1, 1, n))  # sort added for consistency with the first cell
    x[0] = -1
    x[n - 1] = 1
    y = (x - .99) * (x - .4) * (x - .25) * (x + .6) * (x + .8) + .03 * np.random.randn(n)

    # degree-3 polynomial linear regression
    w = _ls_fit(x, y, 3)
    yhat_simple[run] = _design(t, 3) @ w
    yh = _design(x, 3) @ w     # train predictions (kept for later cells)
    y_h = _design(x_, 3) @ w   # test predictions (kept for later cells)
    w_ = w

    # degree-5 polynomial linear regression
    w = _ls_fit(x, y, 5)
    yhat_justright[run] = _design(t, 5) @ w
    yh = _design(x, 5) @ w
    y_h = _design(x_, 5) @ w
    w_ = w

    # degree-15 polynomial linear regression
    w = _ls_fit(x, y, 15)
    yhat_complex[run] = _design(t, 15) @ w
    yh1 = _design(x, 15) @ w
    y_h1 = _design(x_, 15) @ w
import matplotlib.pyplot as plt
import seaborn as sns
def build_plot(yhat, title):
    """Plot each run's fitted curve, the ground truth, and the average fit.

    yhat : (num_runs, len(t)) array; row i holds run i's predictions on the
           module-level grid t. Also uses the module-level ground truth y0.
    title : string placed above the axes.
    """
    # individual fits; only one curve carries the legend label so the legend
    # shows a single 'Individual fits' entry
    for row in yhat[1:]:
        plt.plot(t, row, 'g-', alpha=0.3)
    plt.plot(t, yhat[0], 'g-', alpha=0.3, label='Individual fits')
    plt.plot(t, y0, 'k-', label='Truth')
    # BUG FIX: the original summed rows 1..num_runs-1 but divided by num_runs,
    # silently dropping run 0 from the average; plot the true mean instead.
    plt.plot(t, yhat.mean(axis=0), 'r-', label='Average fit')
    plt.legend()
    plt.title(title)
    plt.gca().set_ylim([-0.4, 0.2])
# Side-by-side comparison of the three model complexities.
fig = plt.figure(figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
panels = [
    (yhat_simple, 'Degree 3'),
    (yhat_justright, 'Degree 5'),
    (yhat_complex, 'Degree 15'),
]
for panel_idx, (preds, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 3, panel_idx)
    build_plot(preds, panel_title)
plt.show()