We want to solve the linear regression problem $${\rm minimize}_w \;\|{\bf X}w-{\bf y}\|_2^2.$$ First, we generate input/output data examples.
There is a subtlety (mainly in notation) in how we handle the constant term.
Given data $x=(x[1],x[2],x[3],\cdots,x[d])$ in $d$ dimensions, one option is to use a linear predictor of the form $$f(x) = w^Tx = w_1x[1] + w_2x[2] + \cdots + w_dx[d].$$
Another option is to use an affine predictor, obtained by appending a one to the data, i.e., $x=(1,x[1],x[2],x[3],\cdots,x[d])$: $$f(x) = w^Tx = w_0 + w_1x[1] + \cdots + w_d x[d].$$
We will use these notations interchangeably, as it should be clear from the context which one we are using
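As a small illustration of the two conventions (a minimal sketch with made-up numbers, not part of the dataset used below):
import numpy as np
x_point = np.array([0.5, -1.2, 2.0])              # a data point in d=3 dimensions
w_lin = np.array([1.0, 0.0, -0.5])                # linear predictor: f(x) = w^T x
f_lin = np.dot(w_lin, x_point)
w_aff = np.array([0.3, 1.0, 0.0, -0.5])           # affine predictor: f(x) = w_0 + w_1 x[1] + ...
f_aff = np.dot(w_aff, np.concatenate(([1.0], x_point)))   # append a one to the data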
# generate training data
import numpy as np
n = 100 # sample size
x = np.random.uniform(-1,1,n) #sample input data
x = np.sort(x)
x[0]=-1 # pin the endpoints so the samples span the full interval [-1,1]
x[n-1]=1
# ground truth is a 5th-order polynomial, with Gaussian noise added
y = (x-.99)*(x-.4)*(x-.25)*(x+.6)*(x+.8) + .03*np.random.randn(n)
# plot the samples and the ground truth
t = np.linspace(-1,1,100)
y0 = (t-.99)*(t-.4)*(t-.25)*(t+.6)*(t+.8)
import matplotlib.pyplot as plt
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.show()
# generate test data
n_ = 100
x_ = np.random.uniform(-1,1,n_)
y_ = (x_-.99)*(x_-.4)*(x_-.25)*(x_+.6)*(x_+.8) + .03*np.random.randn(n_)
Next, we create the data matrix ${\bf X}$, whose $i$-th row is the data point $x_i$ transposed: $${\bf X} = \begin{bmatrix} (x_1)^T\\ (x_2)^T\\ \vdots \\ (x_n)^T \end{bmatrix}$$
We first ignore the input data entirely and fit a constant function $f(x)=w_1$.
To do this, we create a data matrix with $x_i=1$ for every sample; the data matrix is then a single column of ones:
# create a data matrix X
X = np.vstack([np.ones(len(x))]).T
display(X)
Recall that the least squares solution is \begin{eqnarray} \hat{w}_{\rm LS} &=& (X^TX)^{-1}X^T y\\ &=& n^{-1} 1^T y \\ &=& \frac{1}{n}\sum_{i=1}^n y_i \end{eqnarray}
(here $X^TX = n$ and $X^Ty = 1^Ty$ because $X$ is a single column of ones), which is just the average of the sample outcomes. The prediction is therefore $\hat{y} = \hat{w}_{\rm LS} = \frac{1}{n}\sum_{i=1}^n y_i$ for every data point $x$:
$$ \hat{y} = \begin{bmatrix} \hat{y}_1\\ \vdots\\ \hat{y}_n \end{bmatrix} = \begin{bmatrix} \hat{w}_{\rm LS}^Tx_1 \\ \vdots \\ \hat{w}_{\rm LS}^T x_n\end{bmatrix} = \hat{w}_{\rm LS} \begin{bmatrix}1\\ \vdots\\ 1\end{bmatrix}$$
Hence, the average is the best constant fit under squared loss,
and the MSE is the variance of the outcome:
$$ \frac1n \sum_{i=1}^n ({\rm ave}(y)-y_i)^2$$
# linear regression with constant feature
w = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh = np.matmul(X,w)  # predicted outputs, shape (n,); using w*X would give an (n,1) array that silently broadcasts to (n,n) in the MSE below
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(x, yh, 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh)**2)
display(MSE_train)
# MSE on fresh data not used in training
X_ = np.vstack([np.ones(len(x_))]).T
yh_ = np.matmul(X_,w)  # same fix as above: keep the predictions a length-n_ vector
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
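As a quick numerical sanity check of the two claims above (a minimal sketch reusing w, y, and MSE_train computed in this cell):
# the constant least-squares fit equals the sample mean of y
print(np.allclose(w[0], np.mean(y)))
# and its training MSE equals the (biased) sample variance of y
print(np.allclose(MSE_train, np.var(y)))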
For the affine fit, we use $(1,x_i)$ as the input data, and the data matrix ${\bf X}$ is
# create a data matrix X
X = np.vstack([np.ones(len(x)),x]).T
display(X)
The prediction has the form $$\hat{y} = w_1+w_2x.$$ This is called a straight-line fit; if $x$ is time, it is called a trend line.
# linear regression
p=1
w1 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh1= np.matmul(X,w1)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(x, yh1, 'g')
plt.plot(x, yh, 'g-.')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh1)**2)
# MSE on fresh data not used in training
X_ = np.vstack([np.ones(len(x_)),x_]).T
yh_= np.matmul(X_,w1)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.array([[p,MSE_train,MSE_test]])  # rows: (polynomial degree, train MSE, test MSE)
Polynomial functions are useful features: by Taylor expansion, any smooth function can be approximated by a polynomial of sufficiently large degree.
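As an aside (a sketch only; the helper name poly_features is our own and is not used in the cells below), the monomial features $1, x, x^2, \ldots, x^p$ can be stacked into a data matrix in one line:
def poly_features(u, p):
    # columns are u**0, u**1, ..., u**p, i.e. a Vandermonde matrix of shape (len(u), p+1)
    return np.vstack([u**i for i in range(p+1)]).T
# np.vander(u, p+1, increasing=True) produces the same matrix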
For a degree-2 polynomial, we use $(1,x,x^2)$ as the feature vector.
# create a data matrix X
p=2
X = np.vstack([np.ones(len(x)),x,x**2]).T
display(X)
# degree-2 polynomial linear regression
w2 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh2= np.matmul(X,w2)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(x, yh2, 'g')
plt.plot(x, yh1, 'g-.')
plt.plot(x, yh, 'g-.')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh2)**2)
# MSE on fresh data not used in training
X_ = np.vstack([np.ones(len(x_)),x_,x_**2]).T
yh_= np.matmul(X_,w2)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
For a degree-3 polynomial, we use the feature vector $(1,x,x^2,x^3)$.
# create a data matrix X
p=3
X = np.vstack([np.ones(len(x)),x,x**2,x**3]).T
# degree-p polynomial linear regression
w3 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh3= np.matmul(X,w3)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(x, yh3, 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh3)**2)
# MSE on fresh data not used in training
X_ = np.vstack([np.ones(len(x_)),x_,x_**2,x_**3]).T
yh_= np.matmul(X_,w3)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
# create a data matrix X
p=4
X = np.vstack([np.ones(len(x)),x,x**2,x**3,x**4]).T
# degree-p polynomial linear regression
w4 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh4= np.matmul(X,w4)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(t, np.matmul(np.vstack([np.ones(len(t)),t,t**2,t**3,t**4]).T,w4), 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh4)**2)
# MSE on fresh data not used in training
X_ = np.vstack([np.ones(len(x_)),x_,x_**2,x_**3,x_**4]).T
yh_= np.matmul(X_,w4)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
# create a data matrix X
p=5
X = np.vstack([np.ones(len(x)),x,x**2,x**3,x**4,x**5]).T
# degree-p polynomial linear regression
w5 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh5= np.matmul(X,w5)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(t, np.matmul(np.vstack([np.ones(len(t)),t,t**2,t**3,t**4,t**5]).T,w5), 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh5)**2)
# MSE on fresh data not used in training
X_ = np.vstack([np.ones(len(x_)),x_,x_**2,x_**3,x_**4,x_**5]).T
yh_= np.matmul(X_,w5)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
# create a data matrix X
p=10
X = np.vstack([x**i for i in range(p+1)]).T
# degree-p polynomial linear regression
w10 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh10= np.matmul(X,w10)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(t, np.matmul(np.vstack([t**i for i in range(p+1)]).T, w10), 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh10)**2)
# MSE on fresh data not used in training
X_ = np.vstack([x_**i for i in range(p+1)]).T
yh_= np.matmul(X_,w10)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
# create a data matrix X
p=15
X = np.vstack([x**i for i in range(p+1)]).T
# degree-p polynomial linear regression
w15 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh15= np.matmul(X,w15)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(t, np.matmul(np.vstack([t**i for i in range(p+1)]).T, w15), 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh15)**2)
# MSE on fresh data not used in training
X_ = np.vstack([x_**i for i in range(p+1)]).T
yh_= np.matmul(X_,w15)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
# create a data matrix X
p=20
X = np.vstack([x**i for i in range(p+1)]).T
# degree-p polynomial linear regression
w20 = np.matmul(np.matmul(np.linalg.inv(np.matmul(X.T,X)),X.T),y)
yh20= np.matmul(X,w20)
plt.plot(x,y,'o')
plt.plot(t,y0,'k-')
plt.plot(t, np.matmul(np.vstack([t**i for i in range(p+1)]).T, w20), 'g')
plt.show()
MSE_train = (1/float(n))*np.sum((y - yh20)**2)
# MSE on fresh data not used in training
X_ = np.vstack([x_**i for i in range(p+1)]).T
yh_= np.matmul(X_,w20)
MSE_test = (1/float(n_))*np.sum((y_ - yh_)**2)
MSE = np.append(MSE,[[p,MSE_train,MSE_test]],axis=0)
Feature engineering requires domain knowledge about which features are good for the application at hand.
plt.plot(MSE.T[0],MSE.T[1],'o-',label='train error')
plt.plot(MSE.T[0],MSE.T[2],'o-',label='test error')
plt.legend()
plt.show()
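One practical caveat, independent of this particular dataset: for higher-degree polynomial features the matrix $X^TX$ becomes very ill-conditioned, so forming and inverting it explicitly (as the cells above do) can be numerically unstable. np.linalg.lstsq solves the same least-squares problem through a more stable factorization. A minimal sketch, assuming X and y still hold the degree-20 training data from the last fit above:
w20_ls, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)
print(np.linalg.cond(np.matmul(X.T, X)))                  # condition number of X^T X (large means ill-conditioned)
print((1/float(n))*np.sum((y - np.matmul(X, w20_ls))**2)) # training MSE of the more stable solution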