In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

filename = "house_train.csv"
# np.loadtxt cannot parse the CSV header, so load with pandas instead
df = pd.read_csv(filename)
data = df.values
X = data.T[1:17]   # columns 1..16: the 16 features
Y = data.T[17]     # column 17: the target

# scatter each retained feature against the target with a least-squares line,
# two features per figure; features 4, 5, 14, and 15 are skipped
for i in range(16):
    if i < 4 or 5 < i < 14:
        plt.subplot(1, 2, 2 - (i + 1) % 2)   # even i -> left panel, odd i -> right panel
        A = np.vstack([X[i], np.ones(len(X[i]))]).T
        w1, w0 = np.linalg.lstsq(A, Y, rcond=None)[0]
        t = np.linspace(np.min(X[i]), np.max(X[i]), 20)
        plt.plot(X[i], Y, '.')
        plt.plot(t, w1*t + w0, 'r')
        if (i + 1) % 2 == 0:
            plt.show()

# replot feature 8 on its own for a closer look
i = 8
A = np.vstack([X[i], np.ones(len(X[i]))]).T
w1, w0 = np.linalg.lstsq(A, Y, rcond=None)[0]
t = np.linspace(np.min(X[i]),np.max(X[i]),20)
plt.plot(X[i],Y,'.')
plt.plot(t, w1*t + w0, 'r')
plt.show()
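As a cross-check on np.linalg.lstsq, the same slope and intercept can be recovered from the normal equations. A minimal sketch, reusing A and Y from the cell above (so it checks the feature-8 fit):

In [ ]:
# cross-check: solve the normal equations (A^T A) w = A^T Y directly;
# the result should match [w1, w0] from lstsq up to numerical error
w = np.linalg.solve(A.T @ A, A.T @ Y)
print(w)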
In [116]:
# randomly shuffle the data and split into TRAIN and TEST sets
Ntrain = 1164
Ntest = 291
N = Ntrain + Ntest
np.random.shuffle(data)
Dx = data.T[1:17]
Dy = data.T[17]
X = Dx.T[0:Ntrain]
X_ = Dx.T[Ntrain:N]
Y = np.log(Dy.T[0:Ntrain])    # log-transform the target
Y_ = np.log(Dy.T[Ntrain:N])

# standardize (zero mean, unit variance) using TRAIN statistics only,
# so the test set is not allowed to influence the transformation
from sklearn import preprocessing
x_scaler = preprocessing.StandardScaler().fit(X)
X = x_scaler.transform(X)
X_ = x_scaler.transform(X_)
y_scaler = preprocessing.StandardScaler().fit(Y.reshape(-1, 1))
Y = y_scaler.transform(Y.reshape(-1, 1)).ravel()
Y_ = y_scaler.transform(Y_.reshape(-1, 1)).ravel()

# linear fit the data and plot the test y against the predicted y
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()   # data is already standardized; `normalize=` has been removed from sklearn
linreg.fit(X,Y)
Y_pred = linreg.predict(X_)
plt.plot(Y_,Y_pred,'.')
t = np.linspace(np.min(Y_),np.max(Y_),20)
plt.plot([t[0], t[-1]],[t[0], t[-1]],'k-')
plt.show()
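To put a number on this baseline, the held-out error can be computed explicitly. A minimal sketch using sklearn.metrics, with Y_ and Y_pred as in the cell above:

In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
# quantify the linear fit on the held-out set
print("test MSE:", mean_squared_error(Y_, Y_pred))
print("test R^2:", r2_score(Y_, Y_pred))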
In [155]:
# randomly shuffle the data and split into TRAIN and TEST sets
Ntrain = 1164
Ntest = 291
N = Ntrain + Ntest
np.random.shuffle(data)
Dx = data.T[1:17]
Dy = data.T[17]
X = Dx.T[0:Ntrain]
X_ = Dx.T[Ntrain:N]
Y = np.log(Dy.T[0:Ntrain])    # log-transform the target
Y_ = np.log(Dy.T[Ntrain:N])

# standardize using TRAIN statistics only, as above
from sklearn import preprocessing
x_scaler = preprocessing.StandardScaler().fit(X)
X = x_scaler.transform(X)
X_ = x_scaler.transform(X_)
y_scaler = preprocessing.StandardScaler().fit(Y.reshape(-1, 1))
Y = y_scaler.transform(Y.reshape(-1, 1)).ravel()
Y_ = y_scaler.transform(Y_.reshape(-1, 1)).ravel()

# run ridge regression over a log-spaced grid of regularization strengths
from sklearn.linear_model import Ridge
Ns = 1000
MSE_train = np.zeros(Ns)
MSE_test = np.zeros(Ns)
Intct = np.zeros(Ns)
Coef = np.zeros((Ns, 16))
for i in range(Ns):
    lambd = 10**(0.007*i - 3)       # log10(lambda) runs from -3 up to ~4
    ridgereg = Ridge(alpha=lambd)   # data already standardized; `normalize=` removed from sklearn
    # fit the model for this lambda
    ridgereg.fit(X, Y)
    # compute the predicted values for both train and test datasets
    Y_pred = ridgereg.predict(X_)
    Ypred = ridgereg.predict(X)

    Intct[i] = ridgereg.intercept_
    Coef[i] = ridgereg.coef_
    MSE_train[i] = np.mean((Y - Ypred)**2)
    MSE_test[i] = np.mean((Y_ - Y_pred)**2)

    
# plot the training error and the coefficient paths against log10(lambda)
plt.plot(0.007*np.arange(Ns)-3,MSE_train,'-b')
plt.show()
plt.plot(0.007*np.arange(Ns)-3,Coef)
plt.show()

# plot the training and test error (zoomed in on small lambda) and the coefficient values
I=400
plt.plot(0.007*np.arange(I)-3,MSE_test[0:I],'-r')
plt.plot(0.007*np.arange(I)-3,MSE_train[0:I],'-b')
plt.show()
plt.plot(0.007*np.arange(I)-3,Coef[0:I])
plt.show()

# plot y vs. prediction
plt.plot(Y_,Y_pred,'.')
t = np.linspace(np.min(Y_),np.max(Y_),20)
plt.plot([t[0], t[-1]],[t[0], t[-1]],'k-')
plt.show()
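One way to read a regularization strength off the sweep above is to take the lambda minimizing the held-out MSE (a noisy estimate from a single split, but a useful summary). A minimal sketch, assuming MSE_test and the 10**(0.007*i - 3) grid from this cell:

In [ ]:
# pick the lambda with the smallest held-out MSE from the sweep
i_best = np.argmin(MSE_test)
print("best lambda:", 10**(0.007*i_best - 3), " test MSE:", MSE_test[i_best])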

Piecewise linear fit: approximate a noisy quintic with clipped-ramp features and ridge regression.

In [158]:
# generate training data from a noisy quintic
import numpy as np
N = 100
x = np.sort(np.random.uniform(-1, 1, N))
x[0] = -1       # pin the endpoints so the fit spans the whole interval
x[N-1] = 1
y = (x-.95)*(x-.6)*(x-.25)*(x+.5)*(x+.9) + .03*np.random.randn(N)
# generate test data
N_ = 100
x_ = np.random.uniform(-1,1,N_)
y_ = (x_-.95)*(x_-.6)*(x_-.25)*(x_+.5)*(x_+.9) + .03*np.random.randn(N_)

t = np.linspace(-1,1,100)
y0 = (t-.95)*(t-.6)*(t-.25)*(t+.5)*(t+.9)

import matplotlib.pyplot as plt
plt.plot(x,y,'o')
plt.plot(t,y0,'g-')
plt.show()
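Since the noise injected above has standard deviation 0.03, no model can beat the noise floor on average; the attainable test MSE can be estimated by scoring the true quintic itself. A small sketch, reusing x_ and y_ from this cell:

In [ ]:
# MSE of the noise-free quintic on the test set: the attainable floor (~0.03**2)
y_true = (x_-.95)*(x_-.6)*(x_-.25)*(x_+.5)*(x_+.9)
print("noise floor (test MSE):", np.mean((y_ - y_true)**2))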
In [199]:
# feature engineering: clipped-ramp (hinge-like) features; each np.clip(x, c, 1)
# is flat below its knot c and linear above it, so a weighted sum of these
# columns is a piecewise linear function with kinks at the knots
knots = [-1, -0.75, -0.2, 0.4, 0.8]
A = np.vstack([np.ones(len(x))] + [np.clip(x, c, 1) for c in knots])
A_ = np.vstack([np.ones(len(x_))] + [np.clip(x_, c, 1) for c in knots])

# ridge regression fit with a tiny regularization strength
from sklearn.linear_model import Ridge
lambd = 0.000001
ridgereg = Ridge(alpha=lambd)   # `normalize=` has been removed from sklearn; at this tiny alpha the scaling barely matters
ridgereg.fit(A.T, y)
# compute the predicted values for both train and test datasets
y_pred = ridgereg.predict(A_.T)
ypred = ridgereg.predict(A.T)

Intct = ridgereg.intercept_
Coef = ridgereg.coef_
MSE_train = np.mean((y - ypred)**2)
MSE_test = np.mean((y_ - y_pred)**2)

plt.plot(x,y,'o')
plt.plot(x,ypred,'g-')
plt.show()

plt.plot(y_,y_pred,'.')
t = np.linspace(np.min(y_),np.max(y_),20)
plt.plot([t[0], t[-1]],[t[0], t[-1]],'k-')
plt.show()
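To see what the fit is combining, the clipped-ramp basis functions can be plotted directly; a small sketch reusing the knots list from the feature-engineering step above:

In [ ]:
# each basis function is flat below its knot and linear above it
tt = np.linspace(-1, 1, 200)
for c in knots:
    plt.plot(tt, np.clip(tt, c, 1), label="knot %.2f" % c)
plt.legend()
plt.show()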
In [231]:
# generate training data where there are more parameters (6) than data points (3)
import numpy as np
N = 3
x = np.array([-0.83, 0.72, 0.91])
y = (x-.95)*(x-.6)*(x-.25)*(x+.5)*(x+.9) + .03*np.random.randn(N)
N_ = 100
x_ = np.random.uniform(-1, 1, N_)
y_ = (x_-.95)*(x_-.6)*(x_-.25)*(x_+.5)*(x_+.9) + .03*np.random.randn(N_)

# feature engineering: same clipped-ramp basis as above
knots = [-1, -0.75, -0.2, 0.4, 0.8]
A = np.vstack([np.ones(len(x))] + [np.clip(x, c, 1) for c in knots])
A_ = np.vstack([np.ones(len(x_))] + [np.clip(x_, c, 1) for c in knots])
In [230]:
# ridge regression fit with a large regularization strength (heavy shrinkage)
from sklearn.linear_model import Ridge
lambd = 100
ridgereg = Ridge(alpha=lambd)   # `normalize=` has been removed from sklearn
ridgereg.fit(A.T, y)
# compute the predicted values for both train and test datasets
y_pred = ridgereg.predict(A_.T)
ypred = ridgereg.predict(A.T)

Intct = ridgereg.intercept_
Coef = ridgereg.coef_
MSE_train = np.mean((y - ypred)**2)
MSE_test = np.mean((y_ - y_pred)**2)

plt.plot(x,y,'o')
plt.plot(x,ypred,'g-')
plt.show()

plt.plot(y_,y_pred,'.')
t = np.linspace(np.min(y_),np.max(y_),20)
plt.plot([t[0], t[-1]],[t[0], t[-1]],'k-')
plt.show()
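With six features and only three training points the least-squares system is underdetermined, and as alpha goes to 0 ridge approaches a minimum-norm interpolant. A minimal sketch of that limit via the pseudoinverse, reusing A, A_, y, y_ from above (ridge's separate intercept handling makes the correspondence approximate):

In [ ]:
# minimum-norm least-squares fit: the small-alpha limit of ridge
w_mn = np.linalg.pinv(A.T) @ y
print("train MSE:", np.mean((y - A.T @ w_mn)**2))    # ~0: interpolates all 3 points
print("test  MSE:", np.mean((y_ - A_.T @ w_mn)**2))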
In [241]:
# generate training data where there are more parameters (6) than data points (5)
import numpy as np
N = 5
x = np.random.uniform(-1, 1, N)
y = (x-.95)*(x-.6)*(x-.25)*(x+.5)*(x+.9) + .03*np.random.randn(N)
N_ = 100
x_ = np.random.uniform(-1, 1, N_)
y_ = (x_-.95)*(x_-.6)*(x_-.25)*(x_+.5)*(x_+.9) + .03*np.random.randn(N_)

# feature engineering: same clipped-ramp basis as above
knots = [-1, -0.75, -0.2, 0.4, 0.8]
A = np.vstack([np.ones(len(x))] + [np.clip(x, c, 1) for c in knots])
A_ = np.vstack([np.ones(len(x_))] + [np.clip(x_, c, 1) for c in knots])

# run ridge regression over a log-spaced grid of regularization strengths
from sklearn.linear_model import Ridge
Ns = 1000
MSE_train = np.zeros(Ns)
MSE_test = np.zeros(Ns)
Intct = np.zeros(Ns)
Coef = np.zeros((Ns, 6))
for i in range(Ns):
    lambd = 10**(0.009*i - 3)       # log10(lambda) runs from -3 up to ~6
    ridgereg = Ridge(alpha=lambd)   # `normalize=` removed from sklearn
    # fit the model for this lambda
    ridgereg.fit(A.T, y)
    # compute the predicted values for both train and test datasets
    Y_pred = ridgereg.predict(A_.T)
    Ypred = ridgereg.predict(A.T)

    Intct[i] = ridgereg.intercept_
    Coef[i] = ridgereg.coef_
    MSE_train[i] = np.mean((y - Ypred)**2)
    MSE_test[i] = np.mean((y_ - Y_pred)**2)   # bug fix: average over the N_ test points, not N

    
# plot training error and the coefficient paths; the x-axis must use the
# same 0.009 step as the lambda grid above
plt.plot(0.009*np.arange(Ns)-3, MSE_train, '-b')
plt.show()
plt.plot(0.009*np.arange(Ns)-3, Coef)
plt.show()

# plot the training and test error (log scale, over the full lambda range) and the coefficient values
I = 1000
plt.plot(0.009*np.arange(I)-3, np.log(MSE_test[0:I]), '-r')
plt.plot(0.009*np.arange(I)-3, np.log(MSE_train[0:I]), '-b')
plt.show()
plt.plot(0.009*np.arange(I)-3, Coef[0:I])
plt.show()
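scikit-learn can also select lambda by leave-one-out cross-validation on the training points alone. A minimal sketch with RidgeCV over the same log-spaced grid (RidgeCV is a standard sklearn estimator, not part of the original sweep):

In [ ]:
from sklearn.linear_model import RidgeCV
# leave-one-out CV over the same grid as the manual sweep above
alphas = 10**(0.009*np.arange(Ns) - 3)
ridgecv = RidgeCV(alphas=alphas).fit(A.T, y)
print("CV-selected lambda:", ridgecv.alpha_)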