Model Evaluation
In this lesson, we'll learn how to evaluate the quality of a machine learning model. By the end of this lesson, students will be able to:
- Apply get_dummies to represent categorical features as one or more dummy variables.
- Apply train_test_split to randomly split a dataset into a training set and test set.
- Evaluate machine learning models in terms of overfit and underfit.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
Dummy variables
Last time, we tried to predict the price of a home from all the other variables. However, we learned that since the city column contains categorical string values, we can't use it as a feature in a regression algorithm.
homes = pd.read_csv("homes.csv")
homes
| | beds | bath | price | year_built | sqft | price_per_sqft | elevation | city |
---|---|---|---|---|---|---|---|---|
0 | 2.0 | 1.0 | 999000 | 1960 | 1000 | 999 | 10 | NY |
1 | 2.0 | 2.0 | 2750000 | 2006 | 1418 | 1939 | 0 | NY |
2 | 2.0 | 2.0 | 1350000 | 1900 | 2150 | 628 | 9 | NY |
3 | 1.0 | 1.0 | 629000 | 1903 | 500 | 1258 | 9 | NY |
4 | 0.0 | 1.0 | 439000 | 1930 | 500 | 878 | 10 | NY |
... | ... | ... | ... | ... | ... | ... | ... | ... |
487 | 5.0 | 2.5 | 1800000 | 1890 | 3073 | 586 | 76 | SF |
488 | 2.0 | 1.0 | 695000 | 1923 | 1045 | 665 | 106 | SF |
489 | 3.0 | 2.0 | 1650000 | 1922 | 1483 | 1113 | 106 | SF |
490 | 1.0 | 1.0 | 649000 | 1983 | 850 | 764 | 163 | SF |
491 | 3.0 | 2.0 | 995000 | 1956 | 1305 | 762 | 216 | SF |
492 rows × 8 columns
This problem not only occurs when using DecisionTreeRegressor, but also when using LinearRegression, since we can't multiply a categorical string value by a numeric slope value. Let's learn how to represent these categorical features as one or more dummy variables.
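Before transforming the full dataset below, here is a minimal sketch of what get_dummies does to the city column alone (assuming the homes DataFrame loaded above):
# Each unique value in the "city" column becomes its own indicator column
pd.get_dummies(homes[["city"]]).head()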
def model_parameters(reg, columns):
"""Returns a string with the linear regression model parameters for the given column names."""
slopes = [f"{coef:.2f}({columns[i]})" for i, coef in enumerate(reg.coef_)]
return " + ".join([f"{reg.intercept_:.2f}"] + slopes)
# pd.get_dummies(...)
X = pd.get_dummies(homes.drop("price", axis=1))
y = homes["price"]
reg = LinearRegression().fit(X, y)
print("Model:", model_parameters(reg, X.columns))
print("Error:", mean_squared_error(y, reg.predict(X)))
Model: 2667451.86 + -91657.73(beds) + -46672.61(bath) + -2986.99(year_built) + 1611.73(sqft) + 2563.08(price_per_sqft) + -1081.63(elevation) + -173696.89(city_NY) + 173696.89(city_SF) Error: 976495129525.2129
X
| | beds | bath | year_built | sqft | price_per_sqft | elevation | city_NY | city_SF |
---|---|---|---|---|---|---|---|---|
0 | 2.0 | 1.0 | 1960 | 1000 | 999 | 10 | True | False |
1 | 2.0 | 2.0 | 2006 | 1418 | 1939 | 0 | True | False |
2 | 2.0 | 2.0 | 1900 | 2150 | 628 | 9 | True | False |
3 | 1.0 | 1.0 | 1903 | 500 | 1258 | 9 | True | False |
4 | 0.0 | 1.0 | 1930 | 500 | 878 | 10 | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... |
487 | 5.0 | 2.5 | 1890 | 3073 | 586 | 76 | False | True |
488 | 2.0 | 1.0 | 1923 | 1045 | 665 | 106 | False | True |
489 | 3.0 | 2.0 | 1922 | 1483 | 1113 | 106 | False | True |
490 | 1.0 | 1.0 | 1983 | 850 | 764 | 163 | False | True |
491 | 3.0 | 2.0 | 1956 | 1305 | 762 | 216 | False | True |
492 rows × 8 columns
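Notice that city_NY is always the opposite of city_SF, which is why their fitted coefficients above are equal and opposite. As an optional sketch (drop_first is a real get_dummies parameter, and X_alt is just an illustrative name), we could drop one dummy per categorical column to avoid this redundancy:
# drop_first=True keeps only city_SF; city_NY is implied when city_SF is False
X_alt = pd.get_dummies(homes.drop("price", axis=1), drop_first=True)
print(list(X_alt.columns))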
Overfitting
In our introduction to machine learning, we explained that the researchers who calibrated the PurpleAir Sensor (PAS) measurements against the EPA Air Quality Sensor (AQS) measurements ultimately decided to use the model that included only the PAS and humidity features (variables), ignoring the opportunity to use temperature and dew point even though a model that includes all features produced a lower overall mean squared error.
sensor_data = pd.read_csv("sensor_data.csv")
sensor_data
| | AQS | temp | humidity | dew | PAS |
---|---|---|---|---|---|
0 | 6.7 | 18.027263 | 38.564815 | 3.629662 | 8.616954 |
1 | 3.8 | 16.115280 | 49.404315 | 5.442318 | 3.493916 |
2 | 4.0 | 19.897634 | 29.972222 | 1.734051 | 3.799601 |
3 | 4.7 | 21.378334 | 32.474513 | 4.165624 | 4.369691 |
4 | 3.2 | 18.443822 | 43.898226 | 5.867611 | 3.191071 |
... | ... | ... | ... | ... | ... |
12092 | 5.5 | -12.101337 | 54.188889 | -19.555834 | 2.386120 |
12093 | 16.8 | 4.159967 | 56.256030 | -3.870659 | 32.444987 |
12094 | 15.6 | 1.707895 | 65.779221 | -4.083768 | 25.297018 |
12095 | 14.0 | -14.380144 | 48.206481 | -23.015378 | 8.213208 |
12096 | 5.8 | 5.081813 | 52.200000 | -4.016401 | 9.436011 |
12097 rows × 5 columns
X = sensor_data.drop("AQS", axis=1)
y = sensor_data["AQS"]
reg = LinearRegression().fit(X, y)
print("Model:", model_parameters(reg, X.columns))
# squared=False for RMSE: root mean squared error
# Take the mean squared error, then square root the result
print("Error:", mean_squared_error(y, reg.predict(X), squared=False))
Model: 7.11 + -0.03(temp) + -0.08(humidity) + 0.03(dew) + 0.43(PAS) Error: 2.267367483110072
In fact, the authors (Barkjohn et al. 2021) tested the impact of incrementally introducing features to the model to determine which combination of features could provide the most useful model, not necessarily the most accurate one. In their research, they aimed to predict the PAS measurement from the AQS measurement (the opposite of our task), and they included interactions between features indicated by the × symbol. For each grouping of models with a certain number of features, they highlighted the model with the lowest root mean squared error (RMSE, or the square root of the MSE) using an asterisk (*).
The model at the bottom of the table that includes all the features also has the lowest RMSE loss. But the loss value alone is not a convincing measure: adding more features into a model not only leads to a model that is harder to explain, but also increases the possibility of overfitting.
A model is considered overfit if its predictions correspond too closely to the training dataset such that improvements reported on the training dataset are not reflected when the model is run on a testing dataset (or in the real world). To simulate training and testing datasets, we can take our X features and y labels and subdivide them into X_train, X_test, y_train, y_test using the train_test_split function. The testing dataset should only be used during final model evaluation when we're ready to report the overall effectiveness of our final machine learning model.
# The goal of model selection is to choose not just the model with the
# lowest error, but a model that actually translates well to the real
# world. A train-test split simulates the "real world" by holding out
# part of the data from the model, and then only evaluating on the
# held-out test data at the end of the machine learning process.
from sklearn.model_selection import train_test_split
# test_size=0.2 indicates 80% training dataset and 20% testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# During model training, use only the training dataset
reg = LinearRegression().fit(X_train, y_train)
print("Model:", model_parameters(reg, X_train.columns))
# During model evaluation, use the testing dataset
print("Error:", mean_squared_error(y_test, reg.predict(X_test), squared=False))
Model: 7.23 + -0.03(temp) + -0.08(humidity) + 0.03(dew) + 0.43(PAS) Error: 2.3097590903105507
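One quick way to check for overfitting is to compare the model's error on the training dataset against its error on the testing dataset; a much lower training error would suggest the model is overfit. A minimal sketch using the reg fitted above:
# Compare training error to testing error to check for overfitting
print("Train error:", mean_squared_error(y_train, reg.predict(X_train), squared=False))
print("Test error: ", mean_squared_error(y_test, reg.predict(X_test), squared=False))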
Feature selection
Feature selection describes the process of selecting only a subset of features in order to improve the quality of a machine learning model. In the air quality sensor calibration study, we can begin with all the features and gradually remove the least-important features one-by-one.
# Pick the variable/feature with the lowest coefficient (slope) weight,
# and remove that variable/feature from the features list!
# Note also that the coefficient (slope) weights differ in magnitude.
# The scale of the data affects coefficient magnitude, so one thing ML
# engineers often do is apply a z-score: subtract each feature's mean
# and divide by its standard deviation so features share a common scale.
# Coefficients also differ based on each feature's relationship with the
# target label.
features = ["PAS", "humidity"]
reg = LinearRegression().fit(X_train.loc[:, features], y_train)
print("Model:", model_parameters(reg, features))
print("Error:", mean_squared_error(y_test, reg.predict(X_test.loc[:, features]), squared=False))
Model: 7.23 + 0.43(PAS) + -0.08(humidity) + -0.03(temp) + 0.03(dew) Error: 2.3097590903105507
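As the comments above note, coefficient magnitudes depend on feature scale, so comparing raw coefficients can be misleading. A minimal sketch, assuming we z-score the features with scikit-learn's StandardScaler before fitting so the coefficients are on comparable scales:
from sklearn.preprocessing import StandardScaler

# Z-score each feature: subtract its mean and divide by its standard deviation
scaler = StandardScaler().fit(X_train)
X_train_z = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
reg_z = LinearRegression().fit(X_train_z, y_train)
print("Model:", model_parameters(reg_z, X_train_z.columns))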
Recursive feature elimination (RFE) automates this process by starting with a model that includes all the variables and removes features from the model starting with the ones that contribute the least weight (smallest coefficients in a linear regression).
from sklearn.feature_selection import RFE
# Remove 1 feature per step until half the original features remain
# A concern for cross-validation: how do we know what to set for step= or
# n_features_to_select=? We're actually introducing a bias in this very
# choice!
rfe = RFE(LinearRegression(), step=1, n_features_to_select=2, verbose=1)
# How is it overfitting when I peek at the test error to choose n_features_to_select?
# --> This is basically a decision I am making about what I think real-world
#     data will look like. I'm overfitting because I'm making this decision based
#     just on the data that my model has access to! Machine learning models aim to
#     answer questions on new, unseen data, but our decisions as ML engineers and
#     designers also have implications for what we assume real-world data will
#     look like.
rfe.fit(X_train, y_train)
# Show the final subset of features
rfe_features = X.columns[[r == 1 for r in rfe.ranking_]]
print("Features:", list(rfe_features))
# Extract the last LinearRegression model trained on the final subset of features
reg = rfe.estimator_
print("Model:", model_parameters(reg, rfe_features))
print("Error:", mean_squared_error(y_test, rfe.predict(X_test), squared=False))
Fitting estimator with 4 features. Fitting estimator with 3 features. Features: ['humidity', 'PAS'] Model: 6.28 + -0.07(humidity) + 0.43(PAS) Error: 2.3079571351026877
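RFE assigns rank 1 to the features it keeps and higher ranks to features eliminated earlier; a short sketch to inspect the full ranking:
# Map each feature name to its RFE rank (1 = selected)
print(dict(zip(X.columns, rfe.ranking_)))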
Cross validation
How do we know when to stop removing features during feature selection? We can certainly use intuition or look at the changes in error as we remove each feature. But this still requires us to evaluate the model somehow. If the testing dataset can only be used at the end of model evaluation, it was wrong of us to use the testing dataset during feature selection!
Cross-validation provides one way to help us explore different models before we choose a final model for evaluation at the end. Cross-validation lets us evaluate models without touching the testing dataset by introducing new validation datasets.
The simplest way to cross-validate is to call the cross_val_score helper function on an unfitted machine learning algorithm and the dataset. This function will further subdivide the training dataset into 5 folds and, for each of the 5 folds, train a separate model on the training folds and evaluate it on the validation fold.
The scoring parameter can accept the string name of a scorer function. Higher (more positive) scores are considered better, so we use the negative RMSE value as the scorer function.
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
cross_val_score(
estimator=DecisionTreeRegressor(max_depth=2),
X=X_train,
y=y_train,
scoring="neg_root_mean_squared_error",
verbose=3,
)
[CV] END ............................... score: (test=-2.820) total time= 0.0s [CV] END ............................... score: (test=-3.039) total time= 0.0s [CV] END ............................... score: (test=-2.986) total time= 0.0s [CV] END ............................... score: (test=-2.865) total time= 0.0s [CV] END ............................... score: (test=-3.627) total time= 0.0s
array([-2.82019847, -3.03903411, -2.98609911, -2.86499044, -3.6272734 ])
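Each fold produces its own score, so to compare models we typically summarize them with the mean, which is what GridSearchCV reports below. A minimal sketch:
scores = cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=2),
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
)
# Average negated RMSE across the 5 validation folds (closer to 0 is better)
print("Mean score:", scores.mean())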
As we've seen throughout these lessons on machine learning, we prefer to automate our processes. Rather than modifying max_depth by hand and manually tweaking the values until we find something that works, we can use GridSearchCV to exhaustively search all hyperparameter options. Here, the first 5 folds for max_depth=2 are exactly the same as cross_val_score.
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(
estimator=DecisionTreeRegressor(),
param_grid={"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10]},
scoring="neg_root_mean_squared_error",
verbose=3,
)
search.fit(X_train, y_train)
# Show the best score and best estimator at the end of hyperparameter search
print("Mean score for best model:", search.best_score_)
reg = search.best_estimator_
print("Best model:", reg)
Fitting 5 folds for each of 9 candidates, totalling 45 fits [CV 1/5] END ......................max_depth=2;, score=-2.820 total time= 0.0s [CV 2/5] END ......................max_depth=2;, score=-3.039 total time= 0.0s [CV 3/5] END ......................max_depth=2;, score=-2.986 total time= 0.0s [CV 4/5] END ......................max_depth=2;, score=-2.865 total time= 0.0s [CV 5/5] END ......................max_depth=2;, score=-3.627 total time= 0.0s [CV 1/5] END ......................max_depth=3;, score=-2.649 total time= 0.0s [CV 2/5] END ......................max_depth=3;, score=-2.788 total time= 0.0s [CV 3/5] END ......................max_depth=3;, score=-2.656 total time= 0.0s [CV 4/5] END ......................max_depth=3;, score=-2.685 total time= 0.0s [CV 5/5] END ......................max_depth=3;, score=-3.337 total time= 0.0s [CV 1/5] END ......................max_depth=4;, score=-2.413 total time= 0.0s [CV 2/5] END ......................max_depth=4;, score=-2.499 total time= 0.0s [CV 3/5] END ......................max_depth=4;, score=-2.417 total time= 0.0s [CV 4/5] END ......................max_depth=4;, score=-2.414 total time= 0.0s [CV 5/5] END ......................max_depth=4;, score=-2.668 total time= 0.0s [CV 1/5] END ......................max_depth=5;, score=-2.338 total time= 0.0s [CV 2/5] END ......................max_depth=5;, score=-2.421 total time= 0.0s [CV 3/5] END ......................max_depth=5;, score=-2.371 total time= 0.0s [CV 4/5] END ......................max_depth=5;, score=-2.358 total time= 0.0s [CV 5/5] END ......................max_depth=5;, score=-2.604 total time= 0.0s [CV 1/5] END ......................max_depth=6;, score=-2.271 total time= 0.0s [CV 2/5] END ......................max_depth=6;, score=-2.323 total time= 0.0s [CV 3/5] END ......................max_depth=6;, score=-2.555 total time= 0.0s [CV 4/5] END ......................max_depth=6;, score=-2.299 total time= 0.0s [CV 5/5] END ......................max_depth=6;, score=-2.620 total time= 0.0s [CV 1/5] END ......................max_depth=7;, score=-2.262 total time= 0.0s [CV 2/5] END ......................max_depth=7;, score=-2.265 total time= 0.0s [CV 3/5] END ......................max_depth=7;, score=-2.289 total time= 0.0s [CV 4/5] END ......................max_depth=7;, score=-2.284 total time= 0.0s [CV 5/5] END ......................max_depth=7;, score=-2.522 total time= 0.0s [CV 1/5] END ......................max_depth=8;, score=-2.243 total time= 0.0s [CV 2/5] END ......................max_depth=8;, score=-2.308 total time= 0.0s [CV 3/5] END ......................max_depth=8;, score=-2.292 total time= 0.0s [CV 4/5] END ......................max_depth=8;, score=-2.189 total time= 0.0s [CV 5/5] END ......................max_depth=8;, score=-2.612 total time= 0.0s [CV 1/5] END ......................max_depth=9;, score=-2.239 total time= 0.0s [CV 2/5] END ......................max_depth=9;, score=-2.363 total time= 0.0s [CV 3/5] END ......................max_depth=9;, score=-2.318 total time= 0.0s [CV 4/5] END ......................max_depth=9;, score=-2.272 total time= 0.0s [CV 5/5] END ......................max_depth=9;, score=-2.637 total time= 0.0s [CV 1/5] END .....................max_depth=10;, score=-2.388 total time= 0.0s [CV 2/5] END .....................max_depth=10;, score=-2.380 total time= 0.0s [CV 3/5] END .....................max_depth=10;, score=-2.356 total time= 0.0s [CV 4/5] END .....................max_depth=10;, score=-2.303 total time= 0.0s [CV 5/5] END 
.....................max_depth=10;, score=-2.739 total time= 0.0s Mean score for best model: -2.324314363585862 Best model: DecisionTreeRegressor(max_depth=7)
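GridSearchCV also records the per-candidate results in its cv_results_ attribute; a short sketch that tabulates the mean validation score for each max_depth:
# Summarize the hyperparameter search as a table of mean scores per candidate
results = pd.DataFrame(search.cv_results_)
print(results[["param_max_depth", "mean_test_score", "rank_test_score"]])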
Finally, we can report the test error for the best model by evaluating it against the testing dataset. Why is the testing dataset error different from the mean score for the best model printed above?
print("Error:", mean_squared_error(y_test, reg.predict(X_test), squared=False))
Error: 2.206756068626163
Visualizing decision tree models
Last time, we plotted the predictions for a linear regression model that was trained to take PAS measurements and predict AQS measurements. What do you think a decision tree model would look like for this simplified PurpleAir sensor calibration problem?
Here's a complete workflow for decision tree model evaluation using the practices above. The resulting plot compares a linear regression model (lmplot) against the decisions made by a decision tree.
# Split dataset into 80% training dataset and 20% testing dataset
X = sensor_data.drop("AQS", axis=1)
y = sensor_data["AQS"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Recursive feature elimination to select the single most important feature based on slope value
rfe = RFE(LinearRegression(), n_features_to_select=1, verbose=1)
rfe.fit(X_train, y_train)
# Print the best feature to predict AQS
rfe_feature = X.columns[rfe.ranking_.argmin()]
print("Best feature to predict AQS:", rfe_feature)
# Use only the best feature
X = X[[rfe_feature]]
X_train = X_train[[rfe_feature]]
X_test = X_test[[rfe_feature]]
# Grid search cross-validation to tune the max_depth hyperparameter using RMSE loss metric
search = GridSearchCV(
estimator=DecisionTreeRegressor(),
param_grid={"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10]},
scoring="neg_root_mean_squared_error",
verbose=3,
)
search.fit(X_train, y_train)
# Print the best score and best estimator at the end of hyperparameter search
print("Mean score for best model:", search.best_score_)
reg = search.best_estimator_
print("Best model:", reg)
# During model evaluation, use the testing dataset
print("Test error:", mean_squared_error(y_test, search.predict(X_test), squared=False))
# Visualize the tree decisions compared to a LinearRegression model (lmplot)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.tree import plot_tree
sns.set_theme()
grid = sns.lmplot(sensor_data, x=rfe_feature, y="AQS")
# Create a demonstration dataset that counts from 0 to the max PAS value
X_demo = pd.DataFrame(np.arange(X[rfe_feature].max()), columns=[rfe_feature])
grid.facet_axis(0, 0).plot(X_demo, reg.predict(X_demo), c="orange")
grid.set(title=f"lmplot vs {reg}")
# Show nodes in the decision tree
plt.figure(dpi=300)
plot_tree(
reg,
max_depth=2, # Only show the first two levels
feature_names=[rfe_feature],
label="root",
filled=True,
impurity=False,
proportion=True,
rounded=False
) and None # Hide return value of plot_tree
Fitting estimator with 4 features. Fitting estimator with 3 features. Fitting estimator with 2 features. Best feature to predict AQS: PAS Fitting 5 folds for each of 9 candidates, totalling 45 fits [CV 1/5] END ......................max_depth=2;, score=-3.154 total time= 0.0s [CV 2/5] END ......................max_depth=2;, score=-2.914 total time= 0.0s [CV 3/5] END ......................max_depth=2;, score=-2.762 total time= 0.0s [CV 4/5] END ......................max_depth=2;, score=-2.861 total time= 0.0s [CV 5/5] END ......................max_depth=2;, score=-3.488 total time= 0.0s [CV 1/5] END ......................max_depth=3;, score=-3.011 total time= 0.0s [CV 2/5] END ......................max_depth=3;, score=-2.717 total time= 0.0s [CV 3/5] END ......................max_depth=3;, score=-2.569 total time= 0.0s [CV 4/5] END ......................max_depth=3;, score=-2.645 total time= 0.0s [CV 5/5] END ......................max_depth=3;, score=-3.031 total time= 0.0s [CV 1/5] END ......................max_depth=4;, score=-2.804 total time= 0.0s [CV 2/5] END ......................max_depth=4;, score=-2.621 total time= 0.0s [CV 3/5] END ......................max_depth=4;, score=-2.451 total time= 0.0s [CV 4/5] END ......................max_depth=4;, score=-2.541 total time= 0.0s [CV 5/5] END ......................max_depth=4;, score=-2.728 total time= 0.0s [CV 1/5] END ......................max_depth=5;, score=-2.668 total time= 0.0s [CV 2/5] END ......................max_depth=5;, score=-2.593 total time= 0.0s [CV 3/5] END ......................max_depth=5;, score=-2.476 total time= 0.0s [CV 4/5] END ......................max_depth=5;, score=-2.519 total time= 0.0s [CV 5/5] END ......................max_depth=5;, score=-2.700 total time= 0.0s [CV 1/5] END ......................max_depth=6;, score=-2.611 total time= 0.0s [CV 2/5] END ......................max_depth=6;, score=-2.619 total time= 0.0s [CV 3/5] END ......................max_depth=6;, score=-2.492 total time= 0.0s [CV 4/5] END ......................max_depth=6;, score=-2.539 total time= 0.0s [CV 5/5] END ......................max_depth=6;, score=-2.704 total time= 0.0s [CV 1/5] END ......................max_depth=7;, score=-2.756 total time= 0.0s [CV 2/5] END ......................max_depth=7;, score=-2.643 total time= 0.0s [CV 3/5] END ......................max_depth=7;, score=-2.523 total time= 0.0s [CV 4/5] END ......................max_depth=7;, score=-2.573 total time= 0.0s [CV 5/5] END ......................max_depth=7;, score=-2.726 total time= 0.0s [CV 1/5] END ......................max_depth=8;, score=-2.815 total time= 0.0s [CV 2/5] END ......................max_depth=8;, score=-2.695 total time= 0.0s [CV 3/5] END ......................max_depth=8;, score=-2.530 total time= 0.0s [CV 4/5] END ......................max_depth=8;, score=-2.595 total time= 0.0s [CV 5/5] END ......................max_depth=8;, score=-2.746 total time= 0.0s [CV 1/5] END ......................max_depth=9;, score=-2.834 total time= 0.0s [CV 2/5] END ......................max_depth=9;, score=-2.776 total time= 0.0s [CV 3/5] END ......................max_depth=9;, score=-2.581 total time= 0.0s [CV 4/5] END ......................max_depth=9;, score=-2.584 total time= 0.0s [CV 5/5] END ......................max_depth=9;, score=-2.802 total time= 0.0s [CV 1/5] END .....................max_depth=10;, score=-2.844 total time= 0.0s [CV 2/5] END .....................max_depth=10;, score=-2.785 total time= 0.0s [CV 3/5] END 
.....................max_depth=10;, score=-2.544 total time= 0.0s [CV 4/5] END .....................max_depth=10;, score=-2.632 total time= 0.0s [CV 5/5] END .....................max_depth=10;, score=-2.826 total time= 0.0s Mean score for best model: -2.591353522186599 Best model: DecisionTreeRegressor(max_depth=5) Test error: 2.62480017696235
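To connect the visualization back to the selected hyperparameter, a brief sketch inspecting the fitted tree chosen by the search (get_depth and get_n_leaves are methods on a fitted DecisionTreeRegressor):
# The best estimator's actual depth and number of leaf (prediction) nodes
print("Depth:", reg.get_depth())
print("Leaves:", reg.get_n_leaves())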
Why is this a kind of overfitting (using the term loosely here) no matter which choice you make? Because either way, I'm making a choice about my hypothesis for what real-world data will look like.
The testing dataset error rates for both the DecisionTreeRegressor and the LinearRegression models are not too far apart. Which model would you prefer to use? Justify your answer using either the error rates below or the visualizations above.
print("Tree error:", mean_squared_error(y_test, search.predict(X_test), squared=False))
print("Line error:", mean_squared_error(
y_test,
LinearRegression().fit(X_train, y_train).predict(X_test),
squared=False
))
Tree error: 2.62480017696235 Line error: 2.6376118905476775
Earlier, we discussed how overfitting to the testing dataset can be mitigated by using cross-validation. But in answering the previous question on whether we should prefer a DecisionTreeRegressor model or a LinearRegression model, we've actually overfit the model by hand! Explain how preferring one model over the other according to the visualization overfits to the testing dataset.