from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

# Load the homes dataset from homes.csv into a DataFrame.
homes = pd.read_csv("homes.csv")
# Bare expression: when run as a notebook cell, this displays the DataFrame.
homes

def model_parameters(reg, columns):
    """Render a fitted linear model as a human-readable equation string.

    Args:
        reg: A fitted estimator exposing ``coef_`` (an iterable of slopes)
            and ``intercept_`` (a number).
        columns: An indexable sequence of feature names, where position
            ``i`` names the feature that ``reg.coef_[i]`` multiplies.

    Returns:
        The intercept followed by one ``slope(name)`` term per coefficient,
        each number rounded to two decimals and the terms joined by " + ",
        e.g. ``"3.00 + 1.50(a) + -2.00(b)"``.
    """
    slope_terms = []
    for position, slope in enumerate(reg.coef_):
        feature_name = columns[position]
        slope_terms.append(f"{slope:.2f}({feature_name})")

    intercept_term = f"{reg.intercept_:.2f}"
    return " + ".join([intercept_term, *slope_terms])

# Features: every column except the target "price". get_dummies expands
# categorical columns into one indicator column per category.
X = pd.get_dummies(homes.drop("price", axis=1))
# Target: the sale price.
y = homes["price"]
reg = LinearRegression().fit(X, y)

print("Model:", model_parameters(reg, X.columns))
# Mean squared error, measured on the same rows the model was fitted on
# (a training error, not a held-out estimate).
print("Error:", mean_squared_error(y, reg.predict(X)))

Model: 2667451.86 + -91657.73(beds) + -46672.61(bath) + -2986.99(year_built) + 1611.73(sqft) + 2563.08(price_per_sqft) + -1081.63(elevation) + -173696.89(city_NY) + 173696.89(city_SF)
Error: 976495129525.2129

# Compare two mean squared errors; the right-hand value is the training MSE of the homes model.
# NOTE(review): the left-hand value is not computed in any visible cell - presumably the MSE of another model; confirm.
315746013053.41156 < 976495129525.2129

True

# Load the sensor dataset from sensor_data.csv into a DataFrame.
sensor_data = pd.read_csv("sensor_data.csv")
# Bare expression: when run as a notebook cell, this displays the DataFrame.
sensor_data

from sklearn.metrics import root_mean_squared_error

# Features: every column except the target "AQS".
X = sensor_data.drop("AQS", axis=1)
# Target: the AQS column.
y = sensor_data["AQS"]
# Fit on the full dataset (no train/test split yet).
reg = LinearRegression().fit(X, y)

print("Model:", model_parameters(reg, X.columns))
# RMSE, or root mean squared error, is the square root of the mean of the squared errors.
# It is computed here on the same rows used for fitting, so it is a training error.
print("Error:", root_mean_squared_error(y, reg.predict(X)))

Model: 7.11 + -0.03(temp) + -0.08(humidity) + 0.03(dew) + 0.43(PAS)
Error: 2.267367483110072

from sklearn.model_selection import train_test_split

# test_size=0.2 indicates 80% training dataset and 20% testing dataset
# random_state=0 seeds the shuffle so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# During model training, use only the training dataset
reg = LinearRegression().fit(X_train, y_train)

print("Model:", model_parameters(reg, X_train.columns))
# During model evaluation, use the testing dataset the model was not fitted on
print("Error:", root_mean_squared_error(y_test, reg.predict(X_test)))

Model: 7.49 + -0.04(temp) + -0.09(humidity) + 0.04(dew) + 0.43(PAS)
Error: 2.279871556601017

# RMSE of the current model on the training dataset, for comparison with its testing error
print("Error:", root_mean_squared_error(y_train, reg.predict(X_train)))

Error: 2.2645273215512147

# Hand-picked subset of feature columns to train on
features = ["PAS", "temp", "dew"]

# Fit using only the chosen columns of the training dataset
reg = LinearRegression().fit(X_train.loc[:, features], y_train)
print("Model:", model_parameters(reg, features))
# Evaluate on the same columns of the testing dataset
print("Error:", root_mean_squared_error(y_test, reg.predict(X_test.loc[:, features])))

Model: 1.19 + 0.43(PAS) + 0.14(temp) + -0.16(dew)
Error: 2.2934548158537518

from sklearn.feature_selection import RFE

# Remove 1 feature per step until half the original features remain
# (a float n_features_to_select is read as a fraction of the features; verbose=1 logs each fit)
rfe = RFE(LinearRegression(), step=1, n_features_to_select=0.5, verbose=1)
rfe.fit(X_train, y_train)

# Show the final subset of features
rfe_features = rfe.get_feature_names_out()
print("Features:", list(rfe_features))

# Extract the last LinearRegression model trained on the final subset of features
reg = rfe.estimator_

print("Model:", model_parameters(reg, rfe_features))
# rfe.predict reduces X_test to the selected features before predicting with the final estimator
print("Error:", root_mean_squared_error(y_test, rfe.predict(X_test)))

Fitting estimator with 4 features.
Fitting estimator with 3 features.
Features: ['humidity', 'PAS']
Model: 6.28 + -0.07(humidity) + 0.43(PAS)
Error: 2.2763688116092813

# Imported here because this cell uses DecisionTreeRegressor before the
# file's own import of it, which only appears in a later cell.
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree limited to 12 levels on the training dataset.
reg = DecisionTreeRegressor(max_depth=12)
reg.fit(X_train, y_train)
# RMSE on the testing dataset; as the last expression of a notebook cell,
# its value is displayed as the cell output.
root_mean_squared_error(y_test, reg.predict(X_test))

2.35894080048194

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Cross-validate a depth-10 decision tree on the training dataset only.
# No cv argument is passed, so scikit-learn's default splitting is used.
# The scorer returns the negated RMSE, so scores closer to zero are better.
# verbose=3 prints the score and timing of each fold.
cross_val_score(
    estimator=DecisionTreeRegressor(max_depth=10),
    X=X_train,
    y=y_train,
    scoring="neg_root_mean_squared_error",
    verbose=3,
)

[CV] END ............................... score: (test=-2.367) total time=   0.1s
[CV] END ............................... score: (test=-2.167) total time=   0.1s
[CV] END ............................... score: (test=-2.448) total time=   0.1s
[CV] END ............................... score: (test=-2.612) total time=   0.1s
[CV] END ............................... score: (test=-2.593) total time=   0.1s

array([-2.36672821, -2.16695547, -2.44759552, -2.61203203, -2.59262568])

from sklearn.model_selection import GridSearchCV

# Rebuild the features/target and the seeded 80/20 split from the full sensor dataset
X = sensor_data.drop("AQS", axis=1)
y = sensor_data["AQS"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Cross-validate one DecisionTreeRegressor per candidate max_depth (2 through 12),
# scored by negated RMSE so that a higher (closer to zero) score is better
search = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid={"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]},
    scoring="neg_root_mean_squared_error",
    verbose=4,
)
# Only the training dataset is used for the search; the testing dataset stays held out
search.fit(X_train, y_train)

# Show the best score and best estimator at the end of hyperparameter search
# (best_score_ is the mean cross-validated score of the winning candidate)
print("Mean score for best model:", search.best_score_)
reg = search.best_estimator_
print("Best model:", reg)

Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV 1/5] END ......................max_depth=2;, score=-2.852 total time=   0.0s
[CV 2/5] END ......................max_depth=2;, score=-3.118 total time=   0.0s
[CV 3/5] END ......................max_depth=2;, score=-2.936 total time=   0.0s
[CV 4/5] END ......................max_depth=2;, score=-2.988 total time=   0.0s
[CV 5/5] END ......................max_depth=2;, score=-3.451 total time=   0.0s
[CV 1/5] END ......................max_depth=3;, score=-2.647 total time=   0.0s
[CV 2/5] END ......................max_depth=3;, score=-2.874 total time=   0.0s
[CV 3/5] END ......................max_depth=3;, score=-2.726 total time=   0.0s
[CV 4/5] END ......................max_depth=3;, score=-2.699 total time=   0.0s
[CV 5/5] END ......................max_depth=3;, score=-2.934 total time=   0.0s
[CV 1/5] END ......................max_depth=4;, score=-2.468 total time=   0.0s
[CV 2/5] END ......................max_depth=4;, score=-2.461 total time=   0.0s
[CV 3/5] END ......................max_depth=4;, score=-2.501 total time=   0.0s
[CV 4/5] END ......................max_depth=4;, score=-2.431 total time=   0.0s
[CV 5/5] END ......................max_depth=4;, score=-2.694 total time=   0.0s
[CV 1/5] END ......................max_depth=5;, score=-2.383 total time=   0.0s
[CV 2/5] END ......................max_depth=5;, score=-2.347 total time=   0.0s
[CV 3/5] END ......................max_depth=5;, score=-2.379 total time=   0.0s
[CV 4/5] END ......................max_depth=5;, score=-2.328 total time=   0.0s
[CV 5/5] END ......................max_depth=5;, score=-2.610 total time=   0.0s
[CV 1/5] END ......................max_depth=6;, score=-2.326 total time=   0.0s
[CV 2/5] END ......................max_depth=6;, score=-2.187 total time=   0.0s
[CV 3/5] END ......................max_depth=6;, score=-2.300 total time=   0.0s
[CV 4/5] END ......................max_depth=6;, score=-2.242 total time=   0.0s
[CV 5/5] END ......................max_depth=6;, score=-2.576 total time=   0.0s
[CV 1/5] END ......................max_depth=7;, score=-2.290 total time=   0.0s
[CV 2/5] END ......................max_depth=7;, score=-2.110 total time=   0.0s
[CV 3/5] END ......................max_depth=7;, score=-2.245 total time=   0.0s
[CV 4/5] END ......................max_depth=7;, score=-2.207 total time=   0.0s
[CV 5/5] END ......................max_depth=7;, score=-2.532 total time=   0.0s
[CV 1/5] END ......................max_depth=8;, score=-2.311 total time=   0.0s
[CV 2/5] END ......................max_depth=8;, score=-2.131 total time=   0.0s
[CV 3/5] END ......................max_depth=8;, score=-2.331 total time=   0.0s
[CV 4/5] END ......................max_depth=8;, score=-2.256 total time=   0.0s
[CV 5/5] END ......................max_depth=8;, score=-2.519 total time=   0.1s
[CV 1/5] END ......................max_depth=9;, score=-2.335 total time=   0.1s
[CV 2/5] END ......................max_depth=9;, score=-2.102 total time=   0.1s
[CV 3/5] END ......................max_depth=9;, score=-2.451 total time=   0.1s
[CV 4/5] END ......................max_depth=9;, score=-2.230 total time=   0.1s
[CV 5/5] END ......................max_depth=9;, score=-2.555 total time=   0.1s
[CV 1/5] END .....................max_depth=10;, score=-2.365 total time=   0.1s
[CV 2/5] END .....................max_depth=10;, score=-2.146 total time=   0.1s
[CV 3/5] END .....................max_depth=10;, score=-2.437 total time=   0.1s
[CV 4/5] END .....................max_depth=10;, score=-2.345 total time=   0.1s
[CV 5/5] END .....................max_depth=10;, score=-2.598 total time=   0.1s
[CV 1/5] END .....................max_depth=11;, score=-2.399 total time=   0.1s
[CV 2/5] END .....................max_depth=11;, score=-2.229 total time=   0.1s
[CV 3/5] END .....................max_depth=11;, score=-2.498 total time=   0.1s
[CV 4/5] END .....................max_depth=11;, score=-2.398 total time=   0.1s
[CV 5/5] END .....................max_depth=11;, score=-2.629 total time=   0.1s
[CV 1/5] END .....................max_depth=12;, score=-2.448 total time=   0.1s
[CV 2/5] END .....................max_depth=12;, score=-2.306 total time=   0.1s
[CV 3/5] END .....................max_depth=12;, score=-2.651 total time=   0.1s
[CV 4/5] END .....................max_depth=12;, score=-2.719 total time=   0.1s
[CV 5/5] END .....................max_depth=12;, score=-2.640 total time=   0.1s
Mean score for best model: -2.27676368712062
Best model: DecisionTreeRegressor(max_depth=7)

# RMSE of the current model (reg) on the held-out testing dataset
print("Error:", root_mean_squared_error(y_test, reg.predict(X_test)))

Error: 2.3035535039577764

# Split dataset into 80% training dataset and 20% testing dataset
# (no random_state is passed, so the split and every number below can differ between runs)
X = sensor_data.drop("AQS", axis=1)
y = sensor_data["AQS"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Recursive feature elimination to select the single most important feature,
# judged by the coefficients (slopes) of the LinearRegression fitted at each step
rfe = RFE(LinearRegression(), n_features_to_select=1, verbose=1)
rfe.fit(X_train, y_train)
# Print the best feature to predict AQS
# (ranking_ gives the selected feature rank 1, so argmin is its column position)
rfe_feature = X.columns[rfe.ranking_.argmin()]
print("Best feature to predict AQS:", rfe_feature)
# Use only the best feature
# (double brackets keep each result a one-column DataFrame rather than a Series)
X = X[[rfe_feature]]
X_train = X_train[[rfe_feature]]
X_test = X_test[[rfe_feature]]

# Grid search cross-validation to tune the max_depth hyperparameter using RMSE loss metric
# (scores are negated RMSE, so a higher score means a lower error)
search = GridSearchCV(
    estimator=DecisionTreeRegressor(),
    param_grid={"max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10]},
    scoring="neg_root_mean_squared_error",
    verbose=3,
)
search.fit(X_train, y_train)
# Print the best score and best estimator at the end of hyperparameter search
print("Mean score for best model:", search.best_score_)
reg = search.best_estimator_
print("Best model:", reg)

# During model evaluation, use the testing dataset
print("Test error:", root_mean_squared_error(y_test, search.predict(X_test)))

# Visualize the tree decisions compared to a LinearRegression model (lmplot)
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.tree import plot_tree
sns.set_theme()
# Scatter plot of the selected feature against AQS with a fitted regression line
grid = sns.lmplot(sensor_data, x=rfe_feature, y="AQS")
# Create a demonstration dataset that counts from 0 up to (not including) the max value
# of the selected feature, in steps of 1
X_demo = pd.DataFrame(np.arange(X[rfe_feature].max()), columns=[rfe_feature])
# Overlay the tree's predictions for the demonstration dataset on the lmplot axes
grid.facet_axis(0, 0).plot(X_demo, reg.predict(X_demo), c="orange", linewidth=3)
grid.set(title=f"lmplot vs {reg}")
# Show nodes in the decision tree
plt.figure(dpi=300)
plot_tree(
    reg,
    max_depth=2, # Limit how deep the drawing goes; the fitted tree itself may be deeper
    feature_names=[rfe_feature],
    label="root",
    filled=True,
    impurity=False,
    proportion=True,
    rounded=False
);

Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.
Best feature to predict AQS: PAS
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ......................max_depth=2;, score=-2.794 total time=   0.0s
[CV 2/5] END ......................max_depth=2;, score=-2.960 total time=   0.0s
[CV 3/5] END ......................max_depth=2;, score=-3.525 total time=   0.0s
[CV 4/5] END ......................max_depth=2;, score=-2.819 total time=   0.0s
[CV 5/5] END ......................max_depth=2;, score=-3.130 total time=   0.0s
[CV 1/5] END ......................max_depth=3;, score=-2.576 total time=   0.0s
[CV 2/5] END ......................max_depth=3;, score=-2.723 total time=   0.0s
[CV 3/5] END ......................max_depth=3;, score=-3.093 total time=   0.0s
[CV 4/5] END ......................max_depth=3;, score=-2.603 total time=   0.0s
[CV 5/5] END ......................max_depth=3;, score=-2.954 total time=   0.0s
[CV 1/5] END ......................max_depth=4;, score=-2.476 total time=   0.0s
[CV 2/5] END ......................max_depth=4;, score=-2.525 total time=   0.0s
[CV 3/5] END ......................max_depth=4;, score=-2.863 total time=   0.0s
[CV 4/5] END ......................max_depth=4;, score=-2.455 total time=   0.0s
[CV 5/5] END ......................max_depth=4;, score=-2.813 total time=   0.0s
[CV 1/5] END ......................max_depth=5;, score=-2.459 total time=   0.0s
[CV 2/5] END ......................max_depth=5;, score=-2.494 total time=   0.0s
[CV 3/5] END ......................max_depth=5;, score=-2.816 total time=   0.0s
[CV 4/5] END ......................max_depth=5;, score=-2.442 total time=   0.0s
[CV 5/5] END ......................max_depth=5;, score=-2.710 total time=   0.0s
[CV 1/5] END ......................max_depth=6;, score=-2.451 total time=   0.0s
[CV 2/5] END ......................max_depth=6;, score=-2.548 total time=   0.0s
[CV 3/5] END ......................max_depth=6;, score=-2.812 total time=   0.0s
[CV 4/5] END ......................max_depth=6;, score=-2.446 total time=   0.0s
[CV 5/5] END ......................max_depth=6;, score=-2.670 total time=   0.0s
[CV 1/5] END ......................max_depth=7;, score=-2.482 total time=   0.0s
[CV 2/5] END ......................max_depth=7;, score=-2.565 total time=   0.0s
[CV 3/5] END ......................max_depth=7;, score=-2.878 total time=   0.0s
[CV 4/5] END ......................max_depth=7;, score=-2.466 total time=   0.0s
[CV 5/5] END ......................max_depth=7;, score=-2.684 total time=   0.0s
[CV 1/5] END ......................max_depth=8;, score=-2.624 total time=   0.0s
[CV 2/5] END ......................max_depth=8;, score=-2.605 total time=   0.0s
[CV 3/5] END ......................max_depth=8;, score=-2.898 total time=   0.0s
[CV 4/5] END ......................max_depth=8;, score=-2.480 total time=   0.0s
[CV 5/5] END ......................max_depth=8;, score=-2.695 total time=   0.0s
[CV 1/5] END ......................max_depth=9;, score=-2.681 total time=   0.0s
[CV 2/5] END ......................max_depth=9;, score=-2.621 total time=   0.0s
[CV 3/5] END ......................max_depth=9;, score=-2.970 total time=   0.0s
[CV 4/5] END ......................max_depth=9;, score=-2.513 total time=   0.0s
[CV 5/5] END ......................max_depth=9;, score=-2.711 total time=   0.0s
[CV 1/5] END .....................max_depth=10;, score=-2.774 total time=   0.0s
[CV 2/5] END .....................max_depth=10;, score=-2.715 total time=   0.0s
[CV 3/5] END .....................max_depth=10;, score=-2.973 total time=   0.0s
[CV 4/5] END .....................max_depth=10;, score=-2.547 total time=   0.0s
[CV 5/5] END .....................max_depth=10;, score=-2.774 total time=   0.0s
Mean score for best model: -2.5841136333686174
Best model: DecisionTreeRegressor(max_depth=5)
Test error: 2.4596026762053023

# Testing RMSE of the tuned decision tree (search.predict uses the best estimator found)
print("Tree error:", root_mean_squared_error(y_test, search.predict(X_test)))
# Testing RMSE of a LinearRegression fitted on the same training dataset, as a baseline
print("Line error:", root_mean_squared_error(
    y_test,
    LinearRegression().fit(X_train, y_train).predict(X_test),
))

Tree error: 2.4596026762053023
Line error: 2.489013997463602

	beds	bath	price	year_built	sqft	price_per_sqft	elevation	city
0	2.0	1.0	999000	1960	1000	999	10	NY
1	2.0	2.0	2750000	2006	1418	1939	0	NY
2	2.0	2.0	1350000	1900	2150	628	9	NY
3	1.0	1.0	629000	1903	500	1258	9	NY
4	0.0	1.0	439000	1930	500	878	10	NY
...	...	...	...	...	...	...	...	...
487	5.0	2.5	1800000	1890	3073	586	76	SF
488	2.0	1.0	695000	1923	1045	665	106	SF
489	3.0	2.0	1650000	1922	1483	1113	106	SF
490	1.0	1.0	649000	1983	850	764	163	SF
491	3.0	2.0	995000	1956	1305	762	216	SF

	AQS	temp	humidity	dew	PAS
0	6.7	18.027263	38.564815	3.629662	8.616954
1	3.8	16.115280	49.404315	5.442318	3.493916
2	4.0	19.897634	29.972222	1.734051	3.799601
3	4.7	21.378334	32.474513	4.165624	4.369691
4	3.2	18.443822	43.898226	5.867611	3.191071
...	...	...	...	...	...
12092	5.5	-12.101337	54.188889	-19.555834	2.386120
12093	16.8	4.159967	56.256030	-3.870659	32.444987
12094	15.6	1.707895	65.779221	-4.083768	25.297018
12095	14.0	-14.380144	48.206481	-23.015378	8.213208
12096	5.8	5.081813	52.200000	-4.016401	9.436011

Model Evaluation

Dummy variables

Overfitting

Feature selection

Cross validation

Visualizing decision tree models