Bias, Variance, Bias, and Bias¶
When we choose between different machine learning models, we also encode what we expect from data in the real world. In this lesson, we'll expand on model evaluation to develop a theory of "bias" situated in computational and sociological perspectives. By the end of this lesson, students will be able to:
- Explain the bias–variance tradeoff for a given model.
- Identify a data imbalance from a data visualization.
- Identify the factors that determine a machine learning model's predictive capabilities.
# For interactive slider widget
from ipywidgets import interact
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme()
Like before, we'll load in our sensor data, but this time focus only on using the PurpleAir sensor (PAS) measurements to predict the EPA Air Quality Sensor (AQS) measurements using a DecisionTreeRegressor model.
sensor_data = pd.read_csv("sensor_data.csv")
X = sensor_data[["PAS"]]
y = sensor_data["AQS"]
# Create a demonstration dataset that counts from 0 to the max PAS value
X_plot = pd.DataFrame(np.arange(sensor_data["PAS"].max()), columns=["PAS"])
sensor_data
|  | AQS | temp | humidity | dew | PAS |
|---|---|---|---|---|---|
| 0 | 6.7 | 18.027263 | 38.564815 | 3.629662 | 8.616954 |
| 1 | 3.8 | 16.115280 | 49.404315 | 5.442318 | 3.493916 |
| 2 | 4.0 | 19.897634 | 29.972222 | 1.734051 | 3.799601 |
| 3 | 4.7 | 21.378334 | 32.474513 | 4.165624 | 4.369691 |
| 4 | 3.2 | 18.443822 | 43.898226 | 5.867611 | 3.191071 |
| ... | ... | ... | ... | ... | ... |
| 12092 | 5.5 | -12.101337 | 54.188889 | -19.555834 | 2.386120 |
| 12093 | 16.8 | 4.159967 | 56.256030 | -3.870659 | 32.444987 |
| 12094 | 15.6 | 1.707895 | 65.779221 | -4.083768 | 25.297018 |
| 12095 | 14.0 | -14.380144 | 48.206481 | -23.015378 | 8.213208 |
| 12096 | 5.8 | 5.081813 | 52.200000 | -4.016401 | 9.436011 |

12097 rows × 5 columns
Bias–variance tradeoff¶
During model evaluation, we used GridSearchCV for 5-fold cross-validation to select the max_depth hyperparameter value. We can plot the validation errors as a line plot showing the relationship between max_depth and the validation score (negative RMSE).
max_depths = list(range(1, 13))
# Grid search cross-validation to tune the max_depth hyperparameter using RMSE loss metric
search = GridSearchCV(
estimator=DecisionTreeRegressor(),
param_grid={"max_depth": max_depths},
scoring="neg_root_mean_squared_error",
verbose=1,
)
search.fit(X, y)
# Print the best score and best estimator at the end of hyperparameter search
reg = search.best_estimator_
print("Best model:", reg)
print("Mean score:", search.best_score_)
# Load results of cross-validation into a DataFrame
cv_results = pd.DataFrame(search.cv_results_)[[
"param_max_depth",
"split0_test_score",
"split1_test_score",
"split2_test_score",
"split3_test_score",
"split4_test_score",
"mean_test_score",
]]
cv_results = cv_results.melt(id_vars=["param_max_depth"]).set_index("param_max_depth") * (-1)
# Plot the validation scores with confidence intervals
grid = sns.relplot(cv_results, x="param_max_depth", y="value", kind="line")
grid.set(title="DecisionTreeRegressor CV error", xlabel="Max depth", ylabel="Validation RMSE")
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best model: DecisionTreeRegressor(max_depth=4)
Mean score: -2.7255644852153065
Line plot titled "DecisionTreeRegressor CV error" showing validation RMSE on the y-axis against max depth on the x-axis.
A very complex model such as a DecisionTreeRegressor(max_depth=12) is highly sensitive to variance in the training dataset: each time we sample a new training dataset, the decision tree learns the noise in the sample rather than the underlying trend. A model has high variance if it is very sensitive to different training dataset samples.

On the other hand, a very simple model such as a DecisionTreeRegressor(max_depth=1) won't work well in the real world because it hasn't learned enough about the training dataset. A model has high bias if it misses important relationships between features and labels. Ideally, we want to choose a model that balances this bias–variance tradeoff using, for example, cross-validation.
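To make "sensitivity to different training dataset samples" concrete, here is a minimal sketch (not part of the original lesson) that refits a shallow tree and a deep tree on several random re-samples of the data and measures how much their predictions at the same PAS values change from fit to fit. We would expect the max_depth=12 tree to show a much larger spread than the max_depth=1 tree.

# Rough illustration of variance: refit each tree on random 80% re-samples of the data
# and measure how much its predictions at the same PAS values move between fits
prediction_spread = {}
for depth in [1, 12]:
    predictions = []
    for seed in range(10):
        sample = sensor_data.sample(frac=0.8, random_state=seed)
        tree = DecisionTreeRegressor(max_depth=depth).fit(sample[["PAS"]], sample["AQS"])
        predictions.append(tree.predict(X_plot))
    # Standard deviation across the 10 fits at each PAS value, averaged into one number
    prediction_spread[f"max_depth={depth}"] = np.std(predictions, axis=0).mean()
print(prediction_spread)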
# Randomly re-sample a training dataset and a testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
@interact(max_depth=(1, 12, 1))
def lmplot_compare(max_depth=12):
    grid = sns.lmplot(sensor_data, x="PAS", y="AQS", ci=None)
    reg = DecisionTreeRegressor(max_depth=max_depth).fit(X_train, y_train)
    grid.facet_axis(0, 0).plot(X_plot, reg.predict(X_plot), color="orange", linewidth=3)
    train_error = mean_squared_error(y_train, reg.predict(X_train), squared=False)
    test_error = mean_squared_error(y_test, reg.predict(X_test), squared=False)
    grid.set(title=f"Training error = {train_error:.2f}, Testing error = {test_error:.2f}")
Interactive output with an IntSlider for max_depth (1 to 12); moving the slider redraws the plot with the chosen tree's predictions and its training and testing errors.
Imbalanced data¶
But cross-validation doesn't solve all of our problems: because it depends on our dataset, it can reproduce the selection bias inherent in that dataset. In the sensor dataset, there are only 5 observations with AQS higher than 50. But if our goal is to produce reliable real-time air quality predictions from PurpleAir sensor measurements, our model needs to be accurate for higher values, not just lower ones.
sensor_data[sensor_data["AQS"] > 50]
|  | AQS | temp | humidity | dew | PAS |
|---|---|---|---|---|---|
| 194 | 59.1 | 23.860340 | 35.887500 | 7.812386 | 133.754410 |
| 196 | 52.7 | 28.782407 | 36.045833 | 12.268621 | 114.234139 |
| 197 | 60.0 | 24.871142 | 51.743056 | 14.246790 | 123.463042 |
| 261 | 53.4 | 28.782407 | 36.045833 | 12.268621 | 114.234139 |
| 1720 | 108.8 | 14.383810 | 38.623134 | 0.375354 | 256.309609 |
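To see how this imbalance plays out, here is a rough check (not part of the original lesson) that reuses reg, the best estimator from the grid search above, and compares its RMSE on the few high-AQS observations against the rest of the dataset. Since the grid search refit reg on all of the data, both numbers are optimistic, but the gap between them is what matters.

# Compare the model's error on the rare high-AQS rows against the common low-AQS rows
high = sensor_data["AQS"] > 50
rmse_high = mean_squared_error(y[high], reg.predict(X[high]), squared=False)
rmse_low = mean_squared_error(y[~high], reg.predict(X[~high]), squared=False)
print(f"RMSE on AQS > 50:  {rmse_high:.2f}")
print(f"RMSE on AQS <= 50: {rmse_low:.2f}")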
Algorithmic bias¶
In this case, the underrepresentation of larger PAS and AQS values leads to systematic and repeatable errors that appear any time we run into larger values. Algorithmic bias occurs when a machine learning model produces categorically unfair outcomes: here, our decision tree is more effective at predicting small values than large ones.
Imbalanced data is one reason why algorithmic bias occurs, but the training dataset is only one of the components that shape a machine learning model:
- Training dataset, which can embed data imbalances and the surrounding data setting.
- Machine learning algorithm, which defines the kinds of patterns that the model can learn.
- Evaluation metrics, which provide the target that the model optimizes (see the sketch after this list).
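As one hypothetical illustration of the evaluation metrics component (not part of the original lesson, and the 10x weight below is an arbitrary choice), we could score the model with a weighted RMSE that counts errors on high-AQS observations more heavily, so that a model that ignores them looks worse under the metric we chose.

# Hypothetical weighted metric: count errors on high-AQS rows 10 times more heavily
weights = np.where(y > 50, 10, 1)
weighted_rmse = mean_squared_error(y, reg.predict(X), sample_weight=weights, squared=False)
print(f"Weighted RMSE: {weighted_rmse:.2f}")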
# Recreate the tidy (long-form) dataframe for Min degree = associate's
index = pd.MultiIndex.from_product([
["White", "Black", "Hispanic", "Asian", "Pacific Islander", "American Indian/Alaska Native"],
range(2009, 2019),
], names=["Race", "Year"])
data = pd.DataFrame([
47.1, 48.9, 50.1, 49.9, 51.0, 51.9, 54.0, 54.3, 53.5, 53.6,
27.8, 29.4, 29.8, 31.6, 29.5, 32.0, 31.1, 31.7, 32.7, 32.6,
18.4, 20.5, 20.6, 22.7, 23.1, 23.4, 25.7, 27.0, 27.7, 30.5,
66.7, 63.4, 64.6, 68.3, 67.2, 70.3, 71.7, 71.5, 69.9, 75.5,
20.9, 22.0, 39.7, 32.4, 37.3, float("nan"), 24.9, 28.6, 35.8, 22.6,
20.8, 28.9, 25.0, 23.6, 26.3, 18.2, 22.3, 16.5, 27.1, 24.4
], index=index, columns=["Percentage"])
# Recreate the line plot comparing educational attainment by race
grid = sns.relplot(data, x="Year", y="Percentage", hue="Race", kind="line")
grid.set(title="Asian associate's attainment reaches new heights")
Line plot titled "Asian associate's attainment reaches new heights" showing the percentage of associate's degree attainment per year from 2009 to 2018, with one line per race.
Optional: Creating custom comparison plots¶
The interact slider widget that we can use to explore the plots above is great, but doesn't work well for static (non-interactive) presentation formats like PDF. Instead, we can make a larger comparison plot by manually creating a FacetGrid and calling the map or map_dataframe methods.
def plot_decisions(data, reg_cls, *args, **kwargs):
"""Trains a regression model for the given data hyperparameters and plots predictions."""
hyperparameters = data.iloc[0, :]
reg = reg_cls(**hyperparameters).fit(X_train, y_train)
plt.plot(X_plot, reg.predict(X_plot), **kwargs)
# Manually create a 4-column FacetGrid to compare different hyperparameter values
grid = sns.FacetGrid(pd.DataFrame(max_depths, columns=["max_depth"]), col="max_depth", col_wrap=4)
# First, plot a linear regression model in the background
grid.map(sns.regplot, data=sensor_data, x="PAS", y="AQS", ci=None, color="lightgray")
# Then, plot the specified regression algorithm for each hyperparameter value
grid.map_dataframe(plot_decisions, reg_cls=DecisionTreeRegressor, color="orange", linewidth=3)
A 4-column facet grid with one panel per max_depth value from 1 to 12, each overlaying the decision tree's predictions (orange) on a light gray linear regression baseline.