import pandas as pd
import seaborn as sns

# Apply seaborn's default theme to every plot produced below.
sns.set_theme()

# Load the sensor readings; every later regression cell reads from this frame.
# NOTE(review): the path is relative, so the CSV must sit in the working directory.
sensor_data = pd.read_csv("sensor_data.csv")
# Bare expression: in a notebook this renders the frame as the cell output.
sensor_data

# `feature` (x) and `target` (y) are module-level settings read by plot_line,
# plot_loss and the regression cells further down.

# Do we want to predict AQS from the PAS value?
# NOTE: this pair is immediately overridden by the assignments below; to use
# this direction instead, remove or comment out the second pair.
feature = "PAS"
target  = "AQS"

# Or predict PAS from the AQS value?
# This is the pair that is actually in effect (feature = AQS, target = PAS).
feature = "AQS"
target  = "PAS"

# Scatter plot of the target against the chosen feature.
sns.relplot(sensor_data, x=feature, y=target)

<seaborn.axisgrid.FacetGrid at 0x7d31515dce10>

def plot_line(slope, intercept=0):
    """Scatter the sensor data and overlay the line ``y = slope * x + intercept``.

    Reads the module-level ``sensor_data``, ``feature`` and ``target``. The
    line segment runs from x = 0 to the largest observed feature value.

    Args:
        slope: Slope of the candidate line.
        intercept: y-intercept of the candidate line (defaults to 0).

    Returns:
        The seaborn FacetGrid holding the scatter plot and the line.
    """
    facet = sns.relplot(sensor_data, x=feature, y=target)

    # Two end points are enough to draw a straight line.
    x_end = sensor_data[feature].max()
    xs = [0, x_end]
    ys = [intercept, slope * x_end + intercept]

    # relplot without row/col facets has a single axes at position (0, 0).
    axes = facet.facet_axis(0, 0)
    axes.plot(xs, ys)

    facet.set(title=f"Slope = {slope:.2f}, intercept = {intercept:.2f}")
    return facet


# First hand-picked guess: slope 2.5 with the default intercept of 0.
plot_line(2.5)

<seaborn.axisgrid.FacetGrid at 0x7d315138b4d0>

# We need a metric that measures how well a candidate line fits the data.
# Two common choices:
#   - Mean absolute error: average of |prediction - actual|
#   - Mean squared error:  average of (prediction - actual) ** 2
#
# Candidate slope to score. It was previously undefined here (NameError);
# 2.5 matches the hand-picked guess passed to plot_line(2.5).
s = 2.5
squared_errors = (sensor_data[feature] * s - sensor_data[target]) ** 2
# NOTE(review): this name is the same as sklearn.metrics.mean_squared_error,
# which later cells import; after that import the name refers to the function.
mean_squared_error = squared_errors.mean()

def plot_loss(slopes):
    """Plot the mean squared error of ``y = slope * x`` for each candidate slope.

    Reads the module-level ``sensor_data``, ``feature`` and ``target``.

    Args:
        slopes: Iterable of candidate slopes (plain floats or numpy scalars).

    Returns:
        The seaborn FacetGrid with one point per slope (x = slope, y = MSE).
    """
    from sklearn.metrics import mean_squared_error

    x = sensor_data[feature]
    y_true = sensor_data[target]
    slope_values = [float(s) for s in slopes]
    # scikit-learn's signature is (y_true, y_pred).
    losses = [mean_squared_error(y_true, x * s) for s in slope_values]

    # Keep the usual [1, 3] window, but widen it (with a small margin) whenever
    # a slope falls outside it, so that no point is silently clipped off the
    # plot, e.g. gradient-descent iterates that start near 0.
    margin = 0.1
    x_min = min([1, *(s - margin for s in slope_values)])
    x_max = max([3, *(s + margin for s in slope_values)])

    grid = sns.relplot(x=slope_values, y=losses)
    grid.set(
        title="Loss surface",
        xlabel="Slope",
        ylabel="MSE",
        xlim=[x_min, x_max],
        ylim=[0, None],
    )
    return grid


# Guessing game: score a handful of hand-picked slopes and plot the MSE of each.
plot_loss([2, 2.1, 2.2, 2.5, 1.9, 1.8, 2.15, 2.08, 2.4, 1.5, 1.25, 1.3, 1.6, 1.65])

<seaborn.axisgrid.FacetGrid at 0x7d3147e2c2d0>

import numpy as np


def grad_mse(theta, X, y):
    """Return the gradient of the mean squared error with respect to ``theta``.

    The model is a linear fit without an intercept. Predictions are
    ``X * theta`` for a single feature (1-D ``X``, scalar ``theta``) and
    ``X @ theta`` for several features (2-D ``X`` with one column per feature,
    ``theta`` with one entry per column). For
    ``MSE = mean((prediction - y) ** 2)`` the gradient is
    ``-2 / n * X.T @ (y - prediction)``.

    Args:
        theta: Current parameter value(s).
        X: Feature values, shape ``(n,)`` or ``(n, d)``.
        y: Target values, shape ``(n,)``.

    Returns:
        A numpy array: 0-d for 1-D ``X``, shape ``(d,)`` for 2-D ``X``.

    Raises:
        ZeroDivisionError: If ``X`` is empty.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)
    scale = -2 / len(X)

    if X.ndim == 1:
        # Single feature: theta acts as a plain multiplier on every sample.
        predictions = X * theta
    else:
        # Several features: a true matrix-vector product is required; an
        # element-wise `*` here would give a wrong (d, d)-shaped result.
        predictions = X @ np.atleast_1d(theta)

    residuals = y - predictions
    return np.array(scale * (X.T @ residuals))


# History of slope estimates for gradient descent; the last entry is the
# current estimate. Start from a random value (np.random.random() draws from
# [0, 1)); no seed is set, so the starting point differs between runs.
thetas = [np.random.random()]
print("Random initial theta value:", thetas[-1])

Random initial theta value: 0.03631116004542245

# Show the line implied by the current (most recent) slope estimate.
plot_line(thetas[-1])

<seaborn.axisgrid.FacetGrid at 0x7d31482efc90>

# Show the current fit, then the loss of every slope visited so far.
plot_line(thetas[-1])
plot_loss(thetas)

# Take a small step in the opposite direction of the gradient to roll downhill
# (0.002 is the step size). Each execution of this statement appends exactly
# one new estimate to `thetas`, so it has to be re-run to keep descending.
thetas.append(thetas[-1] - 0.002 * grad_mse(thetas[-1], sensor_data[feature], sensor_data[target]))

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Double brackets keep X as a one-column DataFrame; y is the target Series.
X = sensor_data[[feature]]
y = sensor_data[target]

# LinearRegression is the learning algorithm. Once it has been fit to data
# (i.e. a slope and intercept have been chosen), the result is a linear
# regression model.
reg = LinearRegression()
reg.fit(X, y)

# Print the fitted equation as "intercept + coef(column) + ...".
print(
    "Model:",
    " + ".join(
        [
            f"{reg.intercept_:.2f}",
            *(f"{coef:.2f}({name})" for coef, name in zip(reg.coef_, X.columns)),
        ]
    ),
)
# Mean squared error of the model on the data it was trained on.
print(
    "Error:",
    mean_squared_error(y, reg.predict(X)),
)
plot_line(reg.coef_[0], reg.intercept_)

Model: -3.14 + 1.93(AQS)
Error: 30.259833701206063

<seaborn.axisgrid.FacetGrid at 0x7d3147cb4b10>

# seaborn can fit and draw the regression line itself: lmplot shows the scatter
# plot together with a fitted linear regression.
sns.lmplot(sensor_data, x=feature, y=target)

<seaborn.axisgrid.FacetGrid at 0x7d3147dcee90>

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Extra columns used as features alongside `feature`: temp, humidity, dew.
X = sensor_data[[feature, "temp", "humidity", "dew"]]
y = sensor_data[target]
reg = LinearRegression()
reg.fit(X, y)

# Print the fitted equation as "intercept + coef(column) + ...".
print(
    "Model:",
    " + ".join(
        [
            f"{reg.intercept_:.2f}",
            *(f"{coef:.2f}({name})" for coef, name in zip(reg.coef_, X.columns)),
        ]
    ),
)
# Mean squared error of the model on the data it was trained on.
print(
    "Error:",
    mean_squared_error(y, reg.predict(X)),
)

Model: -11.57 + 1.91(AQS) + 0.02(temp) + 0.17(humidity) + -0.04(dew)
Error: 23.01902262685766

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Question: which features are actually useful for predicting the target value?
# Here "dew" is left out; only temp and humidity accompany `feature`.
# Next time, we'll figure out how to evaluate which variables to include in
# our model.
X = sensor_data[[feature, "temp", "humidity"]]
y = sensor_data[target]
reg = LinearRegression()
reg.fit(X, y)

# Print the fitted equation as "intercept + coef(column) + ...".
print(
    "Model:",
    " + ".join(
        [
            f"{reg.intercept_:.2f}",
            *(f"{coef:.2f}({name})" for coef, name in zip(reg.coef_, X.columns)),
        ]
    ),
)
# The error is in squared units of the target value: (ug/m^3 of PM2.5)^2
print(
    "Error:",
    mean_squared_error(y, reg.predict(X)),
)

Model: -10.35 + 1.91(AQS) + -0.02(temp) + 0.16(humidity)
Error: 23.02688160738098

# Root mean squared error of the most recently fitted model (`reg`, `X`, `y`
# are whatever the last regression cell left behind).
mean_squared_error(y, reg.predict(X)) ** (0.5)
# Here, we've converted this back to ug/m^3 of PM2.5 by taking the square root

4.798633306200942

# Load the home listings used for the classification / tree examples below.
# NOTE(review): the path is relative, so the CSV must sit in the working directory.
homes = pd.read_csv("homes.csv")
# Bare expression: in a notebook this renders the frame as the cell output.
homes
# There are forms of linear classifiers (like logistic models)

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Predict the city of a listing from its numeric attributes.
X = homes[["beds", "bath", "price", "year_built", "sqft", "price_per_sqft", "elevation"]]
y = homes["city"]
# Arguments passed when constructing the classifier are what we call
# "hyperparameters"; max_depth=1 limits the tree to a single split.
clf = DecisionTreeClassifier(max_depth=1).fit(X, y)

import matplotlib.pyplot as plt
plt.figure(dpi=300)
plot_tree(
    clf,
    # Plain list of column names rather than a pandas Index.
    feature_names=list(X.columns),
    # Take the labels from the fitted classifier so they always match the
    # class order the tree uses, instead of hard-coding ["NY", "SF"].
    class_names=[str(label) for label in clf.classes_],
    label="root",
    filled=True,
    impurity=False,
    proportion=True,
    rounded=False,
);

from sklearn.tree import DecisionTreeClassifier, plot_tree

# Predict the city of a listing from its numeric attributes.
X = homes[["beds", "bath", "price", "year_built", "sqft", "price_per_sqft", "elevation"]]
y = homes["city"]
# Arguments passed when constructing the classifier are what we call
# "hyperparameters"; none are given here, so scikit-learn's defaults apply
# (in particular, no max_depth limit on the tree).
clf = DecisionTreeClassifier().fit(X, y)

import matplotlib.pyplot as plt
plt.figure(dpi=300)
plot_tree(
    clf,
    # Plain list of column names rather than a pandas Index.
    feature_names=list(X.columns),
    # Take the labels from the fitted classifier so they always match the
    # class order the tree uses, instead of hard-coding ["NY", "SF"].
    class_names=[str(label) for label in clf.classes_],
    label="root",
    filled=True,
    impurity=False,
    proportion=True,
    rounded=False,
);

from sklearn.tree import DecisionTreeRegressor

# Regression: predict the (numeric) price from every other column.
# The "city" column holds strings ("NY", "SF"), which the tree cannot convert
# to floats ("could not convert string to float: 'NY'"), so one-hot encode the
# non-numeric columns before fitting.
# NOTE(review): price_per_sqft looks like it is derived from price and sqft
# (target leakage) — confirm and consider dropping it from the features.
reg = DecisionTreeRegressor().fit(
    pd.get_dummies(homes.drop("price", axis=1), dtype=float),
    homes["price"],
)
reg

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_211/297773281.py in ?()
      1 from sklearn.tree import DecisionTreeRegressor
      2 
----> 3 reg = DecisionTreeRegressor().fit(homes.drop("price", axis=1), homes["price"])
      4 reg

/opt/conda/lib/python3.11/site-packages/sklearn/base.py in ?(estimator, *args, **kwargs)
   1470                 skip_parameter_validation=(
   1471                     prefer_skip_nested_validation or global_skip_validation
   1472                 )
   1473             ):
-> 1474                 return fit_method(estimator, *args, **kwargs)

/opt/conda/lib/python3.11/site-packages/sklearn/tree/_classes.py in ?(self, X, y, sample_weight, check_input)
   1373         self : DecisionTreeRegressor
   1374             Fitted estimator.
   1375         """
   1376 
-> 1377         super()._fit(
   1378             X,
   1379             y,
   1380             sample_weight=sample_weight,

/opt/conda/lib/python3.11/site-packages/sklearn/tree/_classes.py in ?(self, X, y, sample_weight, check_input, missing_values_in_feature_mask)
    248             check_X_params = dict(
    249                 dtype=DTYPE, accept_sparse="csc", force_all_finite=False
    250             )
    251             check_y_params = dict(ensure_2d=False, dtype=None)
--> 252             X, y = self._validate_data(
    253                 X, y, validate_separately=(check_X_params, check_y_params)
    254             )
    255 

/opt/conda/lib/python3.11/site-packages/sklearn/base.py in ?(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    641                 # :(
    642                 check_X_params, check_y_params = validate_separately
    643                 if "estimator" not in check_X_params:
    644                     check_X_params = {**default_check_params, **check_X_params}
--> 645                 X = check_array(X, input_name="X", **check_X_params)
    646                 if "estimator" not in check_y_params:
    647                     check_y_params = {**default_check_params, **check_y_params}
    648                 y = check_array(y, input_name="y", **check_y_params)

/opt/conda/lib/python3.11/site-packages/sklearn/utils/validation.py in ?(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
    994                         )
    995                     array = xp.astype(array, dtype, copy=False)
    996                 else:
    997                     array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
--> 998             except ComplexWarning as complex_warning:
    999                 raise ValueError(
   1000                     "Complex data not supported\n{}\n".format(array)
   1001                 ) from complex_warning

/opt/conda/lib/python3.11/site-packages/sklearn/utils/_array_api.py in ?(array, dtype, order, copy, xp)
    517         # Use NumPy API to support order
    518         if copy is True:
    519             array = numpy.array(array, order=order, dtype=dtype)
    520         else:
--> 521             array = numpy.asarray(array, order=order, dtype=dtype)
    522 
    523         # At this point array is a NumPy ndarray. We convert it to an array
    524         # container that is consistent with the input's namespace.

/opt/conda/lib/python3.11/site-packages/pandas/core/generic.py in ?(self, dtype, copy)
   2149     def __array__(
   2150         self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
   2151     ) -> np.ndarray:
   2152         values = self._values
-> 2153         arr = np.asarray(values, dtype=dtype)
   2154         if (
   2155             astype_is_view(values.dtype, arr.dtype)
   2156             and using_copy_on_write()

ValueError: could not convert string to float: 'NY'

	AQS	temp	humidity	dew	PAS
0	6.7	18.027263	38.564815	3.629662	8.616954
1	3.8	16.115280	49.404315	5.442318	3.493916
2	4.0	19.897634	29.972222	1.734051	3.799601
3	4.7	21.378334	32.474513	4.165624	4.369691
4	3.2	18.443822	43.898226	5.867611	3.191071
...	...	...	...	...	...
12092	5.5	-12.101337	54.188889	-19.555834	2.386120
12093	16.8	4.159967	56.256030	-3.870659	32.444987
12094	15.6	1.707895	65.779221	-4.083768	25.297018
12095	14.0	-14.380144	48.206481	-23.015378	8.213208
12096	5.8	5.081813	52.200000	-4.016401	9.436011

	beds	bath	price	year_built	sqft	price_per_sqft	elevation	city
0	2.0	1.0	999000	1960	1000	999	10	NY
1	2.0	2.0	2750000	2006	1418	1939	0	NY
2	2.0	2.0	1350000	1900	2150	628	9	NY
3	1.0	1.0	629000	1903	500	1258	9	NY
4	0.0	1.0	439000	1930	500	878	10	NY
...	...	...	...	...	...	...	...	...
487	5.0	2.5	1800000	1890	3073	586	76	SF
488	2.0	1.0	695000	1923	1045	665	106	SF
489	3.0	2.0	1650000	1922	1483	1113	106	SF
490	1.0	1.0	649000	1983	850	764	163	SF
491	3.0	2.0	995000	1956	1305	762	216	SF

Learning Algorithms

Guessing game

Gradient descent

Linear regression models

Classification versus regression