Learning Algorithms¶
Inspired by Sam Lau, who co-authored the Learning Data Science book.
In this lesson, we'll introduce machine learning from the ground up. By the end of this lesson, students will be able to:
- Describe the difference between traditional algorithms and machine learning algorithms.
- Identify the components of a machine learning model and dataset features and labels.
- Apply sklearn to train a decision tree for classification and regression tasks.
A while back, we discussed data visualization using the Puget Sound Clean Air Agency's EPA-grade air quality sensors (AQS). However, these sensors are typically expensive, costing anywhere between \$15,000 and \$40,000 each, making it hard to deploy a large number of them. Furthermore, EPA-grade AQS measurements undergo calibration and accuracy checks that lead to delays of one or two hours, so the data is very accurate but not necessarily timely.
In contrast, "PurpleAir makes sensors that empower Community Scientists who collect hyper-local air quality data and share it with the public." In this lesson, we'll learn how we can use more accurate but less timely AQS measurements to calibrate the less accurate but more timely PurpleAir sensor (PAS) measurements so that we can provide the best information to the general public. The concepts in this lesson are actually used in the real world when you visit the EPA AirNow Fire and Smoke Map: the PAS data in this map are calibrated using the approach we will learn today.
import pandas as pd
import seaborn as sns
sns.set_theme()
Our dataset includes over 12,000 matched observations where we've paired each AQS measurement with a nearby PAS measurement, along with 3 other variables that experts have identified as potentially impacting PAS measurement quality. The dataset includes 5 columns:
- The very accurate EPA-grade air quality sensor (AQS) measurement of the PM2.5 value.
- The temperature in degrees Celsius.
- The relative humidity as a percentage between 0% and 100%.
- The dew point, where a higher dew point means more moisture in the air.
- The less accurate but more timely PurpleAir sensor (PAS) measurement of the PM2.5 value.
How can we use the PAS measurement together with the temperature, humidity, and dew point to predict the AQS measurement?
sensor_data = pd.read_csv("sensor_data.csv")
sensor_data
|  | AQS | temp | humidity | dew | PAS |
|---|---|---|---|---|---|
| 0 | 6.7 | 18.027263 | 38.564815 | 3.629662 | 8.616954 |
| 1 | 3.8 | 16.115280 | 49.404315 | 5.442318 | 3.493916 |
| 2 | 4.0 | 19.897634 | 29.972222 | 1.734051 | 3.799601 |
| 3 | 4.7 | 21.378334 | 32.474513 | 4.165624 | 4.369691 |
| 4 | 3.2 | 18.443822 | 43.898226 | 5.867611 | 3.191071 |
| ... | ... | ... | ... | ... | ... |
| 12092 | 5.5 | -12.101337 | 54.188889 | -19.555834 | 2.386120 |
| 12093 | 16.8 | 4.159967 | 56.256030 | -3.870659 | 32.444987 |
| 12094 | 15.6 | 1.707895 | 65.779221 | -4.083768 | 25.297018 |
| 12095 | 14.0 | -14.380144 | 48.206481 | -23.015378 | 8.213208 |
| 12096 | 5.8 | 5.081813 | 52.200000 | -4.016401 | 9.436011 |
12097 rows × 5 columns
Let's use data visualization to understand this dataset by scatter-plotting PAS against AQS. (Why do we place PAS on the x-axis and AQS on the y-axis?)
sns.relplot(sensor_data, x="PAS", y="AQS")
Guessing game¶
Let's start with a simpler task: How can we use the PAS measurement alone to predict the AQS measurement? To do this, let's choose a line that best describes the trend, or the regression line that is produced by calling lmplot.
sns.lmplot(sensor_data, x="PAS", y="AQS")
def plot_line(slope, intercept=0):
grid = sns.relplot(sensor_data, x="PAS", y="AQS")
grid.facet_axis(0, 0).plot([0, 260], [intercept, slope * 260 + intercept], c="orange")
grid.set(title=f"Slope = {slope:.2f}, intercept = {intercept:.2f}")
return grid
plot_line(0.4)
sensor_data["AQS"]
0         6.7
1         3.8
2         4.0
3         4.7
4         3.2
         ...
12092     5.5
12093    16.8
12094    15.6
12095    14.0
12096     5.8
Name: AQS, Length: 12097, dtype: float64
sensor_data["PAS"] * 0.4 # f(x) = 0.4x
0         3.446781
1         1.397567
2         1.519840
3         1.747876
4         1.276428
           ...
12092     0.954448
12093    12.977995
12094    10.118807
12095     3.285283
12096     3.774405
Name: PAS, Length: 12097, dtype: float64
absolute_error = sensor_data["AQS"] - sensor_data["PAS"] * 0.4
absolute_error
0         3.253219
1         2.402433
2         2.480160
3         2.952124
4         1.923572
           ...
12092     4.545552
12093     3.822005
12094     5.481193
12095    10.714717
12096     2.025595
Length: 12097, dtype: float64
absolute_error[absolute_error < 0]
1081    -1.658926
1088    -1.476082
1325    -0.325128
1328    -0.503032
1581    -0.004646
          ...
11785   -1.914624
11786   -2.722031
11821   -0.029849
12005   -2.131291
12006   -3.051198
Length: 662, dtype: float64
absolute_error.abs().mean()
3.377812356842188
(absolute_error ** 2).mean()
16.815882470137378
What differentiates machine learning from just human guessing is the use of an algorithm to find the best line, which requires a metric for the quality of a trend.
We can visualize all of our guesses so far by plotting them against their mean squared errors on what's called a loss surface.
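Concretely, for a candidate slope $\theta$ (with no intercept, as in our guesses so far), the mean squared error over the $n$ observations is the average squared difference between each true AQS value $y_i$ and the prediction $\theta x_i$ computed from the PAS value $x_i$:
$$ \text{MSE}(\theta) = \frac{1}{n} \sum_{i=1}^{n} \left( y_i - \theta x_i \right)^2 $$
This is exactly what (absolute_error ** 2).mean() computed above, and what plot_loss evaluates for each candidate slope below.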
def plot_loss(slopes):
    losses = []
    for slope in slopes:
        error = sensor_data["AQS"] - sensor_data["PAS"] * slope
        losses.append((error ** 2).mean())
    grid = sns.relplot(x=slopes, y=losses)
    grid.set(title="Loss surface", xlabel="Slope", ylabel="MSE", xlim=[0, 1], ylim=[0, None])
    return grid
plot_loss([0.1 * i for i in range(11)])
Gradient descent¶
So how do we write a machine learning algorithm that can optimize this metric and find the minimum mean squared error on the loss surface so that it selects the best possible line? Machine learning scientists apply concepts from calculus and linear algebra to solve this problem: select a random initial theta (slope) value and then roll down the hill toward the minimum value at the bottom of the bowl. We can express this using numpy, a numeric computation module for Python that is a building block for pandas and sklearn (as we'll see later).
$$ \nabla_{\!\theta}\; \text{MSE}(\boldsymbol{\theta}, \mathbf{X}, \mathbf{y}) = -\frac{2}{n}(\mathbf{X}^\top \mathbf{y} - \mathbf{X}^\top \mathbf{X} \boldsymbol{\theta}) $$
import numpy as np
def grad_mse(theta, X, y):
return np.array(- 2 / len(X) * (X.T @ y - X.T @ X * theta))
thetas = [np.random.random()]
print("Random initial theta value:", thetas[-1])
Random initial theta value: 0.7511928515894967
We can then take a small step in the opposite direction of the gradient to roll down the hill until we converge on a good guess. To make this a machine learning algorithm, we simply put the update step in a loop until the theta values no longer noticeably change.
plot_line(thetas[-1])
plot_loss(thetas)
# Take a small step in the opposite direction of the gradient to roll downhill
thetas.append(thetas[-1] - 0.002 * grad_mse(thetas[-1], sensor_data["PAS"], sensor_data["AQS"]))
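Here's a minimal sketch of that loop, assuming the 0.002 step size from the cell above and an illustrative convergence tolerance of 1e-6 (neither value is the only reasonable choice), with an iteration cap as a safety net:
for _ in range(10_000):  # cap on the number of update steps
    # Take a small step in the opposite direction of the gradient
    thetas.append(thetas[-1] - 0.002 * grad_mse(thetas[-1], sensor_data["PAS"], sensor_data["AQS"]))
    # Stop once the theta values no longer noticeably change
    if abs(thetas[-1] - thetas[-2]) < 1e-6:
        break
print("Slope found by gradient descent:", float(thetas[-1]))
plot_loss(thetas)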
Linear regression models¶
What we've just described is the gradient descent algorithm for fitting a linear regression model. A linear regression model is a machine learning model that is used to predict a numeric value (like AQS measurements) using a linear combination of coefficients and features (columns from the training dataset). scikit-learn provides an easy way to define a linear regression model, fit it to our training dataset X and target values y, and examine the coefficients to look inside the model.
f(x) = Ax + b
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
X = sensor_data[["PAS"]]
y = sensor_data["AQS"]
reg = LinearRegression().fit(X, y)
print("Model:", " + ".join([f"{reg.intercept_:.2f}"] + [f"{coef:.2f}({X.columns[i]})" for i, coef in enumerate(reg.coef_)]))
print("Error:", mean_squared_error(y, reg.predict(X)))
plot_line(reg.coef_[0], reg.intercept_)
Model: 3.18 + 0.40(PAS)
Error: 6.343125879204806
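Once fitted, we can also use reg to calibrate new readings directly. As a small sketch (the PAS value of 10 below is hypothetical, not a row from our dataset):
# Predict the calibrated AQS value for a hypothetical PAS reading of 10
reg.predict(pd.DataFrame({"PAS": [10]}))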
This fitting procedure is more or less how lmplot works!
sns.lmplot(sensor_data, x="PAS", y="AQS")
But the advantage of designing our own model is that we can combine other variables to reduce the mean squared error loss. The final model that the EPA uses only takes into account the sensor measurement and the relative humidity, but not any other variables. Later, we'll learn why they made this decision.
X = sensor_data[["PAS", "humidity"]]
y = sensor_data["AQS"]
reg = LinearRegression().fit(X, y)
print("Model:", " + ".join([f"{reg.intercept_:.2f}"] + [f"{coef:.2f}({X.columns[i]})" for i, coef in enumerate(reg.coef_)]))
print("Error:", mean_squared_error(y, reg.predict(X)))
plot_line(reg.coef_[0], reg.intercept_)
Model: 6.25 + 0.43(PAS) + -0.07(humidity)
Error: 5.144725853340175
Classification versus regression¶
Everything we've seen so far falls under the category of regression, where we aim to predict a numeric target value (one column) from one or more features (one or more other columns). The other main category of problems is classification, which is just like regression except we aim to predict a categorical target value. For example, we might want to answer the question: How can we predict whether a house belongs in NY or SF from its beds, baths, price, year of construction, square footage, price per square foot, and elevation?
homes = pd.read_csv("homes.csv")
homes
|  | beds | bath | price | year_built | sqft | price_per_sqft | elevation | city |
|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 1.0 | 999000 | 1960 | 1000 | 999 | 10 | NY |
| 1 | 2.0 | 2.0 | 2750000 | 2006 | 1418 | 1939 | 0 | NY |
| 2 | 2.0 | 2.0 | 1350000 | 1900 | 2150 | 628 | 9 | NY |
| 3 | 1.0 | 1.0 | 629000 | 1903 | 500 | 1258 | 9 | NY |
| 4 | 0.0 | 1.0 | 439000 | 1930 | 500 | 878 | 10 | NY |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 487 | 5.0 | 2.5 | 1800000 | 1890 | 3073 | 586 | 76 | SF |
| 488 | 2.0 | 1.0 | 695000 | 1923 | 1045 | 665 | 106 | SF |
| 489 | 3.0 | 2.0 | 1650000 | 1922 | 1483 | 1113 | 106 | SF |
| 490 | 1.0 | 1.0 | 649000 | 1983 | 850 | 764 | 163 | SF |
| 491 | 3.0 | 2.0 | 995000 | 1956 | 1305 | 762 | 216 | SF |
492 rows × 8 columns
Let's learn about decision trees, a machine learning algorithm that can be used for classification (and, as it turns out, regression too). Decision trees learn a nested if-then-else logical hierarchy to fit a training dataset. In the following visualization, the color and opacity of each box represents whether that subset of homes is more likely to be in NY or SF. The text inside each node indicates the following information:
- The first line shows the condition. If the condition is true, go left; if not, go right.
- The second line shows the percentage of samples represented by that node.
- The third line shows the proportion of homes within that sample that belong in ["NY", "SF"].
- The fourth line shows the majority class in that category, corresponding to the bigger number on the third line.
from sklearn.tree import DecisionTreeClassifier, plot_tree
X = homes.drop("city", axis=1)
y = homes["city"]
clf = DecisionTreeClassifier(max_depth=2).fit(X, y)
import matplotlib.pyplot as plt
plt.figure(dpi=300)
plot_tree(
clf,
feature_names=X.columns,
class_names=["NY", "SF"],
label="root",
filled=True,
impurity=False,
proportion=True,
rounded=False
);
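As a quick sanity check that isn't part of the original lesson, we can ask how often this depth-2 tree assigns the correct city to the homes it was trained on, using scikit-learn's accuracy_score:
from sklearn.metrics import accuracy_score
# Fraction of training homes the depth-2 tree classifies correctly
print("Training accuracy:", accuracy_score(y, clf.predict(X)))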
We can also use this dataset for regression. Write a one-line expression to train a DecisionTreeRegressor model to predict the price of a home in this dataset from all other variables.
homes.drop("price", axis=1)
|  | beds | bath | year_built | sqft | price_per_sqft | elevation | city |
|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 1.0 | 1960 | 1000 | 999 | 10 | NY |
| 1 | 2.0 | 2.0 | 2006 | 1418 | 1939 | 0 | NY |
| 2 | 2.0 | 2.0 | 1900 | 2150 | 628 | 9 | NY |
| 3 | 1.0 | 1.0 | 1903 | 500 | 1258 | 9 | NY |
| 4 | 0.0 | 1.0 | 1930 | 500 | 878 | 10 | NY |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 487 | 5.0 | 2.5 | 1890 | 3073 | 586 | 76 | SF |
| 488 | 2.0 | 1.0 | 1923 | 1045 | 665 | 106 | SF |
| 489 | 3.0 | 2.0 | 1922 | 1483 | 1113 | 106 | SF |
| 490 | 1.0 | 1.0 | 1983 | 850 | 764 | 163 | SF |
| 491 | 3.0 | 2.0 | 1956 | 1305 | 762 | 216 | SF |
492 rows × 7 columns
from sklearn.tree import DecisionTreeRegressor
reg = DecisionTreeRegressor().fit(homes.drop("price", axis=1), homes["price"])
reg
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_167/297773281.py in ?()
      1 from sklearn.tree import DecisionTreeRegressor
      2
----> 3 reg = DecisionTreeRegressor().fit(homes.drop("price", axis=1), homes["price"])
      4 reg
...
ValueError: could not convert string to float: 'NY'
# how to use "city" column?
homes_with_city_encoded = pd.get_dummies(homes, columns=["city"], dummy_na=True)
homes_with_city_encoded
|  | beds | bath | price | year_built | sqft | price_per_sqft | elevation | city_NY | city_SF | city_nan |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 1.0 | 999000 | 1960 | 1000 | 999 | 10 | True | False | False |
| 1 | 2.0 | 2.0 | 2750000 | 2006 | 1418 | 1939 | 0 | True | False | False |
| 2 | 2.0 | 2.0 | 1350000 | 1900 | 2150 | 628 | 9 | True | False | False |
| 3 | 1.0 | 1.0 | 629000 | 1903 | 500 | 1258 | 9 | True | False | False |
| 4 | 0.0 | 1.0 | 439000 | 1930 | 500 | 878 | 10 | True | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 487 | 5.0 | 2.5 | 1800000 | 1890 | 3073 | 586 | 76 | False | True | False |
| 488 | 2.0 | 1.0 | 695000 | 1923 | 1045 | 665 | 106 | False | True | False |
| 489 | 3.0 | 2.0 | 1650000 | 1922 | 1483 | 1113 | 106 | False | True | False |
| 490 | 1.0 | 1.0 | 649000 | 1983 | 850 | 764 | 163 | False | True | False |
| 491 | 3.0 | 2.0 | 995000 | 1956 | 1305 | 762 | 216 | False | True | False |
492 rows × 10 columns
reg = DecisionTreeRegressor(max_depth=3).fit(homes_with_city_encoded.drop(columns=["price"]), homes_with_city_encoded["price"])
reg
DecisionTreeRegressor(max_depth=3)
plt.figure(dpi=300)
plot_tree(
reg,
label="root",
filled=True,
impurity=False,
proportion=True,
rounded=False
);
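As a rough check that isn't in the original notebook, we can compare the regression tree's predicted prices against the actual prices on the training data, reusing the mean_squared_error function imported earlier (X_encoded is just an illustrative helper name):
# Evaluate the depth-3 regression tree on its own training data
X_encoded = homes_with_city_encoded.drop(columns=["price"])
print("Training MSE:", mean_squared_error(homes_with_city_encoded["price"], reg.predict(X_encoded)))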
Consider each of the following tasks and answer whether they would be best modeled as classification or regression.
Predict whether an email is spam or not spam.
Classification, since the target value is the category "spam" or "not spam".
Predict the number of views a video will receive based on subscriber count.
Regression, since the target value is a number.
Predict the next word to appear in a sentence.
Classification, since the target value is to choose from the dictionary of all possible next words.
If you're curious about how ChatGPT works, Jay Mody has a good introduction to GPT in 60 Lines of NumPy.