import pandas as pd
import seaborn as sns

sns.set_theme()

sensor_data = pd.read_csv("sensor_data.csv")
sensor_data

sns.relplot(sensor_data, x="PAS", y="AQS")

<seaborn.axisgrid.FacetGrid at 0x7c584597a080>

sns.lmplot(sensor_data, x="PAS", y="AQS")

<seaborn.axisgrid.FacetGrid at 0x7c5902633070>

def plot_line(slope, intercept=0, *, x_max=260):
    """Scatter AQS against PAS and overlay the line ``y = slope * x + intercept``.

    Args:
        slope: Slope of the candidate line.
        intercept: Value of the line at ``x = 0``. Defaults to 0.
        x_max: x coordinate at which the drawn line ends. Defaults to 260,
            the endpoint that used to be hard-coded.

    Returns:
        The seaborn ``FacetGrid`` holding the scatter plot and the line.
    """
    grid = sns.relplot(sensor_data, x="PAS", y="AQS")
    # The grid is not faceted here, so the only Axes is the one at (0, 0).
    ax = grid.facet_axis(0, 0)
    # Two endpoints are enough to draw a straight line from x = 0 to x = x_max.
    ax.plot([0, x_max], [intercept, slope * x_max + intercept], c="orange")
    grid.set(title=f"Slope = {slope:.2f}, intercept = {intercept:.2f}")
    return grid


plot_line(0.45)

<seaborn.axisgrid.FacetGrid at 0x7c583d5077f0>

sensor_data["PAS"]

0         8.616954
1         3.493916
2         3.799601
3         4.369691
4         3.191071
           ...    
12092     2.386120
12093    32.444987
12094    25.297018
12095     8.213208
12096     9.436011
Name: PAS, Length: 12097, dtype: float64

sensor_data["AQS"]

0         6.7
1         3.8
2         4.0
3         4.7
4         3.2
         ... 
12092     5.5
12093    16.8
12094    15.6
12095    14.0
12096     5.8
Name: AQS, Length: 12097, dtype: float64

sensor_data["PAS"] * 0.45

0         3.877629
1         1.572262
2         1.709820
3         1.966361
4         1.435982
           ...    
12092     1.073754
12093    14.600244
12094    11.383658
12095     3.695944
12096     4.246205
Name: PAS, Length: 12097, dtype: float64

# "Error" represents the difference between our guess and the actual label
error_point45 = sensor_data["PAS"] * 0.45 - sensor_data["AQS"]
error_point45

0        -2.822371
1        -2.227738
2        -2.290180
3        -2.733639
4        -1.764018
           ...    
12092    -4.426246
12093    -2.199756
12094    -4.216342
12095   -10.304056
12096    -1.553795
Length: 12097, dtype: float64

# Metric 1: "Mean Error" --- positive and negative errors can cancel each other out
error_point45.mean()

-2.559337341646414

# Metric 2: "Mean Absolute Error"
error_point45.abs().mean()

2.8898755399452463

# Metric 3: "Mean Squared Error" --- tends to penalize big errors more heavily than small errors
(error_point45 ** 2).mean()
# Supervised machine learning: using a metric to measure the quality of what the model has learned

13.182456516471547

# NOTE(review): `error_slope2` is not assigned in any visible cell, so this line
# raises NameError on a fresh top-to-bottom run. Presumably it was the error for a
# guessed slope of 2 (sensor_data["PAS"] * 2 - sensor_data["AQS"]) --- confirm and
# restore that assignment.
(error_slope2 ** 2).mean()

693.5889006554958

def plot_loss(slopes):
    """Plot the mean squared error of the guess ``AQS = slope * PAS`` per slope.

    Args:
        slopes: Iterable of candidate slopes to evaluate.

    Returns:
        The seaborn ``FacetGrid`` with one point per slope (x) and its MSE (y).
    """
    from sklearn.metrics import mean_squared_error as mse

    # Look the columns up once instead of once per slope.
    pas = sensor_data["PAS"]
    aqs = sensor_data["AQS"]
    # scikit-learn's signature is (y_true, y_pred): labels first, guesses second.
    losses = [mse(aqs, pas * s) for s in slopes]
    grid = sns.relplot(x=slopes, y=losses)
    # The x axis is fixed to [0, 1]; slopes outside that range are not visible.
    grid.set(title="Loss surface", xlabel="Slope", ylabel="MSE", xlim=[0, 1], ylim=[0, None])
    return grid


plot_loss([1, 0.5, 0.25, 0.45, 0.6, 0.75])

<seaborn.axisgrid.FacetGrid at 0x7c58538c4940>

import numpy as np


def grad_mse(theta, X, y):
    """Return the gradient of the mean squared error of ``X @ theta`` against ``y``.

    The gradient is ``-2 / n * (X.T @ y - X.T @ X @ theta)`` where ``n`` is the
    number of observations.

    Args:
        theta: Current parameter(s). A scalar when ``X`` is one-dimensional,
            otherwise one weight per column of ``X``.
        X: Feature values, either a 1-D sequence (single feature) or a 2-D
            ``(n, d)`` matrix. pandas objects are converted with ``np.asarray``,
            so rows are matched to ``y`` by position.
        y: The ``n`` target values.

    Returns:
        ``np.ndarray`` holding the gradient: 0-d for a 1-D ``X`` with scalar
        ``theta``, shape ``(d,)`` for a 2-D ``X``.

    Raises:
        ValueError: If ``X`` has no observations.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n = len(X)
    if n == 0:
        raise ValueError("grad_mse needs at least one observation")

    xty = X.T @ y
    xtx = X.T @ X
    if X.ndim == 1:
        # Single feature: X.T @ X is a scalar, so plain multiplication is right.
        xtx_theta = xtx * theta
    else:
        # Several features: `*` would broadcast elementwise and give a (d, d)
        # array; the gradient needs the matrix-vector product instead.
        xtx_theta = xtx @ np.atleast_1d(theta)
    return np.array(-2 / n * (xty - xtx_theta))


thetas = [np.random.random()]
print("Random initial theta value:", thetas[-1])

Random initial theta value: 0.964483396122684

plot_line(thetas[-1])
plot_loss(thetas)

# One gradient-descent update: move from the latest theta against the gradient
# of the MSE, scaled by a step size of 0.002, and record the new value.
thetas.append(
    thetas[-1]
    - 0.002 * grad_mse(theta=thetas[-1], X=sensor_data["PAS"], y=sensor_data["AQS"])
)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Fit AQS as a linear function of the single PAS feature.
y = sensor_data["AQS"]
X = sensor_data[["PAS"]]
reg = LinearRegression()
reg.fit(X, y)

# Print the fitted equation as "intercept + coef(feature) + ...", then the MSE
# of the model's predictions on the same rows it was fit on.
print(
    "Model:",
    " + ".join(
        [
            f"{reg.intercept_:.2f}",
            *(f"{weight:.2f}({feature})" for weight, feature in zip(reg.coef_, X.columns)),
        ]
    ),
)
print("Error:", mean_squared_error(y_true=y, y_pred=reg.predict(X)))
plot_line(slope=reg.coef_[0], intercept=reg.intercept_)

Model: 3.18 + 0.40(PAS)
Error: 6.343125879204806

<seaborn.axisgrid.FacetGrid at 0x7c583d640850>

sns.lmplot(sensor_data, x="PAS", y="AQS")

<seaborn.axisgrid.FacetGrid at 0x7c583d6422f0>

# Same fit as before, now with humidity as a second feature next to PAS.
feature_columns = ["PAS", "humidity"]
X = sensor_data[feature_columns]
y = sensor_data["AQS"]
reg = LinearRegression()
reg.fit(X, y)

# Equation pieces: the intercept first, then one "coef(feature)" term per column.
equation_terms = [f"{reg.intercept_:.2f}"]
for weight, feature in zip(reg.coef_, X.columns):
    equation_terms.append(f"{weight:.2f}({feature})")
print("Model:", " + ".join(equation_terms))
print("Error:", mean_squared_error(y, reg.predict(X)))
# NOTE(review): only the PAS coefficient and the intercept are passed to
# plot_line, so the humidity term is not part of the line that gets drawn.
plot_line(reg.coef_[0], reg.intercept_)

Model: 6.25 + 0.43(PAS) + -0.07(humidity)
Error: 5.144725853340175

<seaborn.axisgrid.FacetGrid at 0x7c583d6dc4f0>

homes = pd.read_csv("homes.csv")
homes

# How does it work?
for bed_count in homes["beds"].unique():
    # try dividing the dataset by that bed count
    pass  # placeholder: a comment alone is not a loop body, so the cell would not parse

array([ 2. ,  1. ,  0. ,  3. ,  4. ,  5. , 10. ,  7. ,  8. ,  0.5,  6. ])

from sklearn.tree import DecisionTreeClassifier, plot_tree
# The label is the city; the features are every other column of `homes`.
y = homes["city"]
X = homes.drop(columns="city")
# Limit the tree to two levels of splits.
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X, y)

import matplotlib.pyplot as plt
# Draw the fitted classifier on a high-resolution figure.
plt.figure(dpi=300)
plot_tree(
    clf,
    feature_names=X.columns,
    # Read the labels from the fitted model so they always follow its class
    # order instead of relying on a hand-written list staying in sync.
    class_names=list(clf.classes_),
    label="root",
    filled=True,
    impurity=False,
    proportion=True,
    rounded=False
) and None # Hide return value of plot_tree

from sklearn.tree import DecisionTreeRegressor

# Predict price from every column except price itself and the city string.
reg = DecisionTreeRegressor()
reg.fit(homes.drop(columns=["price", "city"]), homes["price"])
reg

DecisionTreeRegressor()

DecisionTreeRegressor()

# Draw the top levels of the fitted price regressor on a high-resolution figure.
plt.figure(dpi=300)
plot_tree(
    reg,
    # `reg` was fit on homes without "price" and "city". `X` was built for the
    # classifier and still contains "price", so its columns would shift every
    # label after "bath" by one. Use the regressor's own training columns.
    feature_names=homes.drop(["price", "city"], axis=1).columns,
    label="root",
    filled=True,
    impurity=False,
    proportion=True,
    rounded=False,
    max_depth=3
) and None # Hide return value of plot_tree

# City is a string, so it cannot be used as a feature directly. Replace it with
# one indicator column per city value, placed next to the remaining features.
pd.concat(
    objs=[
        homes.drop(columns=["price", "city"]),
        pd.get_dummies(homes["city"]),
    ],
    axis="columns",
)

	beds	bath	price	year_built	sqft	price_per_sqft	elevation	city
0	2.0	1.0	999000	1960	1000	999	10	NY
1	2.0	2.0	2750000	2006	1418	1939	0	NY
2	2.0	2.0	1350000	1900	2150	628	9	NY
3	1.0	1.0	629000	1903	500	1258	9	NY
4	0.0	1.0	439000	1930	500	878	10	NY
...	...	...	...	...	...	...	...	...
487	5.0	2.5	1800000	1890	3073	586	76	SF
488	2.0	1.0	695000	1923	1045	665	106	SF
489	3.0	2.0	1650000	1922	1483	1113	106	SF
490	1.0	1.0	649000	1983	850	764	163	SF
491	3.0	2.0	995000	1956	1305	762	216	SF

Learning Algorithms

Guessing game

Gradient descent

Linear regression models

Classification versus regression

	AQS	temp	humidity	dew	PAS
0	6.7	18.027263	38.564815	3.629662	8.616954
1	3.8	16.115280	49.404315	5.442318	3.493916
2	4.0	19.897634	29.972222	1.734051	3.799601
3	4.7	21.378334	32.474513	4.165624	4.369691
4	3.2	18.443822	43.898226	5.867611	3.191071
...	...	...	...	...	...
12092	5.5	-12.101337	54.188889	-19.555834	2.386120
12093	16.8	4.159967	56.256030	-3.870659	32.444987
12094	15.6	1.707895	65.779221	-4.083768	25.297018
12095	14.0	-14.380144	48.206481	-23.015378	8.213208
12096	5.8	5.081813	52.200000	-4.016401	9.436011

	beds	bath	year_built	sqft	price_per_sqft	elevation	NY	SF
0	2.0	1.0	1960	1000	999	10	True	False
1	2.0	2.0	2006	1418	1939	0	True	False
2	2.0	2.0	1900	2150	628	9	True	False
3	1.0	1.0	1903	500	1258	9	True	False
4	0.0	1.0	1930	500	878	10	True	False
...	...	...	...	...	...	...	...	...
487	5.0	2.5	1890	3073	586	76	False	True
488	2.0	1.0	1923	1045	665	106	False	True
489	3.0	2.0	1922	1483	1113	106	False	True
490	1.0	1.0	1983	850	764	163	False	True
491	3.0	2.0	1956	1305	762	216	False	True