import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error

# Earthquake data: read the CSV, index it by the `id` column, then attach a
# point geometry built from each row's longitude/latitude pair.
earthquakes = pd.read_csv("earthquakes.csv").set_index("id")
quake_points = gpd.points_from_xy(
    earthquakes["longitude"],
    earthquakes["latitude"],
    # EPSG:4326 is the WGS84 (GPS) coordinate system, see https://epsg.io/4326
    crs="EPSG:4326",
)
earthquakes = gpd.GeoDataFrame(earthquakes, geometry=quake_points)
# Store `month` with a categorical dtype.
earthquakes["month"] = earthquakes["month"].astype("category")

# Country data: index the shapefile by the NAME field and keep only the
# attribute columns listed below together with the geometry.
columns = ["POP_EST", "GDP_MD", "CONTINENT", "SUBREGION", "geometry"]
world = gpd.read_file("ne_110m_admin_0_countries.shp")
countries = world.set_index("NAME")[columns]

# Bare expression: displays the table when it is the last line of a notebook cell.
earthquakes

# Draw the countries as a light grey backdrop, then overlay every earthquake
# as a small marker coloured by its magnitude (with a colour legend).
fig, ax = plt.subplots(figsize=(13, 5))
countries.plot(color="#EEE", ax=ax)
earthquakes.plot(
    column="magnitude",
    markersize=0.1,
    legend=True,
    ax=ax,
)
ax.set_title("Earthquakes between July 27, 2016 and August 25, 2016")
# Hide the axis frame and ticks so only the map is shown.
ax.set_axis_off()

# Split the Earthquake data into the training, testing sets.
# Features are the raw coordinates; the target is the recorded magnitude.
X = earthquakes[["longitude", "latitude"]]
y = earthquakes["magnitude"]
# A fixed random_state makes the 80/20 split, and therefore the error printed
# below, reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the model on the training rows, then score it on the held-out rows.
regr_coords = LinearRegression().fit(X_train, y_train)
y_predict = regr_coords.predict(X_test)
print(f"Mean squared error {mean_squared_error(y_test, y_predict)}")



# Do not alter
# Diagnostic plot for the linear model: observed test magnitudes on the x-axis
# against the model's predictions on the y-axis, with integer ticks 0 through 7
# on both axes.
grid = sns.relplot(x=y_test, y=y_predict)
grid.set(title="Predicted Magnitude v. Observed Magnitude",
       xlabel="Observed Magnitude (test data)",
       ylabel="Predicted Magnitude (predictions)",
       yticks=list(range(0, 8)), xticks=list(range(0, 8)))
# Dashed black reference line through the origin with slope 1: a point lies on
# it exactly when the predicted magnitude equals the observed magnitude.
grid.ax.axline((0, 0), slope=1, color='k', ls='--')

Mean squared error 0.7892207373455713

<matplotlib.lines.AxLine at 0x7e599a3687d0>

# Create "dummy" variables for categorical features.
# pd.get_dummies one-hot encodes the non-numeric columns (here `name` and
# `month`) and passes the numeric coordinate columns through unchanged.
X = earthquakes[["longitude", "latitude", "name", "month"]]
Y = earthquakes["magnitude"]
X = pd.get_dummies(X)

# Split the Earthquake data into the training, testing sets.
# A fixed random_state keeps the 80/20 split reproducible, so the reported
# error can be compared across runs rather than varying with the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit the model.
# The tree is seeded as well: scikit-learn permutes candidate features while
# searching for splits, so an unseeded tree can differ between runs even on
# identical training data.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(f"Mean squared error {mean_squared_error(y_test, predictions)}")


# Do not alter
# Diagnostic plot for the decision tree: observed test magnitudes on the x-axis
# against the tree's predictions on the y-axis, with integer ticks 0 through 7
# on both axes.
grid = sns.relplot(x=y_test, y=predictions)
grid.set(title="Observed Magnitude v. Predicted Magnitude",
       xlabel='Observed Magnitude (test data)',
       ylabel='Predicted Magnitude (predictions)',
       yticks=list(range(0, 8)), xticks=list(range(0, 8)))
# Dashed black reference line through the origin with slope 1: a point lies on
# it exactly when the predicted magnitude equals the observed magnitude.
grid.ax.axline((0, 0), slope=1, color='k', ls='--')

Mean squared error 0.4364461865528423

<matplotlib.lines.AxLine at 0x7e599a258690>

# Render the top of the fitted decision tree (root plus two levels below it)
# on a high-resolution figure.
plt.figure(dpi=300)
# Assigning the return value (a list of matplotlib artists) keeps a notebook
# from echoing it beneath the figure.
tree_annotations = plot_tree(
    model,
    # Pass a plain list rather than a pandas Index: some scikit-learn
    # releases validate `feature_names` strictly as a list and reject an Index.
    feature_names=list(X.columns),
    label="root",
    filled=True,
    impurity=False,
    proportion=True,
    rounded=False,
    max_depth=2,
    fontsize=5,
)

	year	month	day	latitude	longitude	name	magnitude	geometry
id
nc72666881	2016	7	27	37.672333	-121.619000	California	1.43	POINT (-121.619 37.67233)
us20006i0y	2016	7	27	21.514600	94.572100	Burma	4.90	POINT (94.5721 21.5146)
nc72666891	2016	7	27	37.576500	-118.859167	California	0.06	POINT (-118.85917 37.5765)
nc72666896	2016	7	27	37.595833	-118.994833	California	0.40	POINT (-118.99483 37.59583)
nn00553447	2016	7	27	39.377500	-119.845000	Nevada	0.30	POINT (-119.845 39.3775)
...	...	...	...	...	...	...	...	...
nc72685246	2016	8	25	36.515499	-121.099831	California	2.42	POINT (-121.09983 36.5155)
ak13879193	2016	8	25	61.498400	-149.862700	Alaska	1.40	POINT (-149.8627 61.4984)
nc72685251	2016	8	25	38.805000	-122.821503	California	1.06	POINT (-122.8215 38.805)
ci37672328	2016	8	25	34.308000	-118.635333	California	1.55	POINT (-118.63533 34.308)
ci37672360	2016	8	25	34.119167	-116.933667	California	0.89	POINT (-116.93367 34.11917)

Machine Learning Practice

Dataset: Earthquakes and Countries

Predicting Earthquake Magnitude

Iteration 1: Using `longitude` and `latitude`

Linear Regression Using Longitude and Latitude

Iteration 2: Using `longitude`, `latitude`, `name`, and `month`

Decision Tree Regression Using Longitude, Latitude, Name, and Month

Debrief

Machine Learning Practice

Dataset: Earthquakes and Countries

Predicting Earthquake Magnitude

Iteration 1: Using longitude and latitude

Linear Regression Using Longitude and Latitude

Iteration 2: Using longitude, latitude, name, and month

Decision Tree Regression Using Longitude, Latitude, Name, and Month

Debrief

Iteration 1: Using `longitude` and `latitude`

Iteration 2: Using `longitude`, `latitude`, `name`, and `month`