This notebook explores the backpropagation algorithm and the use of PyTorch for neural networks.
Last updated by Ethan Chau, November 2020.
!pip install torchviz
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchviz import make_dot
Successfully installed torchviz-0.0.1
This section visualizes the backpropagation algorithm as it occurs in PyTorch. We begin with an example of the computation graph for simple functions, then apply the same machinery to a neural network with more complicated derivatives.
def f(x):
return x ** 2
def f_prime_analytical(x):
return 2 * x
x = torch.tensor([3.5], requires_grad=True)
y = f(x)
y
tensor([12.2500], grad_fn=<PowBackward0>)
make_dot(y, params={'x': x, 'y': y})
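We can also walk the same graph programmatically: every tensor produced by an operation carries a grad_fn, and its next_functions point back toward the inputs. A small sketch (not part of the original notebook):
print(y.grad_fn)                 # the power op that produced y
print(y.grad_fn.next_functions)  # leads back to the leaf tensor x (an AccumulateGrad node)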
y.backward()
x.grad, f_prime_analytical(x)
(tensor([7.]), tensor([7.], grad_fn=<MulBackward0>))
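One detail worth remembering (a small sketch, not in the original notebook; the name x2 is just for illustration): .grad accumulates across calls to backward(), which is why training loops zero gradients before each step.
x2 = torch.tensor([3.5], requires_grad=True)
f(x2).backward()
f(x2).backward()   # each call builds a fresh graph; gradients are summed into x2.grad
print(x2.grad)     # tensor([14.]), i.e. 7 + 7
x2.grad.zero_()    # reset before accumulating again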
Side note: if we don't want gradients, we can switch them off with the torch.no_grad() context manager.
with torch.no_grad():
no_grad_y = f_prime_analytical(x)
no_grad_y
tensor([7.])
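Another way to get a graph-free tensor (a quick sketch, not in the original notebook) is .detach(), which returns a tensor that shares the same data but is cut off from the autograd graph.
detached_y = f(x).detach()
print(detached_y.requires_grad)  # False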
Recall the following function: $$f(x, y) = \frac{6 \exp (-y)}{1 + x^2 + y^2} + 2x^3$$ Let's see how its computation graph looks in action.
First, we'll see that breaking up the computation yields the same results.
# f(x, y) implemented as a one-liner
def f_all_in_one(x, y):
return 2 * (x ** 3) + (6 * torch.exp(-y)) / (1 + x ** 2 + y ** 2)
# Declare the variables here so that we can use them later on
z_1 = None
z_2 = None
z_3 = None
# f(x, y) implemented using intermediate variables
def f_piece_by_piece(x, y):
global z_1, z_2, z_3
z_1 = torch.exp(-y)
z_1.retain_grad()
z_2 = 1 + x ** 2 + y ** 2
z_2.retain_grad()
z_3 = x ** 3
z_3.retain_grad()
z_4 = 6 * z_1 / z_2 + 2 * z_3
z_4.retain_grad()
return z_4
x = torch.randn(1)
y = torch.randn(1)
x, y
(tensor([0.9829]), tensor([1.8251]))
with torch.no_grad():
# Make sure that the all-in-one closed form solution and the piece-by-piece solution are equal
print(f_all_in_one(x, y), f_piece_by_piece(x, y))
tensor([2.0817]) tensor([2.0817])
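Rather than eyeballing the printout, we could also check the equivalence programmatically; a minimal sketch:
with torch.no_grad():
    print(torch.allclose(f_all_in_one(x, y), f_piece_by_piece(x, y)))  # True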
Now, let's visualize what the computation graph looks like.
x = torch.tensor([2.], requires_grad=True)
y = torch.tensor([4.], requires_grad=True)
x, y
(tensor([2.], requires_grad=True), tensor([4.], requires_grad=True))
fxy_piece = f_piece_by_piece(x, y)
make_dot(fxy_piece, params={'x': x, 'y': y, 'z_1': z_1})
Finally, we'll see that the intermediate gradients PyTorch computes for each variable can be chained by hand to reproduce the autograd result, which in turn matches the closed-form gradients: $$\nabla_x f(x, y) = 6x^2 - \frac{12x \exp(-y)}{(x^2 + y^2 + 1)^2}$$ $$\nabla_y f(x, y) = - \frac{6 \exp (-y) (x^2 + (y + 1)^2)}{(x^2 + y^2 + 1)^2}$$
# Closed-form definitions of gradients
def dfdx_analytical(x, y):
return 6 * (x ** 2) - (12 * x * torch.exp(-y)) / ((x ** 2 + y ** 2 + 1) ** 2)
def dfdy_analytical(x, y):
return - (6 * torch.exp(-y) * (x ** 2 + (y + 1) ** 2)) / ((x ** 2 + y ** 2 + 1) ** 2)
fxy_piece.backward()
with torch.no_grad():
dz4z3 = torch.FloatTensor([2])
dz4z2 = -6 * z_1 / (z_2 ** 2)
dz4z1 = 6 / z_2
dz3x = 3 * (x ** 2)
dz2x = 2 * x
dz2y = 2 * y
dz1y = -torch.exp(-y)
dz4x = z_3.grad * dz3x + z_2.grad * dz2x
dz4y = z_2.grad * dz2y + z_1.grad * dz1y
print("Variable\tPyTorch\t\t\tAutodiff\t\tAnalytical")
print(f"dz4/dz3\t\t{z_3.grad}\t\t{dz4z3}")
print(f"dz4/dz2\t\t{z_2.grad}\t{dz4z2}")
print(f"dz4/dz1\t\t{z_1.grad}\t{dz4z1}")
print(f"dz4/dx\t\t{x.grad}\t{dz4x}\t{dfdx_analytical(x, y)}")
print(f"dz4/dy\t\t{y.grad}\t{dz4y}\t{dfdy_analytical(x, y)}")
Variable	PyTorch			Autodiff		Analytical
dz4/dz3		tensor([2.])		tensor([2.])
dz4/dz2		tensor([-0.0002])	tensor([-0.0002])
dz4/dz1		tensor([0.2857])	tensor([0.2857])
dz4/dx		tensor([23.9990])	tensor([23.9990])	tensor([23.9990])
dz4/dy		tensor([-0.0072])	tensor([-0.0072])	tensor([-0.0072])
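As one more sanity check (not in the original notebook), torch.autograd.gradcheck compares autograd's gradients against finite differences; it expects double-precision inputs with requires_grad=True.
from torch.autograd import gradcheck
x64 = torch.randn(1, dtype=torch.double, requires_grad=True)
y64 = torch.randn(1, dtype=torch.double, requires_grad=True)
print(gradcheck(f_all_in_one, (x64, y64)))  # True if the gradients agree numerically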
Let's implement the neural network from class:
Note that we can write this as follows (why?): $$h_\theta (x) = g(b_2 + W_2^T g(b_1 + W_1^T x))$$ where $g$ is the activation function, $x \in \mathbb{R}^3$, $W_1 \in \mathbb{R}^{3 \times 3}$, $W_2 \in \mathbb{R}^{3 \times 1}$, $b_1 \in \mathbb{R}^3$, and $b_2 \in \mathbb{R}$.
model = nn.Sequential(OrderedDict([
('linear1', nn.Linear(3, 3)),
('sig1', nn.Sigmoid()),
('linear2', nn.Linear(3, 1)),
('sig2', nn.Sigmoid())
]))
model.requires_grad_()
model
Sequential(
  (linear1): Linear(in_features=3, out_features=3, bias=True)
  (sig1): Sigmoid()
  (linear2): Linear(in_features=3, out_features=1, bias=True)
  (sig2): Sigmoid()
)
# Not requiring grad here! We don't need to change our inputs.
x = torch.randn(3)
x
tensor([-0.1691, -0.2432, 0.2890])
y = model(x)
make_dot(y, params=dict([('x', x)] + list(model.named_parameters())))
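To connect the formula above to the module, here is a sketch (not in the original notebook) of the same forward pass computed directly from the layer parameters. Note that nn.Linear stores its weight with shape (out_features, in_features), so model.linear1.weight plays the role of $W_1^T$.
with torch.no_grad():
    h = torch.sigmoid(model.linear1.weight @ x + model.linear1.bias)         # g(b_1 + W_1^T x)
    y_manual = torch.sigmoid(model.linear2.weight @ h + model.linear2.bias)  # g(b_2 + W_2^T h)
print(y_manual, y)  # the two outputs should match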
Now, let's backpropagate the gradients through our graph, automagically.
y.backward()
for name, param in model.named_parameters():
print(name)
print(param.grad)
print(param.grad.shape == param.shape)
print('\n')
print(f"x.grad: {x.grad}")
linear1.weight
tensor([[-0.0011,  0.0057, -0.0029],
        [ 0.0043, -0.0191,  0.0092],
        [ 0.0085, -0.0370,  0.0177]])
True

linear1.bias
tensor([-0.0079,  0.0246,  0.0473])
True

linear2.weight
tensor([[0.3100, 0.2712, 0.1890]])
True

linear2.bias
tensor([0.4511])
True

x.grad: None
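x.grad is None precisely because we created x without requires_grad=True. If we did want input gradients, a sketch (the name x_req is hypothetical):
x_req = torch.randn(3, requires_grad=True)
model(x_req).backward()  # note: this also accumulates more gradient into the model's parameters
print(x_req.grad)        # now populated with the gradient of the output w.r.t. the input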
This section demonstrates how we can train a neural network from scratch with PyTorch.
It is copied from the PyTorch tutorial for compactness.
First, let's generate some data based on the function: $$y(x) = 4 \sin(x \pi) \cos(6\pi x^2)$$ for random values of $x$.
%matplotlib inline
d = 1
n = 200
X = torch.rand(n,d)
y = 4 * torch.sin(np.pi * X) * torch.cos(6*np.pi*X**2)
plt.scatter(X.numpy(), y.numpy())
plt.title('plot of $f(x)$')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.show()
Here we define a simple two-hidden-layer neural network with Tanh activations. There are a few hyperparameters to play with to get a feel for how they change the results.
# feel free to play with these parameters
step_size = 0.05
n_epochs = 6000
n_hidden_1 = 32
n_hidden_2 = 32
d_out = 1
neural_network = nn.Sequential(
nn.Linear(d, n_hidden_1),
nn.Tanh(),
nn.Linear(n_hidden_1, n_hidden_2),
nn.Tanh(),
nn.Linear(n_hidden_2, d_out)
)
loss_func = nn.MSELoss()
optim = torch.optim.SGD(neural_network.parameters(), lr=step_size)
print('iter,\tloss')
for i in range(n_epochs):
y_hat = neural_network(X)
loss = loss_func(y_hat, y)
optim.zero_grad()
loss.backward()
optim.step()
if i % (n_epochs // 10) == 0:
print('{},\t{:.2f}'.format(i, loss.item()))
iter,	loss
0,	3.53
600,	3.24
1200,	1.68
1800,	1.21
2400,	0.95
3000,	0.63
3600,	0.42
4200,	0.18
4800,	0.13
5400,	0.12
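The loop above takes full-batch gradient steps on all 200 points at once. For larger datasets one would typically sample mini-batches instead; a sketch of how the loop would change (names hypothetical; running it would continue training the model above):
from torch.utils.data import TensorDataset, DataLoader

loader = DataLoader(TensorDataset(X, y), batch_size=32, shuffle=True)
for epoch in range(n_epochs):
    for X_batch, y_batch in loader:
        loss = loss_func(neural_network(X_batch), y_batch)
        optim.zero_grad()
        loss.backward()
        optim.step()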
X_grid = torch.from_numpy(np.linspace(0,1,50)).float().view(-1, d)
y_hat = neural_network(X_grid)
plt.scatter(X.numpy(), y.numpy())
plt.plot(X_grid.detach().numpy(), y_hat.detach().numpy(), 'r')
plt.title(r'plot of $f(x)$ and $\hat{f}(x)$')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.show()
So far, we have been considering regression tasks and have used the MSELoss module. For the homework, we will be performing a classification task and will use the cross entropy loss.
PyTorch implements the cross entropy loss in a single module, CrossEntropyLoss, which takes raw (unnormalized) scores rather than probabilities. Its usage is slightly different from MSELoss, so we will break it down here.
loss = nn.CrossEntropyLoss()
input = torch.tensor([[-1., 1],[-1, 1],[1, -1]]) # raw scores correspond to the correct class
# input = torch.tensor([[-3., 3],[-3, 3],[3, -3]]) # raw scores correspond to the correct class with higher confidence
# input = torch.tensor([[1., -1],[1, -1],[-1, 1]]) # raw scores correspond to the incorrect class
# input = torch.tensor([[3., -3],[3, -3],[-3, 3]]) # raw scores correspond to the incorrect class with incorrectly placed confidence
target = torch.tensor([1, 1, 0])
output = loss(input, target)
print(output)
tensor(0.1269)
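Under the hood, CrossEntropyLoss applies a log-softmax to the raw scores and then takes the (mean) negative log-likelihood of the target classes. We can reproduce the value above with a quick sketch:
log_probs = F.log_softmax(input, dim=1)  # raw scores -> log-probabilities
manual = F.nll_loss(log_probs, target)   # mean negative log-likelihood of the targets
print(manual)                            # tensor(0.1269), matching CrossEntropyLoss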