import torch
import torch.nn as nn
import torch.nn.functional as F

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

import numpy as np

# Seed NumPy's and PyTorch's global random number generators with the same
# fixed value so the random draws made by the code below are repeatable.
np.random.seed(446)
torch.manual_seed(446)


def visualize_fun(w, title, num_pts=20):
    """
    Draw the plane ``y = x @ w`` over [-2, 2] x [-2, 2] together with the dataset.

    Arguments:
        w -- weight tensor that the (num_pts**2, 2) grid tensor is multiplied with
            (``X_plane @ w``); the product must contain num_pts**2 values.
        title (str) -- Title shown above the plot.
        num_pts (int) -- Number of grid points along each axis.

    Note:
        The scatter points are read from the module-level tensors ``X`` and ``y``,
        so those must be defined before this function is called.
    """
    # Regular grid over the square [-2, 2] x [-2, 2]
    grid = np.linspace(-2, 2, num_pts)
    x1, x2 = np.meshgrid(grid, grid)

    # Flatten the grid into a (num_pts**2, 2) design matrix, evaluate the plane
    # on it and fold the result back into the grid shape for plot_surface.
    X_plane = torch.tensor(np.stack([x1.ravel(), x2.ravel()], axis=1)).float()
    y_plane = (X_plane @ w).detach().numpy().reshape(num_pts, num_pts)

    # Figure.gca(projection=...) was deprecated in Matplotlib 3.4 and removed in
    # 3.6; add_subplot(projection='3d') is the supported way to get 3D axes.
    ax = plt.figure().add_subplot(projection='3d')
    ax.plot_surface(x1, x2, y_plane, alpha=0.2)

    # Overlay the data points (taken from the global X and y) in red
    ax.scatter(X[:, 0].numpy(), X[:, 1].numpy(), y.numpy(), c='r', marker='o')

    ax.set_xlabel('$X_1$')
    ax.set_ylabel('$X_2$')
    ax.set_zlabel('$Y$')

    ax.set_title(title)
    plt.show()


# Build a small linear-regression dataset: y = X @ true_w + Gaussian noise.

d = 2   # number of input features
n = 50  # number of samples
X = torch.randn(n, d)
true_w = torch.tensor([[-1.0], [2.0]])
# The noise term is standard normal noise scaled down by a factor of 0.1
y = torch.matmul(X, true_w) + torch.randn(n, 1) * 0.1
print(f'X shape {X.shape}')
print(f'y shape {y.shape}')
print(f'w shape {true_w.shape}')

X shape torch.Size([50, 2])
y shape torch.Size([50, 1])
w shape torch.Size([2, 1])


d_in, d_out = 3, 4
linear_module = nn.Linear(in_features=d_in, out_features=d_out)

# A batch of two 3-dimensional inputs
example_tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
# applies the module's linear transformation to the data
transformed = linear_module(example_tensor)
print(f'example_tensor {example_tensor.shape}')
print(f'transormed {transformed.shape}')
print()
# The module owns its weight matrix and bias vector as learnable parameters
print('We can see that the weights exist in the background\n')
print('W:', linear_module.weight)
print('b:', linear_module.bias)

example_tensor torch.Size([2, 3])
transormed torch.Size([2, 4])

We can see that the weights exist in the background

W: Parameter containing:
tensor([[ 0.3270,  0.2183,  0.2269],
        [-0.5094, -0.4306,  0.2483],
        [-0.0776, -0.5372,  0.0966],
        [-0.1610,  0.2270, -0.0063]], requires_grad=True)
b: Parameter containing:
tensor([ 0.1384, -0.1959, -0.2587,  0.0353], requires_grad=True)


# nn.ReLU is a module: create an instance once, then call it like a function
activation_fn = nn.ReLU()
example_tensor = torch.tensor([-1.0, 1.0, 0.0])
activated = activation_fn(example_tensor)
print(f'example_tensor {example_tensor}')
print(f'activated {activated}')

example_tensor tensor([-1.,  1.,  0.])
activated tensor([0., 1., 0.])


d_in, d_hidden, d_out = 3, 4, 1

# Chain the layers: linear -> tanh -> linear -> sigmoid.
# Sequential feeds the output of each module into the next one.
model = nn.Sequential(
    nn.Linear(d_in, d_hidden),
    nn.Tanh(),
    nn.Linear(d_hidden, d_out),
    nn.Sigmoid(),
)

# A batch of two 3-dimensional inputs
example_tensor = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
transformed = model(example_tensor)
print(f'transformed {transformed.shape}')

transformed torch.Size([2, 1])


# model.parameters() yields every learnable tensor of the model; for the
# Sequential built above that is the weight and bias of each Linear layer.
params = model.parameters()

# Print each parameter tensor in turn
for param in params:
    print(param)

Parameter containing:
tensor([[ 0.5478, -0.5734,  0.2589],
        [ 0.5739, -0.4392, -0.0377],
        [ 0.2290,  0.0529,  0.4021],
        [ 0.3153, -0.4802,  0.3067]], requires_grad=True)
Parameter containing:
tensor([ 0.4905,  0.3743,  0.4069, -0.2514], requires_grad=True)
Parameter containing:
tensor([[-0.1443, -0.1406,  0.0414, -0.4699]], requires_grad=True)
Parameter containing:
tensor([0.2149], requires_grad=True)


# Mean squared error between a prediction of all zeros and the target
mse_loss_fn = nn.MSELoss()

input = torch.zeros(1, 3)
target = torch.tensor([[1.0, 0.0, -1.0]])

# (1^2 + 0^2 + (-1)^2) / 3 = 2/3
loss = mse_loss_fn(input, target)

print(loss)

tensor(0.6667)


# A one-input, one-output linear model
model = nn.Linear(in_features=1, out_features=1)

# A dataset consisting of a single (x, y) pair
X_simple = torch.tensor([[1.0]])
y_simple = torch.tensor([[2.0]])

# The loss to minimise and the optimizer that updates the model's parameters
mse_loss_fn = nn.MSELoss()
optim = torch.optim.SGD(model.parameters(), lr=0.01)

# One full optimisation step: forward pass, loss, clear stale gradients,
# backpropagate, then let the optimizer apply the update.
y_hat = model(X_simple)
print('model params before:', model.weight)
loss = mse_loss_fn(y_hat, y_simple)
optim.zero_grad()
loss.backward()
optim.step()
print('model params after:', model.weight)

model params before: Parameter containing:
tensor([[-0.3881]], requires_grad=True)
model params after: Parameter containing:
tensor([[-0.3603]], requires_grad=True)


step_size = 0.1

# Linear model without a bias term, trained with full-batch gradient descent
linear_module = nn.Linear(d, 1, bias=False)
loss_func = nn.MSELoss()
optim = torch.optim.SGD(linear_module.parameters(), lr=step_size)

print('iter,\tloss,\tw')

for i in range(20):
    # Every iteration uses the whole dataset
    y_hat = linear_module(X)
    loss = loss_func(y_hat, y)
    optim.zero_grad()
    loss.backward()
    optim.step()

    # Report the loss computed before this step and the weights after it
    print(f'{i},\t{loss.item():.2f},\t{linear_module.weight.view(2).detach().numpy()}')

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', linear_module.weight.view(2).detach().numpy())

iter,	loss,	w
0,	3.45,	[-0.6277163  0.5246437]
1,	2.23,	[-0.6903032   0.81286395]
2,	1.45,	[-0.74182737  1.0444099 ]
3,	0.94,	[-0.7842876  1.2304163]
4,	0.61,	[-0.8193146  1.3798317]
5,	0.40,	[-0.8482402  1.4998472]
6,	0.26,	[-0.8721527  1.5962417]
7,	0.17,	[-0.8919422  1.6736592]
8,	0.11,	[-0.9083374  1.7358311]
9,	0.08,	[-0.92193544  1.785756  ]
10,	0.05,	[-0.93322587  1.825843  ]
11,	0.04,	[-0.9426106  1.8580279]
12,	0.03,	[-0.95041996  1.8838661 ]
13,	0.02,	[-0.95692545  1.9046069 ]
14,	0.02,	[-0.9623507  1.9212543]
15,	0.01,	[-0.9668801  1.9346145]
16,	0.01,	[-0.9706655  1.9453354]
17,	0.01,	[-0.97383255  1.953937  ]
18,	0.01,	[-0.976485   1.9608375]
19,	0.01,	[-0.9787088  1.9663724]

true w		 [-1.  2.]
estimated w	 [-0.9787088  1.9663724]


# nn.Linear stores its weight as (out_features, in_features) = (1, d); transpose
# it to a (d, 1) column so visualize_fun can compute X_plane @ w.
visualize_fun(linear_module.weight.t(), 'Dataset with learned $w$ (PyTorch GD)')


step_size = 0.01

# Same regression problem, now trained one randomly chosen sample at a time
linear_module = nn.Linear(d, 1)
loss_func = nn.MSELoss()
optim = torch.optim.SGD(linear_module.parameters(), lr=step_size)

print('iter,\tloss,\tw')
for i in range(200):
    # pick the index of a single random point from the dataset
    rand_idx = np.random.choice(n)
    x = X[rand_idx]
    y_hat = linear_module(x)
    # the loss (and therefore the gradient) only sees that single point
    loss = loss_func(y_hat, y[rand_idx])
    optim.zero_grad()
    loss.backward()
    optim.step()

    # Only report every 20th iteration
    if i % 20 != 0:
        continue
    print(f'{i},\t{loss.item():.2f},\t{linear_module.weight.view(2).detach().numpy()}')

print('\ntrue w\t\t', true_w.view(2).numpy())
print('estimated w\t', linear_module.weight.view(2).detach().numpy())

iter,	loss,	w
0,	0.01,	[0.02993712 0.40586257]
20,	7.11,	[-0.21203727  0.6746018 ]
40,	0.13,	[-0.5276561  1.248206 ]
60,	0.00,	[-0.64190584  1.4578575 ]
80,	0.01,	[-0.7105578  1.644668 ]
100,	0.01,	[-0.84861267  1.8285881 ]
120,	0.00,	[-0.884992   1.8483113]
140,	0.00,	[-0.8915096  1.8892854]
160,	0.01,	[-0.9286745  1.9184113]
180,	0.00,	[-0.93960834  1.9385221 ]

true w		 [-1.  2.]
estimated w	 [-0.94389266  1.9486643 ]


# Transpose the (1, d) weight of the SGD-trained module into a (d, 1) column
# and plot the plane it defines against the data.
visualize_fun(linear_module.weight.t(), 'Dataset with learned $w$ (PyTorch SGD)')


%matplotlib inline

# One-dimensional inputs drawn uniformly from [0, 1) and a non-linear target
d, n = 1, 200
X = torch.rand(n, d)
y = 4 * torch.sin(np.pi * X) * torch.cos(6 * np.pi * X ** 2)

# Scatter plot of the target function at the sampled inputs
plt.scatter(X.numpy(), y.numpy())
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.title('plot of $f(x)$')

plt.show()


# feel free to play with these parameters

step_size = 0.05    # learning rate passed to SGD
n_epochs = 6000     # number of full-batch gradient steps
n_hidden_1 = 32     # width of the first hidden layer
n_hidden_2 = 32     # width of the second hidden layer
d_out = 1           # output dimension

# Two hidden tanh layers followed by a linear output layer
neural_network = nn.Sequential(
    nn.Linear(d, n_hidden_1),
    nn.Tanh(),
    nn.Linear(n_hidden_1, n_hidden_2),
    nn.Tanh(),
    nn.Linear(n_hidden_2, d_out),
)

loss_func = nn.MSELoss()

optim = torch.optim.SGD(neural_network.parameters(), lr=step_size)
print('iter,\tloss')
for i in range(n_epochs):
    # Full-batch step: forward, loss, reset gradients, backward, update
    y_hat = neural_network(X)
    loss = loss_func(y_hat, y)
    optim.zero_grad()
    loss.backward()
    optim.step()

    # Report the loss ten times over the course of training
    if i % (n_epochs // 10) == 0:
        print(f'{i},\t{loss.item():.2f}')

iter,	loss
0,	3.49
600,	3.40
1200,	2.95
1800,	1.60
2400,	1.37
3000,	0.82
3600,	0.70
4200,	0.48
4800,	0.32
5400,	0.24


# Evaluate the trained network on 50 evenly spaced inputs in [0, 1]
X_grid = torch.from_numpy(np.linspace(0,1,50)).float().view(-1, d)
y_hat = neural_network(X_grid)
# Data as a scatter plot, network prediction as a red line on top
plt.scatter(X.numpy(), y.numpy())
plt.plot(X_grid.detach().numpy(), y_hat.detach().numpy(), 'r')
# Raw string: '\h' is not a valid escape sequence in a normal string literal
# and triggers a warning; the resulting text is unchanged.
plt.title(r'plot of $f(x)$ and $\hat{f}(x)$')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.show()


# feel free to play with these parameters

step_size = 0.05    # learning rate passed to SGD
momentum = 0.9      # momentum coefficient passed to SGD
n_epochs = 1500     # number of full-batch gradient steps
n_hidden_1 = 32     # width of the first hidden layer
n_hidden_2 = 32     # width of the second hidden layer
d_out = 1           # output dimension

# Same architecture as before: two hidden tanh layers and a linear output
neural_network = nn.Sequential(
    nn.Linear(d, n_hidden_1),
    nn.Tanh(),
    nn.Linear(n_hidden_1, n_hidden_2),
    nn.Tanh(),
    nn.Linear(n_hidden_2, d_out),
)

loss_func = nn.MSELoss()

# The only change to the optimizer is the additional momentum argument
optim = torch.optim.SGD(neural_network.parameters(), lr=step_size, momentum=momentum)
print('iter,\tloss')
for i in range(n_epochs):
    # Full-batch step: forward, loss, reset gradients, backward, update
    y_hat = neural_network(X)
    loss = loss_func(y_hat, y)
    optim.zero_grad()
    loss.backward()
    optim.step()

    # Report the loss ten times over the course of training
    if i % (n_epochs // 10) == 0:
        print(f'{i},\t{loss.item():.2f}')

iter,	loss
0,	3.47
150,	2.94
300,	0.83
450,	0.55
600,	0.12
750,	0.10
900,	0.06
1050,	0.03
1200,	0.00
1350,	0.00


# Evaluate the momentum-trained network on 50 evenly spaced inputs in [0, 1]
X_grid = torch.from_numpy(np.linspace(0,1,50)).float().view(-1, d)
y_hat = neural_network(X_grid)
# Data as a scatter plot, network prediction as a red line on top
plt.scatter(X.numpy(), y.numpy())
plt.plot(X_grid.detach().numpy(), y_hat.detach().numpy(), 'r')
# Raw string: '\h' is not a valid escape sequence in a normal string literal
# and triggers a warning; the resulting text is unchanged.
plt.title(r'plot of $f(x)$ and $\hat{f}(x)$')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.show()


# CrossEntropyLoss takes raw (unnormalised) class scores, one row per sample,
# together with the index of the correct class for each sample.
loss = nn.CrossEntropyLoss()

# Swap in one of the commented-out inputs below to see how the loss reacts.
input = torch.tensor([[-1.0, 1.0], [-1.0, 1.0], [1.0, -1.0]]) # raw scores correspond to the correct class
# input = torch.tensor([[-3., 3],[-3, 3],[3, -3]]) # raw scores correspond to the correct class with higher confidence
# input = torch.tensor([[1., -1],[1, -1],[-1, 1]]) # raw scores correspond to the incorrect class
# input = torch.tensor([[3., -3],[3, -3],[-3, 3]]) # raw scores correspond to the incorrect class with incorrectly placed confidence

# Correct class index for each of the three rows of `input`
target = torch.tensor([1, 1, 0])
output = loss(input, target)
print(output)

tensor(0.1269)


# An entire MNIST digit: 28 x 28 = 784 pixel intensities, flattened into a 1-D float32 array
image = np.array([0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.3803922 , 0.37647063, 0.3019608 ,0.46274513, 0.2392157 , 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.3529412 , 0.5411765 , 0.9215687 ,0.9215687 , 0.9215687 , 0.9215687 , 0.9215687 , 0.9215687 ,0.9843138 , 0.9843138 , 0.9725491 , 0.9960785 , 0.9607844 ,0.9215687 , 0.74509805, 0.08235294, 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.54901963,0.9843138 , 0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 ,0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 ,0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 ,0.7411765 , 0.09019608, 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.8862746 , 0.9960785 , 0.81568635,0.7803922 , 0.7803922 , 0.7803922 , 0.7803922 , 0.54509807,0.2392157 , 0.2392157 , 0.2392157 , 0.2392157 , 0.2392157 ,0.5019608 , 0.8705883 , 0.9960785 , 0.9960785 , 0.7411765 ,0.08235294, 0., 0., 0., 0.,0., 0., 0., 0., 0.,0.14901961, 0.32156864, 0.0509804 , 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.13333334,0.8352942 , 0.9960785 , 0.9960785 , 0.45098042, 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0.32941177, 0.9960785 ,0.9960785 , 0.9176471 , 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 
0.,0., 0., 0., 0., 0.,0., 0.32941177, 0.9960785 , 0.9960785 , 0.9176471 ,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0.4156863 , 0.6156863 ,0.9960785 , 0.9960785 , 0.95294124, 0.20000002, 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0.09803922, 0.45882356, 0.8941177 , 0.8941177 ,0.8941177 , 0.9921569 , 0.9960785 , 0.9960785 , 0.9960785 ,0.9960785 , 0.94117653, 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.26666668, 0.4666667 , 0.86274517,0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 ,0.9960785 , 0.9960785 , 0.9960785 , 0.9960785 , 0.5568628 ,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0.14509805, 0.73333335,0.9921569 , 0.9960785 , 0.9960785 , 0.9960785 , 0.8745099 ,0.8078432 , 0.8078432 , 0.29411766, 0.26666668, 0.8431373 ,0.9960785 , 0.9960785 , 0.45882356, 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0.4431373 , 0.8588236 , 0.9960785 , 0.9490197 , 0.89019614,0.45098042, 0.34901962, 0.12156864, 0., 0.,0., 0., 0.7843138 , 0.9960785 , 0.9450981 ,0.16078432, 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0.6627451 , 0.9960785 ,0.6901961 , 0.24313727, 0., 0., 0.,0., 0., 0., 0., 0.18823531,0.9058824 , 0.9960785 , 0.9176471 , 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0.07058824, 0.48627454, 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.32941177, 0.9960785 , 0.9960785 ,0.6509804 , 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0.54509807, 0.9960785 , 0.9333334 , 0.22352943, 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.8235295 , 0.9803922 , 0.9960785 ,0.65882355, 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0.9490197 , 0.9960785 , 0.93725497, 0.22352943, 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.34901962, 0.9843138 , 0.9450981 ,0.3372549 , 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 
0., 0.,0., 0., 0., 0., 0.01960784,0.8078432 , 0.96470594, 0.6156863 , 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0.01568628, 0.45882356, 0.27058825,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0., 0.,0., 0., 0., 0.], dtype=np.float32)
# Reshape the flat pixel array to (batch, channels, height, width) = (1, 1, 28, 28)
image_torch = torch.from_numpy(image).view(1, 1, 28, 28)

# a 3x3 gaussian blur kernel; dividing by 16 makes the entries sum to one
gaussian_kernel = torch.tensor([[1., 2, 1],[2, 4, 2],[1, 2, 1]]) / 16.0

conv = nn.Conv2d(1, 1, 3)
# manually set the conv parameters, in place and without recording the
# assignment in autograd
with torch.no_grad():
    # the (3, 3) kernel broadcasts into the (1, 1, 3, 3) weight tensor
    conv.weight.copy_(gaussian_kernel)
    # Conv2d also has a randomly initialised bias; zero it so the output is
    # the pure blur rather than the blur plus a random constant offset
    conv.bias.zero_()

convolved = conv(image_torch)

plt.title('original image')
plt.imshow(image_torch.view(28,28).detach().numpy())
plt.show()

# A 3x3 kernel without padding shrinks each side by 2 (28 -> 26); indexing the
# single batch entry and channel avoids hard-coding that size.
plt.title('blurred image')
plt.imshow(convolved[0, 0].detach().numpy())
plt.show()


im_channels = 3 # if we are working with RGB images, there are 3 input channels, with black and white, 1
out_channels = 16 # this is a hyperparameter we can tune
kernel_size = 3 # this is another hyperparameter we can tune
batch_size = 4
image_width = 32
image_height = 32

# nn.Conv2d expects input laid out as (batch, channels, height, width),
# so height comes before width.
im = torch.randn(batch_size, im_channels, image_height, image_width)

m = nn.Conv2d(im_channels, out_channels, kernel_size)
convolved = m(im) # it is a module so we can call it

# Without padding, a 3x3 kernel shrinks each spatial dimension by 2 (32 -> 30)
print('im shape', im.shape)
print('convolved im shape', convolved.shape)

im shape torch.Size([4, 3, 32, 32])
convolved im shape torch.Size([4, 16, 30, 30])


# Feel free to play with parameters
embedding_size = 3      # size of each word vector
num_unique_words = 10   # vocabulary size
sentence_length = 2     # number of time steps
batch_size = 1
hidden_size = 5         # size of the RNN hidden state

# Random word indices of shape (sentence_length, batch_size)
data = torch.randint(high=num_unique_words, size=(sentence_length, batch_size))

# The embedding maps each word index to a vector; the RNN then consumes the
# resulting sequence of vectors.
embedding = nn.Embedding(num_unique_words, embedding_size)
rnn_layer = nn.RNN(embedding_size, hidden_size)

print(f"Input Data shape: {data.shape}")
embedded_vec = embedding(data)
print(f"After Embedding shape: {embedded_vec.shape}")
# The RNN returns the output at every time step plus the final hidden state
result, hidden = rnn_layer(embedded_vec)
print(f"After RNN output shape: {result.shape}")  # (sequence length, batch size, hidden_size)
print(f"After RNN hidden shape: {hidden.shape}")  # (# layers, batch size, hidden_size)

Input Data shape: torch.Size([2, 1])
After Embedding shape: torch.Size([2, 1, 3])
After RNN output shape: torch.Size([2, 1, 5])
After RNN hidden shape: torch.Size([1, 1, 5])


def process_corpus(corpus, sentence_length):
    """
    Turn a text into fixed-length index sequences for next-word prediction.

    Arguments:
        corpus (str) -- Continuous text. Can be anything but should be relatively long.
        sentence_length (int) -- Size of each sentence in the output. Must be positive.
            The # of words in corpus does not have to be divisible by it,
            in which case the end will be padded.
    Returns:
        Tuple of size 4 containing:
            - Train Input - shape (batch, sentence) containing indexes of words for each sentence.
            - Train Truth - Same as Train Input but contains index of the next word in a given sentence.
            - Word to Index Dictionary - Dictionary for each word containing a corresponding integer.
            - Index to Word Dictionary - Reverse of Word to Index Dictionary.
        For a corpus without any words both tensors have shape (0, sentence_length)
        and the dictionaries contain only the padding entry.
    Raises:
        ValueError -- If sentence_length is smaller than 1.

    Example:
        process_corpus("Sam likes cats", 2) outputs:
            - [[1, 2], [3, 0]]
            - [[2, 3], [0, 0]]
            - {"": 0, "Sam": 1, "likes": 2, "cats": 3}
            - {0: "", 1: "Sam", 2: "likes", 3: "cats"}
    """
    if sentence_length < 1:
        raise ValueError("sentence_length must be a positive integer")

    # Let's make corpus a list of words
    words = corpus.split()
    # QUESTION: Should we also trim/lowercase the words here? Is "You," vs. "you" very different?

    # Nothing to split: return empty batches instead of failing on x[-1] below
    if not words:
        no_sentences = torch.empty((0, sentence_length), dtype=torch.long)
        return no_sentences, no_sentences.clone(), {"": 0}, {0: ""}

    # Then split it into smaller sentences of size sentence_length.
    # Since we are trying to predict the next word y's are just x's shifted by one
    starts = range(0, len(words), sentence_length)
    x = [words[start: start + sentence_length] for start in starts]
    y = [words[start + 1: start + sentence_length + 1] for start in starts]
    # Only the last sentences can be shorter. Let's pad them with the empty word
    x[-1] += [""] * (sentence_length - len(x[-1]))
    y[-1] += [""] * (sentence_length - len(y[-1]))

    # Create dictionary from words to indices and vice-versa
    # QUESTION: Is "" a good choice for end-of-sentence tag? Maybe we should pad beginning of the sentences too?
    word_to_idx = {"": 0}
    for sentence in x:
        for word in sentence:
            # A new word gets the next free index; known words keep theirs
            word_to_idx.setdefault(word, len(word_to_idx))
    idx_to_word = {index: word for word, index in word_to_idx.items()}

    # Every word of y also occurs in x, so the lookups below cannot fail
    x_idx = torch.tensor([[word_to_idx[w] for w in s] for s in x], dtype=torch.long)
    y_idx = torch.tensor([[word_to_idx[w] for w in s] for s in y], dtype=torch.long)

    return x_idx, y_idx, word_to_idx, idx_to_word


# Feel free to play with parameters
embedding_size = 10     # size of each word vector
sentence_length = 5     # words per training sentence
hidden_size = 5         # size of the RNN hidden state
n_epochs = 1000         # number of full-batch optimisation steps

# Dataset: adjacent string literals are joined into one continuous text
corpus = (
    "Hey, you. You’re finally awake. "
    "You were trying to cross the border, right? "
    "Walked right into that Imperial ambush, "
    "same as us, and that thief over there. "
    "Skyrim was fine until you came along. "
    "Empire was nice and lazy. "
    "If they hadn’t been looking for you, "
    "I could’ve stolen that horse and been half way to Hammerfell. "
    "You there. You and me — we should be here. "
    "It’s these Stormcloaks the Empire wants. "
)

x, y, _, idx_to_word = process_corpus(corpus, sentence_length)

# Embedding followed by an RNN; batch_first=True because x is (batch, sentence)
model_rnn = nn.Sequential(
    nn.Embedding(len(idx_to_word), embedding_size),
    nn.RNN(embedding_size, hidden_size, batch_first=True),
)
# The linear layer is kept separate: the RNN returns a tuple and only its
# first element is passed on to the linear layer (see the loop below)
linear = nn.Linear(hidden_size, len(idx_to_word))
print(model_rnn)

criterion = nn.CrossEntropyLoss()
# A single Adam optimizer updates the parameters of both modules
optimizer = torch.optim.Adam([*model_rnn.parameters(), *linear.parameters()])

for i in range(n_epochs):
    x_mid, _ = model_rnn(x)
    # Swap the last two axes so the class scores sit on dim 1, which makes
    # the shape correct for the loss
    y_hat = linear(x_mid).transpose(1, 2)
    loss = criterion(y_hat, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss ten times over the course of training
    if i % (n_epochs // 10) == 0:
        print(f'{i},\t{loss.item():.2f}')

Sequential(
  (0): Embedding(61, 10)
  (1): RNN(10, 5, batch_first=True)
)
0,	4.29
100,	3.84
200,	3.41
300,	3.00
400,	2.66
500,	2.40
600,	2.18
700,	2.00
800,	1.83
900,	1.69


# Compare the model's prediction with the truth for the first sentence.
# Gradients are not needed for inference, so everything runs under no_grad.
with torch.no_grad():
    # Class scores for every position, reduced to the most likely word index
    y_hat = linear(model_rnn(x)[0])
    y_hat = y_hat.argmax(dim=2)
    # Map the indices of every sentence back to words
    sentences_hat = [[idx_to_word[int(token)] for token in row] for row in y_hat]
    sentences_true = [[idx_to_word[int(token)] for token in row] for row in y]

    sentence_idx = 0
    print(f"Truth: {sentences_true[sentence_idx]}")
    print(f"Predict: {sentences_hat[sentence_idx]}")

Truth: ['you.', 'You’re', 'finally', 'awake.', 'You']
Predict: ['', 'Empire', 'finally', 'awake.', 'You']

PyTorch Introduction - Neural Networks

Import and Helper Functions

torch.nn.Module

Linear Module

Activation functions

Sequential

Loss functions

torch.optim

Linear regression using GD with automatically computed derivatives and PyTorch's Modules

Linear regression using SGD

Neural Network Basics in PyTorch

Things that might help on the homework

Brief Sidenote: Momentum

Briefer Sidenote: Learning rate schedulers

CrossEntropyLoss

Convolutions

Recurrent Neural Networks

Beyond - More advanced example. Predicting next word.