import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
from torchvision import transforms
from tqdm import tqdm
mnist = datasets.MNIST(root="./data", train=True, download=True, transform=transforms.ToTensor())
data_loader = torch.utils.data.DataLoader(mnist, batch_size=64, shuffle=True)
Here we define a simple one-hidden-layer neural network for classification on MNIST. It takes a parameter that determines the size of the hidden layer.
class MNISTNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.linear_0 = nn.Linear(784, hidden_size)
        self.linear_1 = nn.Linear(hidden_size, 10)

    def forward(self, inputs):
        x = self.linear_0(inputs)
        x = torch.sigmoid(x)
        return self.linear_1(x)
We will consider three networks: one with a single hidden unit, one with 64 hidden units, and a second 64-unit network. The first two have all of their parameters set to zero; the third keeps PyTorch's default random initialization.
small_net = MNISTNetwork(1)
large_net = MNISTNetwork(64)
large_net_rand = MNISTNetwork(64)
for p1, p2 in zip(small_net.parameters(), large_net.parameters()):
    p1.data = torch.zeros_like(p1.data)
    p2.data = torch.zeros_like(p2.data)
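Note that although small_net and large_net now start from identical all-zero parameters, the two architectures differ enormously in capacity. As a quick illustrative check (the num_params helper below is ours, not part of the handout code):
def num_params(net):
    # Total number of trainable parameters in a network.
    return sum(p.numel() for p in net.parameters())

print(num_params(small_net))  # 805   = (784*1 + 1) + (1*10 + 10)
print(num_params(large_net))  # 50890 = (784*64 + 64) + (64*10 + 10)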
We will train all three networks simultaneously using the same learning rate. After each epoch, we print the average training loss of each network.
epochs = 32
optimizer_small = optim.Adam(small_net.parameters(), lr=5e-3)
optimizer_large = optim.Adam(large_net.parameters(), lr=5e-3)
optimizer_large_rand = optim.Adam(large_net_rand.parameters(), lr=5e-3)
for i in range(epochs):
    loss_small_epoch = 0.
    loss_large_epoch = 0.
    loss_large_rand_epoch = 0.
    for batch in tqdm(data_loader):
        images, labels = batch
        images = images.view(-1, 784)
        optimizer_small.zero_grad()
        optimizer_large.zero_grad()
        optimizer_large_rand.zero_grad()
        y_small = small_net(images)
        y_large = large_net(images)
        y_large_rand = large_net_rand(images)
        loss_small = F.cross_entropy(y_small, labels)
        loss_large = F.cross_entropy(y_large, labels)
        loss_large_rand = F.cross_entropy(y_large_rand, labels)
        loss_small_epoch += loss_small.item()
        loss_large_epoch += loss_large.item()
        loss_large_rand_epoch += loss_large_rand.item()
        loss_small.backward()
        loss_large.backward()
        loss_large_rand.backward()
        optimizer_small.step()
        optimizer_large.step()
        optimizer_large_rand.step()
    print("Small Loss:", loss_small_epoch / len(data_loader))
    print("Large Loss:", loss_large_epoch / len(data_loader))
    print("Large rand Loss:", loss_large_rand_epoch / len(data_loader))
Small Loss: 1.992125940856649 Large Loss: 1.846776630578519 Large rand Loss: 0.32522186862110203
Small Loss: 1.8445630946647384 Large Loss: 1.6948263950185227 Large rand Loss: 0.14667500496140992
Small Loss: 1.8232598135720437 Large Loss: 1.6582392909125225 Large rand Loss: 0.10941013419675007
Small Loss: 1.810512869342812 Large Loss: 1.647986725957663 Large rand Loss: 0.08550640116078354
Small Loss: 1.7984815566524515 Large Loss: 1.6425841501526741 Large rand Loss: 0.07038939598355014
Small Loss: 1.7832257642166447 Large Loss: 1.639439013593995 Large rand Loss: 0.05966398936248561
Small Loss: 1.7638618088226075 Large Loss: 1.634959454729613 Large rand Loss: 0.05089896230567207
Small Loss: 1.7513694408605855 Large Loss: 1.6394141232535275 Large rand Loss: 0.04360822196315124
Small Loss: 1.7410384184007706 Large Loss: 1.6323074300660254 Large rand Loss: 0.03757109730477149
Small Loss: 1.733578706473938 Large Loss: 1.6320092139213578 Large rand Loss: 0.033873332093712065
Small Loss: 1.7297165021459178 Large Loss: 1.6296177059094281 Large rand Loss: 0.029986836188132745
Small Loss: 1.7265885717579041 Large Loss: 1.6225902791470608 Large rand Loss: 0.025942205315528273
Small Loss: 1.7229655160070227 Large Loss: 1.614469338200494 Large rand Loss: 0.022426178650200756
Small Loss: 1.7207565692696236 Large Loss: 1.59702094086706 Large rand Loss: 0.020207441224754333
Small Loss: 1.7177712766092215 Large Loss: 1.5778811080877715 Large rand Loss: 0.018660261891278396
Small Loss: 1.7150879690387864 Large Loss: 1.5686708898432473 Large rand Loss: 0.01802812687989234
Small Loss: 1.7111699737465458 Large Loss: 1.564772816482129 Large rand Loss: 0.01563816340027083
Small Loss: 1.7066961173563877 Large Loss: 1.5604815232728335 Large rand Loss: 0.013912582195142775
Small Loss: 1.7002086607632099 Large Loss: 1.5598054363020957 Large rand Loss: 0.01297786537696099
Small Loss: 1.6938693372171316 Large Loss: 1.5611818760697014 Large rand Loss: 0.014058153353132449
Small Loss: 1.6906237053210293 Large Loss: 1.5555067268261777 Large rand Loss: 0.012793340248741836
Small Loss: 1.6848017213695339 Large Loss: 1.5571024852520876 Large rand Loss: 0.009328897065228112
Small Loss: 1.682982569056025 Large Loss: 1.5609279830318523 Large rand Loss: 0.013200376111322638
Small Loss: 1.682613905050607 Large Loss: 1.5555425987823177 Large rand Loss: 0.010847247160684718
Small Loss: 1.6804717756283563 Large Loss: 1.5529911856153118 Large rand Loss: 0.009251076868505154
Small Loss: 1.679624924908823 Large Loss: 1.5580362841518702 Large rand Loss: 0.009180401486334025
Small Loss: 1.6756493868604143 Large Loss: 1.5558550791191395 Large rand Loss: 0.010234546676092152
Small Loss: 1.6781808355215515 Large Loss: 1.5566776331299659 Large rand Loss: 0.008899102125460578
Small Loss: 1.6757417534714314 Large Loss: 1.5549392756114382 Large rand Loss: 0.008694451855173537
Small Loss: 1.6722396652835774 Large Loss: 1.5535656431082214 Large rand Loss: 0.007481966306253475
Small Loss: 1.671703512607607 Large Loss: 1.5559691183094277 Large rand Loss: 0.0073218714386225495
Small Loss: 1.6672354841283135 Large Loss: 1.553782471969946 Large rand Loss: 0.008971417971195494
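The randomly initialized 64-unit network reaches a far lower training loss than either zero-initialized network, and the zero-initialized 64-unit network barely improves on the single-unit one. To see why, we inspect the trained parameters of the zero-initialized large network.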
W_0 = large_net.linear_0.weight
b_0 = large_net.linear_0.bias
W_1 = large_net.linear_1.weight
b_1 = large_net.linear_1.bias
print("W_0 => All weights equal for each hidden unit:", (W_0[0, :].unsqueeze(0) == W_0).all().item())
print("Example of weights:")
print(W_0[:, 256])
W_0 => All weights equal for each hidden unit: True Example of weights: tensor([-0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256, -0.0256], grad_fn=<SelectBackward>)
print("W_1 => All weights equal for each hidden unit:", (W_1[:, 0].unsqueeze(-1) == W_1).all().item())
print("Weights:")
print(W_1[8])
W_1 => All weights equal for each hidden unit: True Weights: tensor([-0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049, -0.2049], grad_fn=<SelectBackward>)
print("b_0 => All biases equal for each hidden unit:", (b_0[0] == b_0).all().item())
print("Bias:")
print(b_0)
b_0 => All biases equal for each hidden unit: True Bias: Parameter containing: tensor([-1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603, -1.5603], requires_grad=True)
print("b_1 => All biases equal for each hidden unit:", (b_1[0] == b_1).all().item())
print("Bias:")
print(b_1)
b_1 => All biases equal for each hidden unit: False Bias: Parameter containing: tensor([ 3.2585, -7.8880, 2.7643, 1.4902, -0.2802, 1.9114, 3.9217, -3.0606, 1.2542, -1.7107], requires_grad=True)
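So every hidden unit of the zero-initialized network has learned exactly the same incoming weights, outgoing weights, and bias. This is the symmetry problem: units that start with identical parameters compute identical activations, receive identical gradients, and therefore remain identical after every update, so the 64-unit zero-initialized network can never represent more than a single-unit network. (The output-layer biases do differ, because each class logit receives its own gradient; the symmetry is across hidden units, not across classes.) Below is a minimal sketch, not from the handout and using fake data, that shows the identical gradients after a single backward pass.
# Illustrative example (assumed setup, not the handout's code): when every hidden
# unit starts with the same incoming and outgoing weights, all hidden units
# receive identical gradients, so gradient updates can never make them differ.
W0_demo = torch.zeros(2, 784, requires_grad=True)        # incoming weights, one row per hidden unit
b0_demo = torch.zeros(2, requires_grad=True)
W1_demo = torch.full((10, 2), 0.1, requires_grad=True)   # outgoing weights, identical for both units
b1_demo = torch.zeros(10, requires_grad=True)

x_demo = torch.randn(8, 784)                             # fake batch standing in for images
y_demo = torch.randint(0, 10, (8,))

h_demo = torch.sigmoid(F.linear(x_demo, W0_demo, b0_demo))
loss_demo = F.cross_entropy(F.linear(h_demo, W1_demo, b1_demo), y_demo)
loss_demo.backward()

# Both hidden units receive the same gradients, so they can never diverge.
print(torch.allclose(W0_demo.grad[0], W0_demo.grad[1]))        # True
print(torch.allclose(W1_demo.grad[:, 0], W1_demo.grad[:, 1]))  # True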
Below is an implementation of the network from the section handout. The forward pass includes print statements that report the shape of the data as it flows through the network, along with the shapes of each layer's weights and biases. Note that this network is just for demonstration and would not work well in practice.
class DemoNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, 1, 1)
        self.max1 = nn.MaxPool2d(2, 2, 0)
        self.conv2 = nn.Conv2d(16, 32, 3, 1, 0)
        self.max2 = nn.MaxPool2d(2, 2, 1)
        self.conv3 = nn.Conv2d(32, 8, 1, 1, 0)
        self.conv4 = nn.Conv2d(8, 4, 5, 1, 0)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(576, 10)

    def forward(self, inputs):
        x = self.conv1(inputs)
        print("Data shape:", x.shape)
        print("Weight shape:", self.conv1.weight.shape, "Bias shape:", self.conv1.bias.shape)
        x = self.max1(x)
        print("Data shape:", x.shape)
        x = self.conv2(x)
        print("Data shape:", x.shape)
        print("Weight shape:", self.conv2.weight.shape, "Bias shape:", self.conv2.bias.shape)
        x = self.max2(x)
        print("Data shape:", x.shape)
        x = self.conv3(x)
        print("Data shape:", x.shape)
        print("Weight shape:", self.conv3.weight.shape, "Bias shape:", self.conv3.bias.shape)
        x = self.conv4(x)
        print("Data shape:", x.shape)
        print("Weight shape:", self.conv4.weight.shape, "Bias shape:", self.conv4.bias.shape)
        x = self.flatten(x)
        print("Data shape:", x.shape)
        x = self.linear1(x)
        print("Data shape:", x.shape)
        print("Weight shape:", self.linear1.weight.shape, "Bias shape:", self.linear1.bias.shape)
        return x
demo = DemoNetwork()
_ = demo(torch.zeros(9, 3, 64, 64))
Data shape: torch.Size([9, 16, 64, 64])
Weight shape: torch.Size([16, 3, 3, 3]) Bias shape: torch.Size([16])
Data shape: torch.Size([9, 16, 32, 32])
Data shape: torch.Size([9, 32, 30, 30])
Weight shape: torch.Size([32, 16, 3, 3]) Bias shape: torch.Size([32])
Data shape: torch.Size([9, 32, 16, 16])
Data shape: torch.Size([9, 8, 16, 16])
Weight shape: torch.Size([8, 32, 1, 1]) Bias shape: torch.Size([8])
Data shape: torch.Size([9, 4, 12, 12])
Weight shape: torch.Size([4, 8, 5, 5]) Bias shape: torch.Size([4])
Data shape: torch.Size([9, 576])
Data shape: torch.Size([9, 10])
Weight shape: torch.Size([10, 576]) Bias shape: torch.Size([10])
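As a sanity check on the shapes printed above, the spatial size after each convolution or pooling layer follows the usual formula floor((W - K + 2P) / S) + 1, where W is the input size, K the kernel size, P the padding, and S the stride. A quick sketch (the out_size helper is ours, not part of the handout):
# Hypothetical helper (not from the handout): output spatial size of a conv or pooling layer.
def out_size(w, kernel, stride, padding):
    return (w - kernel + 2 * padding) // stride + 1

s = 64
s = out_size(s, 3, 1, 1)   # conv1 -> 64
s = out_size(s, 2, 2, 0)   # max1  -> 32
s = out_size(s, 3, 1, 0)   # conv2 -> 30
s = out_size(s, 2, 2, 1)   # max2  -> 16
s = out_size(s, 1, 1, 0)   # conv3 -> 16
s = out_size(s, 5, 1, 0)   # conv4 -> 12
print(s, 4 * s * s)        # 12 576, matching the Linear(576, 10) input size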