import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
from torchvision import transforms
import torchvision.utils
from tqdm import tqdm
import matplotlib.pyplot as plt
mnist = datasets.MNIST(root="./data", train=True, download=True, transform=transforms.ToTensor())
The PyTorch DataLoader class is an efficient implementation of an iterator that can perform useful preprocessing and return batches of elements. Here we use its ability to batch and shuffle data, but DataLoaders are capable of much more.
Note that each time we iterate over a DataLoader, it starts again from the beginning.
Below we use torchvision.utils.make_grid() to show a sample batch of inputs.
data_loader = torch.utils.data.DataLoader(mnist, batch_size=64, shuffle=True)
# Show one batch of images. Each batch of images has shape [batch_size, 1, 28, 28],
# where 1 is the "channels" dimension of the image.
for images, labels in data_loader:
    grid_img = torchvision.utils.make_grid(images)
    plt.imshow(grid_img.permute(1, 2, 0))
    plt.title("A single batch of images")
    break
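As an aside, since every fresh iteration over a DataLoader starts from the beginning, you can also pull a single batch without the for/break pattern. A minimal sketch (not part of the original code):
# Build a fresh iterator over the DataLoader and take its first batch.
images, labels = next(iter(data_loader))
print(images.shape)  # torch.Size([64, 1, 28, 28])
print(labels.shape)  # torch.Size([64])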
Here we define a simple neural network with one hidden layer for classification on MNIST. It takes a single argument that determines the size of the hidden layer.
class MNISTNetwork(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.linear_0 = nn.Linear(784, hidden_size)
        self.linear_1 = nn.Linear(hidden_size, 10)

    def forward(self, inputs):
        x = self.linear_0(inputs)
        x = torch.sigmoid(x)
        return self.linear_1(x)
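Before training, it can help to sanity-check the module on a dummy batch. The short sketch below (with an arbitrary hidden size of 16, not used elsewhere in this section) just confirms that the output has one logit per class:
net = MNISTNetwork(hidden_size=16)   # throwaway network just for this check
dummy = torch.zeros(8, 784)          # a fake batch of 8 flattened 28x28 images
logits = net(dummy)
print(logits.shape)                  # torch.Size([8, 10])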
We will consider three networks: a small network with a single hidden unit, a large network with 64 hidden units, and a second large network with 64 hidden units. The first two will have all of their parameters initialized to zero; the third keeps PyTorch's default random initialization.
In the code below, we use some important PyTorch methods you'll want to be familiar with (a single-step sketch putting them together follows this list):
torch.nn.Module.parameters(): Returns an iterator over module parameters (e.g., for passing to an optimizer that will update those parameters).
torch.Tensor.view(): Returns a view into the original Tensor. The result shares the same underlying data as the input Tensor. This avoids copying the data, which can be more efficient, but it also means that when the original Tensor is modified, so is the view!
torch.Tensor.item(): Returns the value of a single-element Tensor as a standard Python number. This only works for tensors with one element. For other cases, see torch.Tensor.tolist().
torch.Tensor.backward(): Computes the gradients of the current tensor with respect to the graph leaves (gradients are only tracked for tensors with requires_grad set to True, which is the default for module parameters). After calling this, a Tensor's .grad attribute is updated with the current gradients. These are used, for example, when calling the .step() method of an optimizer.
torch.optim.Optimizer.zero_grad(): Sets the gradients of all parameters to zero. This should be called before each step of an optimization procedure (i.e., for each batch when training a DNN). If .zero_grad() is not called, gradients accumulate (add up) across iterations.
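Here is that sketch: a single optimization step on one batch, tying the methods above together. The network and optimizer here (net, optimizer) are throwaway placeholders, not the models trained below.
net = MNISTNetwork(hidden_size=16)                 # placeholder model for illustration
optimizer = optim.Adam(net.parameters(), lr=1e-3)  # parameters() hands the weights to the optimizer

images, labels = next(iter(data_loader))
images = images.view(-1, 784)       # view(): reshape without copying the underlying data

optimizer.zero_grad()               # clear old gradients so they don't accumulate
loss = F.cross_entropy(net(images), labels)
loss.backward()                     # populate .grad on every parameter with requires_grad=True
optimizer.step()                    # use the .grad values to update the parameters
print(loss.item())                  # item(): extract the scalar loss as a Python float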
small_net = MNISTNetwork(1)
large_net = MNISTNetwork(64)
large_net_rand = MNISTNetwork(64)
# Zero-initialize every parameter of small_net and large_net.
# large_net_rand keeps PyTorch's default random initialization.
for p1, p2 in zip(small_net.parameters(), large_net.parameters()):
    p1.data = torch.zeros_like(p1.data)
    p2.data = torch.zeros_like(p2.data)
We will train all three networks simultaneously using the same learning rate. After each epoch, we print the current loss of each network.
epochs = 32
optimizer_small = optim.Adam(small_net.parameters(), lr=5e-3)
optimizer_large = optim.Adam(large_net.parameters(), lr=5e-3)
optimizer_large_rand = optim.Adam(large_net_rand.parameters(), lr=5e-3)
for i in range(epochs):
    loss_small_epoch = 0.
    loss_large_epoch = 0.
    loss_large_rand_epoch = 0.
    for batch in tqdm(data_loader):
        images, labels = batch
        # Flatten each [1, 28, 28] image into a 784-dimensional vector.
        images = images.view(-1, 784)

        optimizer_small.zero_grad()
        optimizer_large.zero_grad()
        optimizer_large_rand.zero_grad()

        y_small = small_net(images)
        y_large = large_net(images)
        y_large_rand = large_net_rand(images)

        loss_small = F.cross_entropy(y_small, labels)
        loss_large = F.cross_entropy(y_large, labels)
        loss_large_rand = F.cross_entropy(y_large_rand, labels)

        loss_small_epoch += loss_small.item()
        loss_large_epoch += loss_large.item()
        loss_large_rand_epoch += loss_large_rand.item()

        loss_small.backward()
        loss_large.backward()
        loss_large_rand.backward()

        optimizer_small.step()
        optimizer_large.step()
        optimizer_large_rand.step()

    print("Small Loss:", loss_small_epoch / len(data_loader))
    print("Large Loss:", loss_large_epoch / len(data_loader))
    print("Large rand Loss:", loss_large_rand_epoch / len(data_loader))
Small Loss: 1.9653773850469447 Large Loss: 1.7691148120457176 Large rand Loss: 0.32225943708232346
Small Loss: 1.8007648818528474 Large Loss: 1.591421776361811 Large rand Loss: 0.14307911678163737
Small Loss: 1.7754631050105796 Large Loss: 1.541870578646914 Large rand Loss: 0.10535131760144126
Small Loss: 1.7660997954767141 Large Loss: 1.5250169689467212 Large rand Loss: 0.0832289993804671
Small Loss: 1.7597641337400816 Large Loss: 1.520738815702101 Large rand Loss: 0.06850804357892716
Small Loss: 1.7517259124753826 Large Loss: 1.5135384185481935 Large rand Loss: 0.058282612074813896
Small Loss: 1.7411294252887717 Large Loss: 1.5108274831446504 Large rand Loss: 0.04964475166067473
Small Loss: 1.7284150821313675 Large Loss: 1.50909600989905 Large rand Loss: 0.04239305789455342
Small Loss: 1.7165488069499735 Large Loss: 1.5079237941994088 Large rand Loss: 0.03548161863217325
Small Loss: 1.7081243620752526 Large Loss: 1.5079518726893835 Large rand Loss: 0.032258128287391795
Small Loss: 1.6982567778020017 Large Loss: 1.5041666942110448 Large rand Loss: 0.026698528619325246
Small Loss: 1.6914531681329203 Large Loss: 1.5070734082508697 Large rand Loss: 0.023921146165211696
Small Loss: 1.684154798481256 Large Loss: 1.5022577077849333 Large rand Loss: 0.02372061598215447
Small Loss: 1.6789440003031098 Large Loss: 1.5030099018804555 Large rand Loss: 0.020083684421048614
Small Loss: 1.6754160913577212 Large Loss: 1.5024545715053452 Large rand Loss: 0.01902387527616641
Small Loss: 1.6701044252178054 Large Loss: 1.5012980394526076 Large rand Loss: 0.015888710764719784
Small Loss: 1.6673169402933832 Large Loss: 1.5030907546279273 Large rand Loss: 0.014259491353448398
Small Loss: 1.6606399458862826 Large Loss: 1.501739643148776 Large rand Loss: 0.01473792636624179
Small Loss: 1.6542420225865297 Large Loss: 1.503003201123748 Large rand Loss: 0.012198911213631512
Small Loss: 1.6473971431189254 Large Loss: 1.5033805226720471 Large rand Loss: 0.014074451071668171
Small Loss: 1.640233465349242 Large Loss: 1.5026245990287521 Large rand Loss: 0.012415689048173315
Small Loss: 1.6366289218605707 Large Loss: 1.5019202104001157 Large rand Loss: 0.008756707007697942
Small Loss: 1.6302211870516796 Large Loss: 1.503016941964245 Large rand Loss: 0.008281775715228517
Small Loss: 1.6247983736270017 Large Loss: 1.499519855610089 Large rand Loss: 0.012281459499008284
Small Loss: 1.6167287140258595 Large Loss: 1.504597606562348 Large rand Loss: 0.008737740114552224
Small Loss: 1.6161306109001388 Large Loss: 1.5028151793520588 Large rand Loss: 0.011179485601514936
Small Loss: 1.609934631695371 Large Loss: 1.5002928712983121 Large rand Loss: 0.00824372626419466
Small Loss: 1.6070402381516724 Large Loss: 1.4993105465923544 Large rand Loss: 0.008217276727408927
Small Loss: 1.6055648727203482 Large Loss: 1.506420293977774 Large rand Loss: 0.010378440634260404
Small Loss: 1.6037178017945686 Large Loss: 1.501652993881372 Large rand Loss: 0.007176022771486685
Small Loss: 1.6024280204447603 Large Loss: 1.4988863943482258 Large rand Loss: 0.006112509311864034
Small Loss: 1.600621154821758 Large Loss: 1.4989804560695883 Large rand Loss: 0.006024113362972455
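If you would rather see these trends as curves than read them off the printouts, a minimal plotting sketch follows. It assumes you have collected each network's per-epoch average loss into Python lists (hypothetical names small_history, large_history, rand_history), which the loop above does not actually do.
# Hypothetical lists of per-epoch average losses (not recorded by the loop above).
plt.plot(small_history, label="small (zero init)")
plt.plot(large_history, label="large (zero init)")
plt.plot(rand_history, label="large (random init)")
plt.xlabel("Epoch")
plt.ylabel("Average cross-entropy loss")
plt.legend()
plt.show()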
W_0 = large_net.linear_0.weight
b_0 = large_net.linear_0.bias
W_1 = large_net.linear_1.weight
b_1 = large_net.linear_1.bias
print("W_0 => All weights equal for each hidden unit:", (W_0[0, :].unsqueeze(0) == W_0).all().item())
print("Example of weights:")
print(W_0[:, 256])
W_0 => All weights equal for each hidden unit: True Example of weights: tensor([-0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529, -0.0529], grad_fn=<SelectBackward>)
print("W_1 => All weights equal for each hidden unit:", (W_1[:, 0].unsqueeze(-1) == W_1).all().item())
print("Weights:")
print(W_1[8])
W_1 => All weights equal for each hidden unit: True Weights: tensor([-0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697, -0.3697], grad_fn=<SelectBackward>)
print("b_0 => All biases equal for each hidden unit:", (b_0[0] == b_0).all().item())
print("Bias:")
print(b_0)
b_0 => All biases equal for each hidden unit: True Bias: Parameter containing: tensor([-1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563, -1.5563], requires_grad=True)
print("b_1 => All biases equal for each hidden unit:", (b_1[0] == b_1).all().item())
print("Bias:")
print(b_1)
b_1 => All biases equal for each hidden unit: False Bias: Parameter containing: tensor([ 3.7292, -0.0961, 3.4068, 2.0403, -2.9260, 2.3082, 4.5227, -7.2908, 1.9778, -5.0011], requires_grad=True)
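This symmetry is a direct consequence of the zero initialization: every hidden unit starts with identical weights, computes identical activations, and therefore receives identical gradients, so no update can ever make the units differ. The short sketch below (not from the original code) checks this after a single optimization step on a freshly zero-initialized network:
symmetric_net = MNISTNetwork(hidden_size=64)
for p in symmetric_net.parameters():
    p.data = torch.zeros_like(p.data)

opt = optim.Adam(symmetric_net.parameters(), lr=5e-3)
images, labels = next(iter(data_loader))
loss = F.cross_entropy(symmetric_net(images.view(-1, 784)), labels)
opt.zero_grad()
loss.backward()
opt.step()

W = symmetric_net.linear_0.weight
print((W[0] == W).all().item())  # True: all hidden units still share identical weights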
Below is an implementation of the network from the section handout. We use torchinfo.summary() to view the size of the data as it flows through the network; additionally, we print the sizes of the layers' weights and biases during a forward pass. Note that this network is just for demonstration and may not work well in practice.
Note: this section uses the torchinfo package; see the GitHub repo for installation instructions or run one of the commands below:
install via conda:
conda install -c conda-forge torchinfo
install via pip:
pip install torchinfo
from torchinfo import summary
class DemoNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # Conv2d arguments: (in_channels, out_channels, kernel_size, stride, padding);
        # MaxPool2d arguments: (kernel_size, stride, padding).
        self.conv1 = nn.Conv2d(3, 16, 3, 1, 1)
        self.max1 = nn.MaxPool2d(2, 2, 0)
        self.conv2 = nn.Conv2d(16, 32, 3, 1, 0)
        self.max2 = nn.MaxPool2d(2, 2, 1)
        self.conv3 = nn.Conv2d(32, 8, 1, 1, 0)
        self.conv4 = nn.Conv2d(8, 4, 5, 1, 0)
        self.flatten = nn.Flatten()
        self.linear1 = nn.Linear(576, 10)

    @property
    def trainable_layers(self):
        """A utility property to easily access a list of all model layers."""
        return [self.conv1, self.conv2, self.conv3, self.conv4, self.linear1]

    def forward(self, inputs):
        """Implements the forward pass."""
        x = self.conv1(inputs)
        x = self.max1(x)
        x = self.conv2(x)
        x = self.max2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        x = self.linear1(x)
        return x

    def print_weight_shapes(self):
        """Utility function to print the shapes of weights in trainable layers."""
        for layer in self.trainable_layers:
            print(f"Weight shape: {layer.weight.shape}; Bias shape: {layer.bias.shape}")

demo = DemoNetwork()
batch_size = 64
summary(demo, input_size=(batch_size, 3, 64, 64))
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
├─Conv2d: 1-1                            [64, 16, 64, 64]          448
├─MaxPool2d: 1-2                         [64, 16, 32, 32]          --
├─Conv2d: 1-3                            [64, 32, 30, 30]          4,640
├─MaxPool2d: 1-4                         [64, 32, 16, 16]          --
├─Conv2d: 1-5                            [64, 8, 16, 16]           264
├─Conv2d: 1-6                            [64, 4, 12, 12]           804
├─Flatten: 1-7                           [64, 576]                 --
├─Linear: 1-8                            [64, 10]                  5,770
==========================================================================================
Total params: 11,926
Trainable params: 11,926
Non-trainable params: 0
Total mult-adds (M): 6.10
==========================================================================================
Input size (MB): 3.15
Forward/backward pass size (MB): 49.65
Params size (MB): 0.05
Estimated Total Size (MB): 52.84
==========================================================================================
demo.print_weight_shapes()
Weight shape: torch.Size([16, 3, 3, 3]); Bias shape: torch.Size([16])
Weight shape: torch.Size([32, 16, 3, 3]); Bias shape: torch.Size([32])
Weight shape: torch.Size([8, 32, 1, 1]); Bias shape: torch.Size([8])
Weight shape: torch.Size([4, 8, 5, 5]); Bias shape: torch.Size([4])
Weight shape: torch.Size([10, 576]); Bias shape: torch.Size([10])
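The output shapes in the summary follow from the standard convolution/pooling size formula, out = floor((in + 2*padding - kernel) / stride) + 1. The small sketch below (not part of the original section) reproduces the demo network's spatial sizes by hand and recovers the 576 features fed into the final Linear layer:
def out_size(h, kernel, stride, padding):
    """Spatial output size of a conv/pool layer: floor((h + 2p - k) / s) + 1."""
    return (h + 2 * padding - kernel) // stride + 1

h = 64
h = out_size(h, kernel=3, stride=1, padding=1)  # conv1 -> 64
h = out_size(h, kernel=2, stride=2, padding=0)  # max1  -> 32
h = out_size(h, kernel=3, stride=1, padding=0)  # conv2 -> 30
h = out_size(h, kernel=2, stride=2, padding=1)  # max2  -> 16
h = out_size(h, kernel=1, stride=1, padding=0)  # conv3 -> 16
h = out_size(h, kernel=5, stride=1, padding=0)  # conv4 -> 12
print(4 * h * h)                                # 576 = 4 channels * 12 * 12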