This project explores image classification using the FashionMNIST dataset, focusing on comparing CNN architectures to improve accuracy and training efficiency. It began as a practical extension of previous work in computer vision, driven by curiosity to deepen my understanding of how convolutional layers and activation functions impact learning. I designed a CNN inspired by the TinyVGG architecture and benchmarked it against two other models on performance and training time.
Loaded the FashionMNIST dataset with torchvision.datasets, transforming PIL images into PyTorch tensors. Used matplotlib.pyplot to visualise images in grayscale and display class names, gaining familiarity with the data distribution (Figures A–C). Created DataLoader objects with batching (batch size = 32) for both training and test datasets, shuffling only the training data. Built the CNN from nn.Sequential blocks for convolution, activation, and pooling, with a dynamically calculated flattened feature size for the final Linear layer. Trained the model with CrossEntropyLoss as the loss function. Tracked model accuracy with a custom accuracy_function.
# Importing the libraries
import torch
from torch import nn
# Import torchvision
import torchvision
# Import matplotlib for visualization
import matplotlib.pyplot as plt
# Device-agnostic setup: run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using Device: {device}")
# Setup training data
from torchvision import datasets
from torchvision.transforms import ToTensor
# Training split of FashionMNIST, stored under ./data and downloaded on first use.
# ToTensor() turns each image into a tensor; labels are left untouched.
train_data = datasets.FashionMNIST(
    root="data", train=True, download=True,
    transform=ToTensor(), target_transform=None,
)
# Test split, loaded with the same options except train=False
test_data = datasets.FashionMNIST(
    root="data", train=False, download=True,
    transform=ToTensor(), target_transform=None,
)
# Dataset sizes (bare expressions like this one only display output in a notebook cell)
len(train_data), len(test_data)
# Inspect the first training sample: its integer label, then the image tensor
image, label = train_data[0]
print(label, image, sep="\n")
# Human-readable class names, indexed by label
class_names = train_data.classes
class_names
# Mapping from class name to its integer label
class_to_idx = train_data.class_to_idx
class_to_idx
# Shape of the image tensor alongside its label
image.shape, label
print(f"Image shape: {image.shape} -> [colour_channels, height, width]")
print(f"Image label: {class_names[label]}")
# Figure A: first training image with matplotlib's default colormap, titled by its numeric label
image, label = train_data[0]
print(f"Image shape: {image.shape}")
# squeeze() drops the size-1 dimensions so imshow receives a 2-D array
plt.imshow(image.squeeze())
plt.title(label)
# Figure B: the same image in grayscale, titled with the class name and with the axes hidden
plt.imshow(image.squeeze(), cmap="gray")
plt.title(class_names[label])
plt.axis(False)
# Figure C: a 4x4 grid of randomly chosen training images
# Seed the RNG so the same images are drawn on every run
torch.manual_seed(42)
fig = plt.figure(figsize=(9, 9))
rows, cols = 4, 4
for i in range(1, rows * cols + 1):
    # One random dataset index per grid cell
    random_idx = torch.randint(0, len(train_data), size=[1]).item()
    img, label = train_data[random_idx]
    fig.add_subplot(rows, cols, i)
    plt.imshow(img.squeeze(), cmap="gray")
    plt.title(class_names[label])
    plt.axis(False)
# Prepare Dataloader
from torch.utils.data import DataLoader
# Number of samples per batch
BATCH_SIZE = 32
# Wrap the datasets in batch iterables; only the training data is shuffled
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)
# Report what was created and how many batches each loader yields
print(f"Dataloaders: {train_dataloader, test_dataloader}")
print(f"Length of train dataloader: {len(train_dataloader)} batches of {BATCH_SIZE}")
print(f"Length of test dataloader: {len(test_dataloader)} batches of {BATCH_SIZE}")
# Pull a single training batch to inspect the feature and label shapes
train_features_batch, train_labels_batch = next(iter(train_dataloader))
train_features_batch.shape, train_labels_batch.shape
# Create a convolutional neural network
class ImageModelV2(nn.Module):
    """
    Small CNN classifier modelled on the TinyVGG architecture from:
    https://poloclub.github.io/cnn-explainer/

    Two convolutional blocks (Conv -> ReLU -> Conv -> ReLU -> MaxPool) feed a
    single Linear classifier layer.

    Args:
        input_shape: Number of channels in the input images.
        hidden_units: Number of output channels of every convolution.
        output_shape: Number of logits produced (one per class).
        image_size: (height, width) of the input images. Only used to size the
            classifier's Linear layer. Defaults to (28, 28), the size that was
            previously hard-coded.
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int,
                 image_size: tuple = (28, 28)):
        super().__init__()
        self.conv_block_1 = self._make_conv_block(input_shape, hidden_units)
        self.conv_block_2 = self._make_conv_block(hidden_units, hidden_units)

        # Work out the flattened feature size by pushing one dummy image
        # through the convolutional blocks instead of computing it by hand.
        # torch.randn is kept (rather than e.g. zeros) so the amount of RNG
        # state consumed before the classifier is created stays the same for
        # the default 28x28 input.
        height, width = image_size
        with torch.no_grad():
            dummy_input = torch.randn(1, input_shape, height, width)
            features = self.conv_block_2(self.conv_block_1(dummy_input))
        self.flattened_size = features.view(1, -1).shape[1]

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=self.flattened_size, out_features=output_shape)
        )

    @staticmethod
    def _make_conv_block(in_channels: int, out_channels: int) -> nn.Sequential:
        """Build one Conv -> ReLU -> Conv -> ReLU -> MaxPool block."""
        return nn.Sequential(
            # 3x3 convolutions with padding=1 and stride=1 keep height/width unchanged
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                      kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=out_channels, out_channels=out_channels,
                      kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # 2x2 max-pooling halves height and width
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        """Return class logits for a batch of images `x`."""
        x = self.conv_block_1(x)
        x = self.conv_block_2(x)
        x = self.classifier(x)
        return x
# Build model_2 with a fixed seed so its initial weights are reproducible
torch.manual_seed(42)
model_2 = ImageModelV2(
    input_shape=1,
    hidden_units=10,
    output_shape=len(class_names),  # one output per class
)
# Move the model's parameters to the selected device
model_2 = model_2.to(device)
# Create loss function, optimizer and evaluation
def accuracy_function(y_true, y_pred):
    """Return the percentage (0-100) of predictions that equal their labels.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels, compared element-wise with y_true.

    Returns:
        Accuracy as a float percentage; 0.0 when y_pred is empty.
    """
    total = len(y_pred)
    if total == 0:
        # An empty batch would otherwise raise ZeroDivisionError below
        return 0.0
    correct = torch.eq(y_true, y_pred).sum().item()
    return (correct / total) * 100
# Cross-entropy loss for the multi-class classification task
loss_fn = nn.CrossEntropyLoss()
# Plain SGD over all of model_2's parameters with a learning rate of 0.1
optimizer = torch.optim.SGD(model_2.parameters(), lr=0.1)
# Training & Testing CNN Model
from timeit import default_timer as timer
# Start the wall-clock timer for the whole training run
train_time_start_model_2 = timer()
# Train and test model (Figure 1)
from tqdm.auto import tqdm
epochs = 3
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n---------")
    # One pass over the training data; the optimizer is only passed to this step
    train_step(
        model=model_2,
        data_loader=train_dataloader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        accuracy_fn=accuracy_function,
        device=device,
    )
    # One pass over the test data with the same model, loss and accuracy function
    test_step(
        model=model_2,
        data_loader=test_dataloader,
        loss_fn=loss_fn,
        accuracy_fn=accuracy_function,
        device=device,
    )
# Stop the timer once all epochs are done and keep the value print_train_time returns
train_time_end_model_2 = timer()
total_train_time_model_2 = print_train_time(
    start=train_time_start_model_2,
    end=train_time_end_model_2,
    device=device,
)
# Evaluate the trained model_2 on the test dataloader and keep the returned results
model_2_results = eval_model(model=model_2,
                             data_loader=test_dataloader,
                             loss_fn=loss_fn,
                             accuracy_fn=accuracy_function)
# Create dataframe to view results of Models
import pandas as pd
# One row per model, built from each model's results
compare_results = pd.DataFrame([model_0_results, model_1_results, model_2_results])
# Attach each model's total training time, in the same row order
compare_results["training_time"] = [
    total_train_time_model_0,
    total_train_time_model_1,
    total_train_time_model_2,
]
# Visualise Model Results (Figure 2): horizontal bar chart of accuracy per model
accuracy_by_model = compare_results.set_index("model_name")["model_acc"]
accuracy_by_model.plot(kind="barh")
plt.xlabel("Accuracy(%)")
plt.ylabel("Model Name")
plt.title("Model Accuracy Comparison")
# Make and Evaluate Random Predictions
def make_predictions(model: torch.nn.Module, data: list, device: torch.device = device):
    """Return the model's class probabilities for each sample in `data`.

    Args:
        model: Model to run; it is switched to eval mode.
        data: Iterable of individual sample tensors (without a batch dimension).
        device: Device each sample is moved to before the forward pass.

    Returns:
        A tensor stacking one softmax probability vector per sample, on the CPU.
    """
    model.eval()
    with torch.inference_mode():
        probabilities = [
            # Add a batch dimension for the forward pass, then squeeze the
            # logits back down before taking the softmax
            torch.softmax(model(item.unsqueeze(dim=0).to(device)).squeeze(), dim=0).cpu()
            for item in data
        ]
    return torch.stack(probabilities)
import random
# Fixed seed so the same nine test images are picked on every run
random.seed(51)
test_samples = []
test_labels = []
# Sample indices instead of materialising list(test_data): that would load and
# transform every test image just to keep nine of them. random.sample picks
# positions based on the population length, so the seeded selection is the same.
for sample_idx in random.sample(range(len(test_data)), k=9):
    sample, label = test_data[sample_idx]
    test_samples.append(sample)
    test_labels.append(label)
# Class probabilities for the sampled images, then the most likely class for each
pred_probs = make_predictions(model=model_2, data=test_samples)
pred_classes = pred_probs.argmax(dim=1)
# Plot predictions (Figure 2): 3x3 grid, title in green when correct and red when wrong
plt.figure(figsize=(9, 9))
nrows, ncols = 3, 3
for i, sample in enumerate(test_samples):
    plt.subplot(nrows, ncols, i + 1)
    plt.imshow(sample.squeeze(), cmap="gray")
    pred_label = class_names[pred_classes[i]]
    truth_label = class_names[test_labels[i]]
    title_text = f"Pred: {pred_label} | Truth: {truth_label}"
    if pred_label == truth_label:
        plt.title(title_text, fontsize=10, c="g")
    else:
        plt.title(title_text, fontsize=10, c="r")
    plt.axis(False)
# Making Confusion Matrix (Figure 3): first collect a predicted label for every test image
y_preds = []
model_2.eval()
with torch.inference_mode():
    for X, y in tqdm(test_dataloader, desc="Making predictions"):
        X, y = X.to(device), y.to(device)
        y_logit = model_2(X)
        # Logits -> probabilities -> index of the most likely class
        y_pred = y_logit.softmax(dim=1).argmax(dim=1)
        # Keep results on the CPU so they can be concatenated afterwards
        y_preds.append(y_pred.cpu())
# Join the per-batch predictions into a single 1-D tensor
y_pred_tensor = torch.cat(y_preds)
# Install the confusion-matrix dependencies (torchmetrics and mlxtend).
# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax; this line only
# runs inside a notebook and is a syntax error in a plain Python script.
!pip install -q torchmetrics -U mlxtend
import torchmetrics
import mlxtend
# Show which mlxtend version is installed
print(f"mlxtend version: {mlxtend.__version__}")
from torchmetrics import ConfusionMatrix
from mlxtend.plotting import plot_confusion_matrix
# Multiclass confusion-matrix metric with one class per entry in class_names
confmat = ConfusionMatrix(task='multiclass', num_classes=len(class_names))
# Compare the collected predictions against the test set's true labels
confmat_tensor = confmat(preds=y_pred_tensor, target=test_data.targets)
# Plot the matrix; mlxtend is given it as a NumPy array
fig, ax = plot_confusion_matrix(conf_mat=confmat_tensor.numpy(),
                                class_names=class_names,
                                figsize=(10, 7))
A key challenge was dynamically computing the flattened input size after the convolutional layers so that the final Linear layer is set up correctly. To solve this, I passed a dummy input through the convolutional blocks inside a torch.no_grad() context and read the flattened size from the output shape. This approach prevented manual miscalculation and made the model easier to adapt to different input sizes.