FashionMNIST Image Classification

This project explores image classification using the FashionMNIST dataset, focusing on comparing CNN architectures to improve accuracy and training efficiency. It began as a practical extension of previous work in computer vision, driven by curiosity to deepen my understanding of how convolutional layers and activation functions impact learning. I designed a CNN inspired by the TinyVGG architecture and benchmarked it against two other models on performance and training time.

💻 Tech Stack:

🧪 Data Pipeline:

📊 Code Snippets & Visualisations:

# Importing the libraries
import torch
from torch import nn

# Import torchvision
import torchvision

# Import matplotlib for visualization
import matplotlib.pyplot as plt

# Setup device Agnostic Code
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using Device: {device}")

# Setup training and testing data
from torchvision import datasets
from torchvision.transforms import ToTensor


def _load_fashion_mnist(train):
    """Build one FashionMNIST split; the splits differ only in ``train``."""
    return datasets.FashionMNIST(
        root="data",            # directory the dataset is stored in
        train=train,            # True -> training split, False -> test split
        download=True,          # fetch the files if they are not under root
        transform=ToTensor(),   # applied to every image
        target_transform=None,  # labels are left as they are
    )


train_data = _load_fashion_mnist(train=True)
test_data = _load_fashion_mnist(train=False)

# Explore Data
# Number of samples in each split. As a bare expression this only shows a
# value when it is the last statement of a notebook cell.
len(train_data), len(test_data)

# See first training sample
# Indexing the dataset returns an (image, label) pair.
image, label = train_data[0]
print(label)
print(image)

# See classes of data
# class_names is indexed by label further down to get a readable name.
class_names = train_data.classes
class_names

# Labels and their corresponding index
class_to_idx = train_data.class_to_idx
class_to_idx

# Check shape of our Image
# NOTE(review): the bare tuple below is followed by print calls, so it will
# not be displayed unless it ends its own notebook cell — confirm cell layout.
image.shape, label
print(f"Image shape: {image.shape} -> [colour_channels, height, width]")
print(f"Image label: {class_names[label]}")

# Visualise data (Figure A)
# Re-fetch the first training sample so this cell works on its own.
image, label = train_data[0]
print(f"Image shape: {image.shape}")
# squeeze() drops size-1 dimensions before plotting
# (assumes the image is [1, height, width] — TODO confirm).
plt.imshow(image.squeeze())
# Figure A is titled with the raw label value, not the class name.
plt.title(label)

# Observe image in grayscale (Figure B)
# NOTE(review): no new figure is created here, so A and B are separate plots
# only if these statements run in separate notebook cells — confirm.
plt.imshow(image.squeeze(), cmap="gray")
plt.title(class_names[label])
# Hide the axis ticks and frame.
plt.axis(False)

# Plot more images (Figure C)
# Seed first so the same random samples are drawn on every run.
torch.manual_seed(42)
rows, cols = 4, 4
fig = plt.figure(figsize=(9, 9))
for i in range(1, rows * cols + 1):
    # Draw one random index into the training set and fetch that sample.
    random_idx = torch.randint(len(train_data), size=(1,)).item()
    img, label = train_data[random_idx]
    # plt.subplot adds the cell to the current figure, i.e. `fig` from above.
    plt.subplot(rows, cols, i)
    plt.imshow(img.squeeze(), cmap="gray")
    plt.title(class_names[label])
    plt.axis(False)

# Prepare Dataloader
from torch.utils.data import DataLoader

# Setup the batch size hyperparameter
BATCH_SIZE = 32

# Turn datasets into iterables (batches)
# Only the training loader shuffles; the test loader keeps dataset order,
# which the confusion-matrix step relies on when it compares predictions
# against test_data.targets.
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Check out created dataloader
# len() of a DataLoader is the number of batches, not the number of samples.
print(f"Dataloaders: {train_dataloader, test_dataloader}")
print(f"Length of train dataloader: {len(train_dataloader)} batches of {BATCH_SIZE}")
print(f"Length of test dataloader: {len(test_dataloader)} batches of {BATCH_SIZE}")

# Check what's inside the training dataloader
# Pull a single (features, labels) batch from a fresh iterator. Because the
# training loader shuffles, this draws from the global RNG.
train_features_batch, train_labels_batch = next(iter(train_dataloader))
# Bare expression: displayed only when it ends a notebook cell.
train_features_batch.shape, train_labels_batch.shape

# Create a convolutional neural network
class ImageModelV2(nn.Module):
    """TinyVGG-style CNN: two convolutional blocks and a linear classifier.

    Architecture follows the CNN Explainer model:
    https://poloclub.github.io/cnn-explainer/

    Each block is Conv2d -> ReLU -> Conv2d -> ReLU -> MaxPool2d. The 3x3
    convolutions use stride 1 and padding 1, so they keep the spatial size;
    only the two 2x2 max-pools shrink it.

    Args:
        input_shape: Number of channels in the input images.
        hidden_units: Number of output channels of every convolution.
        output_shape: Number of logits produced per sample (one per class).
        image_size: ``(height, width)`` of the images the model will receive.
            It sizes the classifier's linear layer. Defaults to ``(28, 28)``,
            the size that was previously hard-coded.

    Attributes:
        flattened_size: Features per sample after both convolutional blocks,
            measured once in ``__init__`` for the configured ``image_size``.
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int,
                 image_size: tuple = (28, 28)):
        super().__init__()
        self.conv_block_1 = nn.Sequential(
            nn.Conv2d(in_channels=input_shape, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.conv_block_2 = nn.Sequential(
            nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=hidden_units, out_channels=hidden_units, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # stride defaults to kernel_size; spelled out to match block 1
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Measure the flattened feature size by pushing one dummy image of the
        # configured size through both blocks, instead of deriving it by hand.
        # randn (not zeros) is kept on purpose: it draws from the global RNG,
        # so replacing it would shift the classifier's seeded initial weights.
        height, width = image_size
        with torch.no_grad():
            dummy_input = torch.randn(1, input_shape, height, width)
            features = self.conv_block_2(self.conv_block_1(dummy_input))
            self.flattened_size = features.view(1, -1).shape[1]

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=self.flattened_size, out_features=output_shape)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of images.

        Args:
            x: Batch shaped ``(batch, input_shape, height, width)`` whose
                height and width match the ``image_size`` given at
                construction; other sizes will not fit the linear layer.

        Returns:
            Tensor of shape ``(batch, output_shape)`` holding raw logits.
        """
        x = self.conv_block_1(x)
        x = self.conv_block_2(x)
        x = self.classifier(x)
        return x

# Create instance of the model_2
# Seed before construction so the initial weights are reproducible.
torch.manual_seed(42)
model_2 = ImageModelV2(
    input_shape=1,
    hidden_units=10,
    output_shape=len(class_names),  # one logit per class
)
# Move the parameters to the selected device (Module.to returns the module).
model_2 = model_2.to(device)

# Create loss function, optimizer and evaluation
def accuracy_function(y_true, y_pred):
    """Return the percentage of predictions that equal their targets.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted labels, compared element-wise to y_true.

    Returns:
        Accuracy as a float between 0 and 100.
    """
    matches = torch.eq(y_true, y_pred)
    n_correct = matches.sum().item()
    # Fraction first, then scale, so the result is a percentage.
    return (n_correct / len(y_pred)) * 100

# Cross-entropy loss applied to the model's outputs
loss_fn = nn.CrossEntropyLoss()
# Plain SGD over every parameter of model_2
optimizer = torch.optim.SGD(model_2.parameters(), lr=0.1)

# Training & Testing CNN Model
from timeit import default_timer as timer

# Train and test model (Figure 1)
from tqdm.auto import tqdm

epochs = 3

# Start the clock only after the imports above, so the time reported for
# model_2 covers the epoch loop and not module import time.
train_time_start_model_2 = timer()
for epoch in tqdm(range(epochs)):
    print(f"Epoch: {epoch}\n---------")
    # train_step and test_step are defined outside this excerpt; presumably
    # each makes one pass over its dataloader — verify against the helpers.
    train_step(model=model_2,
               data_loader=train_dataloader,
               loss_fn=loss_fn,
               optimizer=optimizer,
               accuracy_fn=accuracy_function,
               device=device)

    test_step(model=model_2,
              data_loader=test_dataloader,
              loss_fn=loss_fn,
              accuracy_fn=accuracy_function,
              device=device)

train_time_end_model_2 = timer()
# print_train_time is defined outside this excerpt; its return value is kept
# because it is later used as model_2's entry in the "training_time" column.
total_train_time_model_2 = print_train_time(start=train_time_start_model_2,
                                            end=train_time_end_model_2,
                                            device=device)

# Get model_2 results
# eval_model is defined outside this excerpt. Its result is used below as one
# row of the comparison DataFrame, which reads "model_name" and "model_acc".
# NOTE(review): unlike the train_step/test_step calls, no device argument is
# passed here — confirm eval_model moves batches to the model's device itself.
model_2_results = eval_model(
    model=model_2,
    data_loader=test_dataloader,
    loss_fn=loss_fn,
    accuracy_fn=accuracy_function
)

# Create dataframe to view results of Models
import pandas as pd

# One row per model, in the order model_0, model_1, model_2.
compare_results = pd.DataFrame([model_0_results, model_1_results, model_2_results])

# Training times are listed in the same model order as the rows above.
compare_results["training_time"] = [
    total_train_time_model_0,
    total_train_time_model_1,
    total_train_time_model_2,
]

# Visualise Model Results (Figure 2)
# Horizontal bars: one per model name, bar length = accuracy.
compare_results.set_index("model_name")["model_acc"].plot.barh()
plt.title("Model Accuracy Comparison")
plt.xlabel("Accuracy(%)")
plt.ylabel("Model Name")

# Make and Evaluate Random Predictions
def make_predictions(model: torch.nn.Module, data: list, device: torch.device = device):
    """Return one row of class probabilities for each sample in ``data``.

    Args:
        model: Model to run; it is switched to eval mode.
        data: Individual samples without a batch dimension.
        device: Device each sample is moved to before the forward pass.

    Returns:
        The per-sample probability vectors stacked into one CPU tensor.
    """
    model.eval()
    probabilities = []
    with torch.inference_mode():
        for sample in data:
            # Add a batch dimension of 1, since the model is fed batches.
            batch = sample.unsqueeze(dim=0).to(device)
            # Drop the size-1 dimensions again to get a flat logit vector.
            logits = model(batch).squeeze()
            # Softmax over that vector; collect on the CPU for stacking.
            probabilities.append(logits.softmax(dim=0).cpu())
    return torch.stack(probabilities)

import random
random.seed(51)
test_samples = []
test_labels = []

# Sample 9 indices instead of materialising list(test_data), which would load
# and transform every test image just to keep nine of them. random.sample
# chooses by position, so with the same seed this picks the same nine samples.
for sample_idx in random.sample(range(len(test_data)), k=9):
    sample, label = test_data[sample_idx]
    test_samples.append(sample)
    test_labels.append(label)

# Make Predictions
# pred_probs has one row per sample; argmax over dim=1 gives the class index
# with the highest probability in each row.
pred_probs = make_predictions(model=model_2, data=test_samples)
pred_classes = pred_probs.argmax(dim=1)

# Plot predictions (Figure 2)
# NOTE(review): the accuracy bar chart is also labelled "Figure 2" — confirm
# which figure number this grid should carry.
plt.figure(figsize=(9, 9))
nrows, ncols = 3, 3
for i, sample in enumerate(test_samples):
    # Turn predicted and true class indices into readable names.
    pred_label = class_names[pred_classes[i]]
    truth_label = class_names[test_labels[i]]
    title_text = f"Pred: {pred_label} | Truth: {truth_label}"

    # Green title for a correct prediction, red for a wrong one.
    if pred_label == truth_label:
        title_colour = "g"
    else:
        title_colour = "r"

    # Subplot positions are 1-based, hence i + 1.
    plt.subplot(nrows, ncols, i + 1)
    plt.imshow(sample.squeeze(), cmap="gray")
    plt.title(title_text, fontsize=10, c=title_colour)
    plt.axis(False)

# Making Confusion Matrix (Figure 3)
# Collect the predicted class of every test sample, batch by batch.
model_2.eval()
y_preds = []
with torch.inference_mode():
    for X, y in tqdm(test_dataloader, desc="Making predictions"):
        X = X.to(device)
        y = y.to(device)
        y_logit = model_2(X)
        # Logits -> probabilities over the class dimension -> class index.
        y_prob = y_logit.softmax(dim=1)
        y_pred = y_prob.argmax(dim=1)
        # Keep results on the CPU so they can be concatenated afterwards.
        y_preds.append(y_pred.cpu())

# One flat tensor of predictions, in dataloader order.
y_pred_tensor = torch.cat(y_preds)

# pip install dependencies
# NOTE: the leading "!" is an IPython/Jupyter shell escape; this line is not
# valid in a plain .py script.
!pip install -q torchmetrics -U mlxtend

import torchmetrics
import mlxtend
print(f"mlxtend version: {mlxtend.__version__}")

from torchmetrics import ConfusionMatrix
from mlxtend.plotting import plot_confusion_matrix

# Multiclass confusion matrix with one row/column per class name.
confmat = ConfusionMatrix(num_classes=len(class_names), task='multiclass')
# Predictions are compared against test_data.targets, which relies on
# y_pred_tensor being in dataset order (the test dataloader is unshuffled).
confmat_tensor = confmat(preds=y_pred_tensor, target=test_data.targets)

# The matrix is handed to mlxtend as a NumPy array.
fig, ax = plot_confusion_matrix(
    conf_mat=confmat_tensor.numpy(),
    class_names=class_names,
    figsize=(10, 7)
)
						

🌟 Key Insights:

  • The custom TinyVGG-style model performed significantly better in terms of accuracy than simpler baselines, demonstrating the importance of deeper architecture in capturing spatial hierarchies.
  • A higher-resolution grid (0.01 step size) in the visualisation allowed finer decision boundary analysis.
  • The confusion matrix revealed that similar-looking classes (e.g., shirts vs. T-shirts) were more prone to misclassification, informing ideas for future data augmentation or attention layers.
  • ๐Ÿง—๐Ÿพ Challenge Faced:

    A key challenge was dynamically computing the flattened input size after the convolutional layers to correctly set up the first Linear layer. To solve this, I used a `with torch.no_grad()` block to pass a dummy input through the conv layers and automatically extract the output shape. This approach prevented manual miscalculation and made the model reusable for different input sizes.

    View on GitHub

    โ† Back to Projects