# Download utils from GitHub
!wget -q --show-progress https://raw.githubusercontent.com/CLDiego/uom_fse_dl_workshop/main/colab_utils.txt -O colab_utils.txt
!wget -q --show-progress -x -nH --cut-dirs=3 -i colab_utils.txt
from pathlib import Path
import sys
repo_path = Path.cwd()
if str(repo_path) not in sys.path:
sys.path.append(str(repo_path))
import utils
import shutil
import requests
import random
import torch
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
torch.manual_seed(42)
torch.cuda.manual_seed(42)
random.seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
print("GPU device:", torch.cuda.get_device_name(0))
else:
print("No GPU available. Please ensure you've enabled GPU in Runtime > Change runtime type")
ascent_url = 'https://raw.githubusercontent.com/CLDiego/uom_fse_dl_workshop/main/figs/ascent.jpg'
response = requests.get(ascent_url)
response.raise_for_status()
checker = utils.core.ExerciseChecker("SE04")
1. Convolutional Neural Networks (CNNs)
Definition: Convolutional Neural Networks (CNNs) are a specialized type of neural network designed for processing structured grid-like data, such as images, by using mathematical operations called convolutions.
CNNs have revolutionized computer vision tasks and are the foundation of many modern systems for image recognition, object detection, segmentation, and more. Their architecture is inspired by the organization of the visual cortex in animals, where individual neurons respond to stimuli in restricted regions called receptive fields.
1.1 Why Standard Neural Networks Struggle with Images
Images present unique challenges that make standard fully-connected neural networks inefficient:
Challenge | Description |
---|---|
Spatial Relationships | Standard networks don’t account for spatial relationships between pixels |
Parameter Explosion | A 224×224×3 image would require over 150,000 weights per neuron |
Translation Invariance | Objects can appear anywhere in an image but have the same meaning |
Feature Hierarchy | Images contain low-level features (edges, textures) that compose into higher-level features |

CNNs address these challenges through specialized architecture components that we’ll explore in this workshop.
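To make the parameter-explosion row concrete, here is a minimal sketch (the layer sizes are illustrative, not taken from this workshop) comparing the parameter count of a fully connected layer and a small convolutional layer on a 224×224×3 input:

import torch

# Fully connected: each of the 1,000 hidden neurons connects to all 224*224*3 = 150,528 inputs
fc = torch.nn.Linear(224 * 224 * 3, 1000)

# Convolutional: 16 filters of size 3x3 shared across the whole image
conv = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)

n_fc = sum(p.numel() for p in fc.parameters())
n_conv = sum(p.numel() for p in conv.parameters())
print(f"Fully connected layer parameters: {n_fc:,}")   # ~150.5 million
print(f"Convolutional layer parameters:   {n_conv:,}") # 448 (16*3*3*3 weights + 16 biases)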
Let’s begin by understanding the core operation that gives CNNs their name: convolution.
2. The Convolution Operation
Definition: A convolution in the context of CNNs is a mathematical operation that combines two functions by multiplying them and integrating over their overlapping regions.
In simple terms, convolution involves sliding a small window (called a filter or kernel) over an image and performing an element-wise multiplication between the filter and the pixel values, then summing the results to produce a single output value for each position.
2.1 How Convolution Works
Step | Description |
---|---|
1 | Position the filter at the top-left corner of the image |
2 | Perform element-wise multiplication between the filter and the corresponding image pixels |
3 | Sum all the resulting values to get a single output value |
4 | Move the filter to the next position (typically one pixel to the right) |
5 | Repeat steps 2-4 until the entire image has been covered |

This process creates what’s called a feature map, which highlights specific patterns or features in the image that match the filter pattern.
Note: In deep learning libraries, what’s actually implemented is technically cross-correlation rather than convolution (the filter is not flipped). However, since the filters are learned during training, this distinction doesn’t matter in practice.
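To make steps 1-5 concrete, here is a minimal sketch of the sliding-window operation written with plain nested loops (the toy array, kernel, and function name are illustrative). As the note above says, this is technically cross-correlation, which is what deep learning libraries implement:

import numpy as np

def cross_correlate2d(image: np.ndarray, kernel: np.ndarray) -> np.ndarray:
    """Slide the kernel over the image (stride 1, no padding) and sum the element-wise products."""
    kh, kw = kernel.shape
    out_h = image.shape[0] - kh + 1
    out_w = image.shape[1] - kw + 1
    feature_map = np.zeros((out_h, out_w))
    for i in range(out_h):          # move the window down the image
        for j in range(out_w):      # move the window across the image
            window = image[i:i + kh, j:j + kw]
            feature_map[i, j] = np.sum(window * kernel)  # element-wise multiply, then sum
    return feature_map

# A 5x5 toy "image" and a 3x3 vertical-edge kernel give a 3x3 feature map
toy_image = np.arange(25, dtype=np.float32).reshape(5, 5)
vertical_edge = np.array([[1, 0, -1], [1, 0, -1], [1, 0, -1]], dtype=np.float32)
print(cross_correlate2d(toy_image, vertical_edge))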
Let’s first load an example image to work with:
asc_image = Image.open(BytesIO(response.content)).resize((256, 256))
asc_image
2.2 Key Parameters in Convolution
The convolution operation is governed by several key parameters that affect the output dimensions and characteristics of the feature map. PyTorch provides a convenient way to implement convolutional layers through the torch.nn.Conv2d class. The key parameters include:
Parameter | Description | Effect on Output Dimensions |
---|---|---|
Kernel Size | The dimensions of the filter (e.g., 3×3, 5×5) | Larger kernels reduce output size more |
Stride | How many pixels the filter shifts at each step | Larger strides reduce output dimensions |
Padding | Adding extra pixels around the border | Can preserve input dimensions |
Dilation | Spacing between kernel elements | Increases receptive field without increasing parameters |
Understanding how these parameters affect the output dimensions is crucial for designing effective CNN architectures. The formula for calculating the output dimensions of a convolutional layer is:
\[\text{Output Size} = \left\lfloor\frac{\text{Input Size} - \text{Kernel Size} + 2 \times \text{Padding}}{\text{Stride}} + 1\right\rfloor\]

where $\lfloor \cdot \rfloor$ represents the floor operation (rounding down to the nearest integer).
Note: This formula assumes that both the input and kernel are square, but it can be applied separately to height and width for rectangular inputs and kernels.
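The formula can be sanity-checked empirically by passing a dummy tensor through a torch.nn.Conv2d layer and comparing shapes (the sizes below are illustrative):

import torch

x = torch.randn(1, 1, 28, 28)  # (batch, channels, height, width)

# 3x3 kernel, stride 1, no padding: floor((28 - 3 + 0) / 1) + 1 = 26
conv = torch.nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0)
print(conv(x).shape)     # torch.Size([1, 1, 26, 26])

# 3x3 kernel, stride 2, padding 1: floor((28 - 3 + 2) / 2) + 1 = 14
conv_s2 = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
print(conv_s2(x).shape)  # torch.Size([1, 1, 14, 14])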
Let’s implement a function to calculate the output size of a convolutional layer for various parameter combinations.
# Exercise 1: Calculating Convolutional Output Dimensions 🎯
# Implement a function to calculate the output dimensions after applying convolution
# with different kernel sizes, strides, and padding values.
def calculate_output_size(input_height:int, input_width:int,
kernel_size:int, stride:int=1, padding:int=0) -> tuple:
"""Calculate the output dimensions after applying convolution.
Args:
input_height (int): Height of the input feature map
input_width (int): Width of the input feature map
kernel_size (int): Size of the square kernel
stride (int, optional): Convolution stride. Defaults to 1.
padding (int, optional): Padding size. Defaults to 0.
Returns:
tuple: (output_height, output_width)
"""
# Your code here: Implement the formula for calculating output dimensions
output_height = # Your code here
output_width = # Your code here
return output_height, output_width
# Test the function with different parameters
# Case 1: Standard convolution with a 3x3 kernel, stride=1, no padding
input1 = (28, 28) # e.g., MNIST image size
output1 = # Your code here
# Case 2: Convolution with padding=1 to preserve dimensions
input2 = (224, 224) # e.g., Standard ImageNet size
output2 = # Your code here
# Case 3: Convolution with stride=2 for downsampling
input3 = (128, 128)
output3 = # Your code here
# Case 4: Custom parameters
input4 = (64, 64)
output4 = # Your code here
print(f"Case 1: {input1} → {output1} (3x3 kernel, stride=1, no padding)")
print(f"Case 2: {input2} → {output2} (3x3 kernel, stride=1, padding=1)")
print(f"Case 3: {input3} → {output3} (5x5 kernel, stride=2, padding=2)")
print(f"Case 4: {input4} → {output4} (7x7 kernel, stride=2, padding=3)")
# ✅ Check your answer
answer = {
'output1': output1[0],
'output2': output2[0],
'output3': output3[0],
'output4': output4[0]
}
checker.check_exercise(1, answer)
2.3 What is a Filter?
Definition: A filter (or kernel) is a small matrix used in convolutional operations to extract features from an image. It slides over the image, performing element-wise multiplication and summing the results to produce a single output value.
Filters allow CNNs to learn and detect specific patterns, such as edges, textures, and shapes, by adjusting their weights during training. The concept of filters is central to computer vision tasks, and there are existing filters for common tasks, such as edge detection and blurring. Let’s explore some of these filters and their effects on images.
We are going to try the following filters:
Filter | Kernel | Description |
---|---|---|
Edge Detection | \(\begin{bmatrix} 1 & 0 & -1 \\ 1 & 0 & -1 \\ 1 & 0 & -1 \end{bmatrix}\) | Detects vertical edges |
Sharpening | \(\begin{bmatrix} 0 & -1 & 0 \\ -1 & 5 & -1 \\ 0 & -1 & 0 \end{bmatrix}\) | Enhances edges and details |
Embossing | \(\begin{bmatrix} -2 & -1 & 0 \\ -1 & 1 & 1 \\ 0 & 1 & 2 \end{bmatrix}\) | Creates a 3D effect |
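As a quick illustration of what one of these fixed kernels does, here is a minimal sketch that applies the vertical edge-detection kernel from the table to a grayscale copy of the image using torch.nn.functional.conv2d (the variable names are illustrative; Exercise 2 below builds a more general multi-channel version with torch.nn.Conv2d):

import numpy as np
import torch
import torch.nn.functional as F

# Grayscale copy of the ascent image as a (1, 1, H, W) float tensor
gray = np.array(asc_image.convert("L"), dtype=np.float32)
gray_t = torch.from_numpy(gray).unsqueeze(0).unsqueeze(0)

# Vertical edge-detection kernel, shaped (out_channels, in_channels, kH, kW)
vertical_edge = torch.tensor([[1., 0., -1.],
                              [1., 0., -1.],
                              [1., 0., -1.]]).reshape(1, 1, 3, 3)

edges = F.conv2d(gray_t, vertical_edge, padding=1)  # padding=1 keeps the spatial size
print(edges.shape)  # torch.Size([1, 1, 256, 256])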
# Exercise 2: Designing Convolutional Filters with PyTorch 🎯
# In this exercise, you will implement common filters used in image processing using PyTorch
def apply_filter_pytorch(image, kernel):
"""Apply a convolutional filter to an image using PyTorch.
Args:
image (numpy.ndarray): Input image (grayscale or RGB)
kernel (numpy.ndarray): Convolutional kernel/filter
Returns:
numpy.ndarray: Filtered image
"""
# Make a copy of the image to avoid modifying the original
image_copy = image.copy().astype(np.float32)
# For RGB: rearrange to PyTorch format (B, C, H, W)
image_tensor = # Your code here
channels = # Your code here
# Convert kernel to PyTorch tensor
kernel_tensor = # Your code here
# Create a convolutional layer with our kernel as weights
# Use groups=channels to apply the same kernel to each channel independently
conv_layer = torch.nn.Conv2d(in_channels=channels,
out_channels=channels,
kernel_size=kernel.shape[0],
bias=False,
padding=kernel.shape[0]//2,
groups=channels)
# Set the weights to our kernel
with torch.no_grad():
for i in range(channels):
# Your code here
# Apply convolution
with torch.no_grad():
filtered = # Your code here
# Convert back to numpy array in correct format
filtered_image = # Your code here
# Clip values to be in valid range for images (0-255)
filtered_image = # Your code here
return filtered_image
# Design several common convolutional filters
# 1. Edge detection filter (should highlight boundaries between different regions)
edge_detection_kernel = np.array([
[1, 0, -1],
[0, 0, 0],
[-1, 0, 1]
])
# 2. Sharpening filter (should enhance details by increasing contrast)
sharpen_kernel = np.array([
[0, -1, 0],
[-1, 5, -1],
[0, -1, 0]
])
# 3. Embossing filter (should give a 3D effect by highlighting edges with shadows)
emboss_kernel = np.array([
[-2, -1, 0],
[-1, 1, 1],
[0, 1, 2]
])
# Load a test image - use the ascent image we loaded earlier
test_image = np.array(asc_image)
# Apply the filters to the test image using PyTorch
edge_detect_image = apply_filter_pytorch(test_image, edge_detection_kernel)
sharpened_image = apply_filter_pytorch(test_image, sharpen_kernel)
embossed_image = apply_filter_pytorch(test_image, emboss_kernel)
# Display the results
fig, axes = plt.subplots(2, 2, figsize=(6, 6))
axes[0, 0].imshow(test_image)
axes[0, 0].set_title("Original Image")
axes[0, 0].axis('off')
axes[0, 1].imshow(edge_detect_image)
axes[0, 1].set_title("Edge Detection")
axes[0, 1].axis('off')
axes[1, 0].imshow(sharpened_image)
axes[1, 0].set_title("Sharpening")
axes[1, 0].axis('off')
axes[1, 1].imshow(embossed_image)
axes[1, 1].set_title("Embossing")
axes[1, 1].axis('off')
plt.tight_layout()
plt.show()
# ✅ Check your answer
answer = {
'edge_detection_kernel': edge_detection_kernel,
'sharpen_kernel': sharpen_kernel,
'emboss_kernel': emboss_kernel
}
checker.check_exercise(2, answer)
Above we used predefined filters, but in practice, the filters are learned during training. The network learns to adjust the filter weights to detect relevant features for the specific task at hand.
Let’s see what the output of a simple convolution operation looks like.
conv2d = torch.nn.Conv2d(
in_channels=3,
out_channels=3,
kernel_size=3,
stride=1,
padding=12,
)
torch.nn.init.xavier_uniform_(conv2d.weight)
# Change the shape of the image to (C, H, W)
torch_asc = torch.from_numpy(np.array(asc_image)).permute(2,0,1)
torch_asc = torch_asc.unsqueeze(0).float() # Add batch dimension
conv2d.eval()
filtered_asc = conv2d(torch_asc)
# Reverse the transformation to get back to (H, W, C)
filtered_asc = filtered_asc.squeeze(0).detach().numpy().transpose(1, 2, 0)
# Make sure the values are in the range [0, 255]
# and convert to uint8 for PIL
filtered_asc = np.clip(filtered_asc, 0, 255).astype(np.uint8)
filtered_asc_img = Image.fromarray(filtered_asc)
filtered_asc_img
3. Preparing image data
Since images can be seen as 3D tensors, we need to convert them into a format suitable for processing. In PyTorch, images are typically represented as 4D tensors with the shape (batch_size, channels, height, width). For a single image, the shape would be (1, 3, height, width).
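For example, converting the PIL image loaded above into this 4D layout by hand looks like this (a small sketch; the next subsection shows the more convenient torchvision route):

import numpy as np
import torch

arr = np.array(asc_image)                               # PIL gives (H, W, C): (256, 256, 3)
img_t = torch.from_numpy(arr).permute(2, 0, 1).float()  # rearrange to (C, H, W)
batch = img_t.unsqueeze(0)                              # add a batch dimension: (1, 3, 256, 256)
print(batch.shape)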
To prepare the image data, we will use the torchvision library, which provides convenient functions for loading and transforming images.
3.1 Torchvision transforms
Python uses PIL (the Python Imaging Library) to handle images, and while PIL is great for basic image manipulation, it can be slow for large datasets. To speed up the process, we can use torchvision.transforms, which provides a set of common image transformations that can be applied more efficiently.
Transform | PyTorch Function | Description |
---|---|---|
Resize | transforms.Resize(size) | Resizes the image to the specified size |
CenterCrop | transforms.CenterCrop(size) | Crops the image at the center to the specified size |
RandomCrop | transforms.RandomCrop(size) | Crops the image randomly to the specified size |
RandomHorizontalFlip | transforms.RandomHorizontalFlip(p) | Flips the image horizontally with probability p |
RandomRotation | transforms.RandomRotation(degrees) | Rotates the image randomly within the specified degrees |
Normalize | transforms.Normalize(mean, std) | Normalizes the image tensor with the specified mean and standard deviation |
ColorJitter | transforms.ColorJitter(brightness, contrast, saturation, hue) | Randomly changes the brightness, contrast, saturation, and hue of the image |
ToTensor | transforms.ToTensor() | Converts the image to a PyTorch tensor |
These transformations can be combined to create a preprocessing pipeline that prepares the images for training. The transforms.Compose function allows us to chain multiple transformations together.
Notes:
- Resizing is important because CNNs require fixed-size inputs.
- The ToTensor transformation converts the image to a PyTorch tensor and also scales the pixel values to the range [0, 1].
- Normalization is a common practice in deep learning to ensure that the input data has a mean of 0 and a standard deviation of 1. This helps the model converge faster during training.
Snippet 1: Composing transformations
from torchvision import transforms
ts = transforms.Compose([
transforms.Resize((224, 224)), # Resize to 224x224
transforms.ToTensor(), # Convert to tensor
transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) # Normalize
])
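Assuming the pipeline above has been run, applying it to the image we loaded earlier returns a normalised tensor ready for a model (a quick usage sketch):

img_t = ts(asc_image)            # PIL image in, tensor out
print(img_t.shape)               # torch.Size([3, 224, 224])
print(img_t.min(), img_t.max())  # roughly in [-1, 1] after Normalize with mean=std=0.5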
# Exercise 3: Implementing Image Transformations 🎯
# In this exercise, you will implement and visualize various image transformations
# commonly used in computer vision tasks
from torchvision import transforms
def apply_transformations(image_path):
"""Apply and visualize various image transformations.
Args:
image_path (str or Path): Path to the input image
Returns:
dict: Dictionary of transformed images
"""
# Load the image
img = Image.open(image_path) if isinstance(
image_path, (str, Path)) else image_path
# Define transformations
# 1. Basic resize to 128x128
resize_transform = transforms.Compose([
# Your code here
# Your code here
])
# 2. Center crop transformation
center_crop_transform = transforms.Compose([
# Your code here: Resize the smaller edge to 150 pixels
# Your code here: Crop a 100x100 square from the center
# Your code here
])
# 3. Random crop transformation
random_crop_transform = transforms.Compose([
transforms.Resize(150),
# Your code here: random crop of size 100x100
# Your code here
])
# 4. Random horizontal flip transformation
hflip_transform = transforms.Compose([
transforms.Resize((128, 128)),
# Your code here: 100% flip probability
# Your code here
])
# 5. Random rotation transformation
rotate_transform = transforms.Compose([
transforms.Resize((128, 128)),
# Your code here: random rotation
# Your code here
])
# 6. Color jitter transformation
color_transform = transforms.Compose([
transforms.Resize((128, 128)),
# Your code here: color jitter with brightness, contrast, saturation and hue
# Your code here
])
# 7. Combined transformations (practical data augmentation)
combined_transform = transforms.Compose([
# Your code here: random resized crop of size 128x128
# Your code here: random horizontal flip
# Your code here: random rotation of 15 degrees
# Your code here: color jitter with brightness and contrast
transforms.ToTensor()
])
# 8. Normalization transformation
norm_transform = transforms.Compose([
transforms.Resize((128, 128)),
transforms.ToTensor(),
# Your code here: ImageNet normalization with mean and std [0.485, 0.456, 0.406] and [0.229, 0.224, 0.225]
# Your code here
])
transforms_list = {
'Original': transforms.ToTensor(),
'Resized': resize_transform,
'Center Crop': center_crop_transform,
'Random Crop': random_crop_transform,
'Horizontal Flip': hflip_transform,
'Random Rotation': rotate_transform,
'Color Jitter': color_transform,
'Combined': combined_transform,
'Normalized': norm_transform
}
# Apply transformations
transforms_dict = {
'Original': transforms.ToTensor()(img),
'Resized': resize_transform(img),
'Center Crop': center_crop_transform(img),
'Random Crop': random_crop_transform(img),
'Horizontal Flip': hflip_transform(img),
'Random Rotation': rotate_transform(img),
'Color Jitter': color_transform(img),
'Combined': combined_transform(img),
'Normalized': norm_transform(img)
}
return transforms_dict, transforms_list
# Apply transformations to an image and visualize the results
# Use the ascent image we loaded earlier as a test image
ts_dict, ts_list = apply_transformations(asc_image)
utils.plotting.se04_visualize_transformations(ts_dict)
# ✅ Check your answer
answer = {
'resize_transform': ts_list['Resized'],
'center_crop_transform': ts_list['Center Crop'],
'random_crop_transform': ts_list['Random Crop'],
'hflip_transform': ts_list['Horizontal Flip'],
'rotate_transform': ts_list['Random Rotation'],
'color_transform': ts_list['Color Jitter'],
'norm_transform': ts_list['Normalized'],
}
checker.check_exercise(3, answer)
3.2 Historical Crack Dataset
In this session, we are going to work with the Historical-crack18-19 dataset, which contains annotated images for non-invasive surface crack detection in historical buildings. The goal is to train a model that can accurately identify cracks in these images. Manual visual inspection of built environments is time-consuming, labor-intensive, prone to errors, costly, and lacks scalability, so the dataset is designed to facilitate the development of deep learning models for automatic crack detection.
The dataset contains:
Class | Number of Images |
---|---|
Crack | 757 |
No crack | 3,139 |
As we can see, the dataset is highly imbalanced, with far more images without cracks than with them. This imbalance can affect the performance of the model, as it may learn to predict the majority class (no crack) much more often than the minority class (crack). As a first step, we are going to take a subset of the dataset to balance the classes.
data_path = Path.cwd() / "datasets"
dataset_path = utils.data.download_dataset("historical cracks",
dest_path=data_path,
extract=True,
remove_compressed=True)
img_crack = dataset_path / "crack"
img_no_crack = dataset_path / "non-crack"
# Create a new folder for the balanced dataset
balanced_data_path = dataset_path / "balanced"
balanced_data_path.mkdir(parents=True, exist_ok=True)
# Create train, test, and validation folders
train_folder = balanced_data_path / "train"
train_folder.mkdir(parents=True, exist_ok=True)
test_folder = balanced_data_path / "test"
test_folder.mkdir(parents=True, exist_ok=True)
val_folder = balanced_data_path / "val"
val_folder.mkdir(parents=True, exist_ok=True)
for folder in [train_folder, test_folder, val_folder]:
(folder / "crack").mkdir(parents=True, exist_ok=True)
(folder / "no_crack").mkdir(parents=True, exist_ok=True)
crack_images = list(img_crack.glob("*.jpg"))
no_crack_images = random.sample(list(img_no_crack.glob("*.jpg")), len(crack_images))
# Shuffle the images
random.shuffle(crack_images)
random.shuffle(no_crack_images)
# Split the images into train, test, and validation sets
train_ix = int(0.8 * len(crack_images))
val_ix = int(0.9 * len(crack_images))
test_ix = len(crack_images)
train_crack_images = crack_images[:train_ix]
val_crack_images = crack_images[train_ix:val_ix]
test_crack_images = crack_images[val_ix:test_ix]
train_no_crack_images = no_crack_images[:train_ix]
val_no_crack_images = no_crack_images[train_ix:val_ix]
test_no_crack_images = no_crack_images[val_ix:test_ix]
# Copy the images to the new folders
for img in train_crack_images:
shutil.copy(img, train_folder / "crack")
for img in train_no_crack_images:
shutil.copy(img, train_folder / "no_crack")
for img in val_crack_images:
shutil.copy(img, val_folder / "crack")
for img in val_no_crack_images:
shutil.copy(img, val_folder / "no_crack")
for img in test_crack_images:
shutil.copy(img, test_folder / "crack")
for img in test_no_crack_images:
shutil.copy(img, test_folder / "no_crack")
# Randomly select 5 images
random_images = random.sample(crack_images, 5)
# Display the images
fig, axes = plt.subplots(1, 5, figsize=(15, 5))
for ax, img_path in zip(axes, random_images):
img = Image.open(img_path)
ax.imshow(img)
ax.axis('off')
ax.set_title(img_path.stem)
plt.tight_layout()
print (f"Images size: {img.size}")
3.3 PyTorch ImageFolder
While we can load images using PIL, PyTorch provides a more efficient way to handle large datasets through the torchvision.datasets module. This module contains the ImageFolder class, which allows us to load images from a directory structure where each subdirectory represents a class, and which automatically assigns labels based on the subdirectory names.
The ImageFolder class requires a root directory containing subdirectories for each class. The directory structure should look like this:
dataset/
├── class-1/
│ ├── image1.jpg
│ ├── image2.jpg
│ └── ...
└── class-2/
├── image1.jpg
├── image2.jpg
└── ...
As noted above, labels are assigned automatically from the subdirectory names; transformations can be applied to the images through the transform parameter.
The key parameters of the ImageFolder class are:
Parameter | Description |
---|---|
root | The root directory containing the dataset |
transform | A function/transform to apply to the images |
target_transform | A function/transform to apply to the target (label) |
loader | A function to load the images (default is PIL.Image.open ) |
is_valid_file | A function to check if a file is valid (default is None ) |
Snippet 2: Using ImageFolder
from torchvision.datasets import ImageFolder
dataset = ImageFolder(root='path/to/dataset', transform=ts)
# Accessing the first image and its label
image, label = dataset[0]
print(f"Image shape: {image.shape}, Label: {label}")
# Exercise 4: Data Augmentation and Loading with PyTorch 🎯
# Implement:
# 1. Data augmentation techniques
# 2. Data loading with ImageFolder
from torch.utils.data import random_split, DataLoader
from torchvision.datasets import ImageFolder
ts_train = transforms.Compose([
# Your code here: Add transforms for training data
# Resize to 64x64, add random horizontal flip
# Add random rotation, add color jitter
# Convert to tensor
# Your code here
])
ts_test_val = transforms.Compose([
# Your code here: Add transforms for test/val data
# We only need to resize and convert to tensor for test/val
# Your code here
])
# Create datasets using ImageFolder
train_data = # Your code here
test_data = # Your code here
val_data = # Your code here
# ✅ Check your answer
answer = {
'train_transforms': ts_train,
'test_val_transforms': ts_test_val,
'train_data': train_data,
'test_data': test_data,
'val_data': val_data
}
checker.check_exercise(4, answer)
3.4 PyTorch DataLoaders
As we discussed in the previous session, when training a model we need to load the data in batches. PyTorch provides the DataLoader class to handle this efficiently: it takes a dataset and returns an iterable that yields the data batch by batch.
The model expects our image data to be formatted as a 4D tensor with the shape (batch_size, channels, height, width).

The DataLoader class provides several key parameters to customize the data loading process:
Parameter | Description |
---|---|
dataset | The dataset to load data from (e.g., ImageFolder ) |
batch_size | The number of samples per batch |
shuffle | Whether to shuffle the data at every epoch |
num_workers | The number of subprocesses to use for data loading |
pin_memory | Whether to pin memory for faster data transfer to GPU |
drop_last | Whether to drop the last incomplete batch |
Snippet 3: Creating a DataLoader
from torch.utils.data import DataLoader
# Create a DataLoader for the dataset
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
# Iterate through the DataLoader
for images, labels in train_loader:
print(f"Batch shape: {images.shape}, Labels: {labels}")
break # Just to show the first batch
# Exercise 5: DataLoader 🎯
# Create DataLoaders for train, test, and validation data
# Your code here: create train_dl with batch_size=32 and shuffle=True
train_dl = # Your code here
# Your code here: create test_dl with batch_size=32 and shuffle=False
test_dl = # Your code here
# Your code here: create val_dl with batch_size=32 and shuffle=False
val_dl = # Your code here
# ✅ Check your answer
answer = {
'train_dataloader': train_dl,
'test_dataloader': test_dl,
'val_dataloader': val_dl,
'batch_size': train_dl.batch_size
}
checker.check_exercise(5, answer)
4. Implementing CNNs
The architecture of a CNN is not that different from a standard neural network. The main difference is that CNNs use convolutional layers instead of fully connected layers to extract features, and, as in standard networks, each convolutional layer is typically followed by a non-linear activation function (such as ReLU).
The output of the convolutional layers is then flattened and passed through one or more fully connected layers to produce the final output. We therefore need to keep track of the output size after each layer to ensure that the dimensions match up correctly.
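A common trick for keeping track of these sizes is to push a dummy tensor through the convolutional part and read off the flattened size (a minimal sketch with illustrative layer sizes, not the exercise solution):

import torch

conv = torch.nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
with torch.no_grad():
    dummy = torch.zeros(1, 3, 64, 64)        # one fake 64x64 RGB image
    out = conv(dummy)                        # padding=1 keeps the spatial size: (1, 16, 64, 64)
    flat_features = out.flatten(1).shape[1]  # 16 * 64 * 64 = 65,536
print(flat_features)  # use this as in_features of the first Linear layer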
A diagram of a conventional CNN architecture is shown below.

We are going to implement a simple CNN architecture for the crack detection task. The architecture consists of the following layers:
Type | Layer | Input Size | Output Size | Activation Function |
---|---|---|---|---|
Convolution | Conv2d | (3, 64, 64) | (16, 64, 64) | ReLU |
Fully Connected | Linear | (16 * 64 * 64) | 16 | ReLU |
Fully Connected | Linear | 16 | 2 | None |
# Exercise 6: Implementing a Simple CNN Model 🎯
class simpleCNN(torch.nn.Module):
def __init__(self, n_classes):
super(simpleCNN, self).__init__()
# Your code here: create a convolutional layer with 3 input channels, 16 output channels
# kernel_size=3, stride=1, padding=1
self.conv1 = # Your code here
# Your code here: create a fully connected layer (Linear) with input 16*64*64 and output 16
self.fc1 = # Your code here
# Your code here: create a fully connected layer (Linear) with input 16 and output n_classes
self.fc2 = # Your code here
def forward(self, x):
# Your code here: Apply ReLU activation to conv1 output
x = # Your code here
# Your code here: Flatten the tensor
x = # Your code here
# Your code here: Apply ReLU activation to fc1 output
x = # Your code here
# Your code here: Feed through fc2
x = # Your code here
return x
model_v1 = simpleCNN(len(train_data.classes))
criterion = torch.nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model_v1.parameters(), lr=3e-3)
num_epochs = 10
# ✅ Check your answer
answer = {
'model_architecture': model_v1,
'conv_layer': model_v1.conv1,
'linear_layers': {'count': 2, 'output_features': model_v1.fc2.out_features},
'activation': {'function': 'ReLU', 'count': 2},
}
checker.check_exercise(6, answer)
model_v1 = utils.ml.train_model(model_v1,
criterion,
optimiser,
train_loader=train_dl,
val_loader=val_dl,
num_epochs=num_epochs,
plot_loss=True)
4.1 Getting predictions
The last layer outputs raw scores (logits) for the two classes: crack and no crack. To turn these into class probabilities, we would apply a softmax function. However, PyTorch's CrossEntropyLoss combines the softmax and the negative log-likelihood loss in a single function, so we don't need to apply softmax explicitly: CrossEntropyLoss expects the raw logits (the output of the last layer) as input and applies softmax internally.
To predict the class, we can use the torch.argmax function (or torch.max, which returns both values and indices) to get the index of the maximum value in the output tensor. This index corresponds to the predicted class.
Snippet 4: Obtaining the predicted class
model_v1.eval()
device = next(model_v1.parameters()).device  # run inference on the same device as the model
with torch.no_grad():
    for images, labels in test_dl:
        outputs = model_v1(images.to(device))
        print(outputs[:5])  # first few rows of raw logits
        _, predicted = torch.max(outputs, dim=1)
        print(predicted)
        break
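If we also want actual probabilities (for example, to report a confidence score), we can apply softmax to the logits ourselves; taking the argmax of the logits or of the probabilities gives the same predicted class. A small sketch, following on from the snippet above:

probs = torch.softmax(outputs, dim=1)   # convert raw logits to class probabilities
predicted = torch.argmax(probs, dim=1)  # same indices as torch.max(outputs, dim=1)[1]
print(probs[:3])                        # each row sums to 1 across the two classes
print(predicted[:3])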
# Exercise 7: Evaluating the Model 🎯
# Compute accuracy and classification report
# Your code here: Use the utils.ml.compute_accuracy function to compute accuracy on the test set
acc = # Your code here
print(f"Test accuracy: {acc*100:.2f}%")
# Your code here: Use the utils.ml.compute_classification_report function to compute the classification report
cls_report = # Your code here
print('-' * 60)
print(f"Classification Report:\n{cls_report}")
# ✅ Check your answer
answer = {
'test_accuracy': acc,
'classification_report': cls_report
}
checker.check_exercise(7, answer)
# Visualize the model predictions
utils.plotting.show_model_predictions(model_v1, test_dl, class_names=train_data.classes)
4.2 Recreating CNN architectures
There are many different CNN architectures that have been proposed in the literature, each with its own strengths and weaknesses. Some of the most popular architectures include:
Architecture | Description | Key Features |
---|---|---|
LeNet | One of the first CNN architectures, designed for handwritten digit recognition | 5 layers, small kernel sizes |
AlexNet | A deeper architecture that won the ImageNet competition in 2012 | 8 layers, ReLU activation, dropout, data augmentation |
VGG | A very deep architecture with small kernel sizes | 16-19 layers, uniform architecture, small kernels |
ResNet | Introduced residual connections to allow for very deep networks | 50-152 layers, skip connections, batch normalization |
Inception | Introduced the inception module for multi-scale feature extraction | 22-164 layers, parallel convolutions, pooling layers |
DenseNet | Introduced dense connections between layers | 121-201 layers, dense connections, feature reuse |
These architectures have been shown to perform well on a variety of tasks, and they can be used as a starting point for building custom CNNs. Furthermore, many of these architectures are often visualised as a series of blocks, where each block consists of a convolutional layer followed by an activation function and a pooling layer. We are going to implement a version of the original VGG architecture, which looks like this:

4.3 Pooling
Definition: Pooling is a downsampling operation used in CNNs to reduce the spatial dimensions of feature maps while retaining important information.
Pooling helps to reduce the number of parameters and computations in the network, making it more efficient and less prone to overfitting. There are several types of pooling operations, but the most common ones are:
Pooling Type | PyTorch Function | Description |
---|---|---|
Max Pooling | torch.nn.MaxPool2d(kernel_size, stride) | Takes the maximum value in each region defined by the kernel size |
Average Pooling | torch.nn.AvgPool2d(kernel_size, stride) | Takes the average value in each region defined by the kernel size |
Global Average Pooling | torch.nn.AdaptiveAvgPool2d(output_size) | Reduces each feature map to a single value by averaging over the entire map |
Global Max Pooling | torch.nn.AdaptiveMaxPool2d(output_size) | Reduces each feature map to a single value by taking the maximum over the entire map |
The pooling operation can be visualised like this:

Most commonly, we use max pooling, as it helps to retain the most important features while discarding less relevant information. The pooling operation is typically applied after a convolutional layer and an activation function.
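A minimal sketch of the shape change (sizes are illustrative): a 2×2 max pool with stride 2 halves the height and width while leaving the number of channels unchanged.

import torch

pool = torch.nn.MaxPool2d(kernel_size=2, stride=2)
x = torch.randn(1, 16, 64, 64)  # (batch, channels, height, width)
print(pool(x).shape)            # torch.Size([1, 16, 32, 32]), spatial dimensions halved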
4.4 Regularisation
As briefly mentioned in the previous session, regularisation is a technique used to prevent overfitting in machine learning models. Overfitting occurs when a model learns the training data too well, including noise and outliers, leading to poor generalisation on unseen data. In CNNs, regularisation techniques are crucial due to the large number of parameters and the complexity of the models. Some common regularisation techniques used in CNNs include:
Regularisation Technique | PyTorch Implementation | Description |
---|---|---|
Dropout | torch.nn.Dropout(p) | Randomly sets a fraction of input units to 0 at each update during training, which helps prevent overfitting |
L2 Regularisation | weight_decay argument of torch.optim optimisers | Adds a penalty on the size of the weights to the loss function. This is also known as weight decay |
Batch Normalisation | torch.nn.BatchNorm2d(num_features) | Normalises the output of a previous activation layer by subtracting the batch mean and dividing by the batch standard deviation. This helps to stabilise the learning process and can lead to faster convergence |
Data Augmentation | torchvision.transforms | Increases the diversity of the training set by applying random transformations to the input data, such as rotation, translation, and scaling. This helps to improve the generalisation of the model |
Early Stopping | Implemented in the training loop (no dedicated PyTorch module) | Stops training when the validation loss stops improving, preventing overfitting |
Weight Decay | torch.optim.AdamW | Adds a penalty on the size of the weights to the loss function via decoupled weight decay, similar to L2 regularisation |
For our tiny VGG architecture, we are going to use dropout and batch normalisation. The dropout layer is applied after the activation function of the fully connected layers, while the batch normalisation layer is applied after the convolutional layers.
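As a small sketch of how these two modules behave (the layer sizes are illustrative, not those of the tiny VGG): dropout only zeroes activations in training mode, while batch normalisation uses batch statistics during training and running statistics at evaluation time.

import torch

bn = torch.nn.BatchNorm2d(num_features=8)  # one scale/shift pair per channel
drop = torch.nn.Dropout(p=0.5)
x = torch.randn(4, 8, 16, 16)              # (batch, channels, height, width)

bn.train(); drop.train()
y_train = drop(bn(x))                      # batch statistics used, ~half the activations zeroed

bn.eval(); drop.eval()
y_eval = drop(bn(x))                       # running statistics used, dropout is a no-op
print(y_train.shape, y_eval.shape)         # neither layer changes the tensor shape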
Our tiny VGG architecture will look like this:

# Exercise 8: Implementing a More Complex CNN Model 🎯
class tinyVGG(torch.nn.Module):
def __init__(self, n_classes):
super().__init__()
# Create convolutional layers
# Your code here: conv1 with 3 input channels, 16 output channels
self.conv1 = # Your code here
# Your code here: conv2 with 16 input channels, 32 output channels
self.conv2 = # Your code here
# Your code here: conv3 with 32 input channels, 64 output channels
self.conv3 = # Your code here
# Add pooling layers
# Your code here: Create max pooling layer with kernel_size=2, stride=2
self.pool = # Your code here
# Your code here: Create flatten layer
self.flat = # Your code here
# Adjust input size for fully connected layer due to pooling
# Your code here: Create fc1 with input 64*8*8 and output 128
self.fc1 = # Your code here
# Your code here: Create fc2 with input 128 and output n_classes
self.fc2 = # Your code here
# dropout for regularization
# Your code here: Create dropout1 with p=0.05
self.dropout1 = # Your code here
# Your code here: Create dropout2 with p=0.1
self.dropout2 = # Your code here
# batch normalization for more stable training
# Your code here: Create batch_norm1 for 16 features
self.batch_norm1 = # Your code here
# Your code here: Create batch_norm2 for 32 features
self.batch_norm2 = # Your code here
# Your code here: Create batch_norm3 for 64 features
self.batch_norm3 = # Your code here
def forward(self, x):
# Your code here: Apply conv1, batch_norm1, ReLU, and pooling
x = # Your code here
x = # Your code here
# Your code here: Apply conv2, batch_norm2, ReLU, and pooling
x = # Your code here
x = # Your code here
# Your code here: Apply conv3, batch_norm3, ReLU, and pooling
x = # Your code here
x = # Your code here
# Your code here: Flatten the tensor
x = # Your code here
# Your code here: Apply fc1, ReLU, and dropout2
x = # Your code here
# Your code here: Apply fc2
x = # Your code here
return x
# ✅ Check your answer
model_v2 = tinyVGG(len(train_data.classes))
answer = {
'model_architecture': model_v2,
'conv_layers': model_v2,
'pooling_layers': model_v2.pool,
'batch_norm_layers': [model_v2.batch_norm1, model_v2.batch_norm2, model_v2.batch_norm3],
'dropout_layers': [model_v2.dropout1, model_v2.dropout2],
'flatten_operation': model_v2.flat
}
checker.check_exercise(8, answer)
model_v2 = tinyVGG(len(train_data.classes))
criterion_reg = torch.nn.CrossEntropyLoss()
optimiser_reg = torch.optim.Adam(model_v2.parameters(),
lr=1e-3,
betas=(0.9, 0.999),
# weight_decay=1e-5, # L2 regularization (weight decay)
)
num_epochs_reg = 45
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
optimiser_reg,
mode='min',
factor=0.1,
patience=2,
)
model_v2 = utils.ml.train_model(model_v2,
criterion_reg,
optimiser_reg,
train_loader=train_dl,
val_loader=val_dl,
num_epochs=num_epochs_reg,
early_stopping=True,
patience=5,
tolerance=1e-2,
save_path= Path.cwd() / "my_models" / "se04_model_v2.pt",
plot_loss=True)
# Load the best model
model_v2.load_state_dict(torch.load(Path.cwd() / "my_models" / "se04_model_v2.pt"))
# Exercise 9: Evaluating the tiny VGG🎯
# Your code here: Use the utils.ml.compute_accuracy function to compute accuracy on the test set
acc = # Your code here
print(f"Test accuracy: {acc*100:.2f}%")
# Your code here: Use the utils.ml.compute_classification_report function to compute the classification report
cls_report = # Your code here
print('-' * 60)
print(f"Classification Report:\n{cls_report}")
# ✅ Check your answer
answer = {
'test_accuracy': acc,
'classification_report': cls_report
}
checker.check_exercise(9, answer)
utils.plotting.show_model_predictions(model_v2, test_dl, class_names=train_data.classes, num_images=12)