# Download utils from GitHub
!wget -q --show-progress https://raw.githubusercontent.com/CLDiego/uom_fse_dl_workshop/main/colab_utils.txt -O colab_utils.txt
!wget -q --show-progress -x -nH --cut-dirs=3 -i colab_utils.txt
from pathlib import Path
import sys
repo_path = Path.cwd()
if str(repo_path) not in sys.path:
    sys.path.append(str(repo_path))
import utils
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import pandas as pd
import torch
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU device:", torch.cuda.get_device_name(0))
else:
    print("No GPU available. Please ensure you've enabled GPU in Runtime > Change runtime type")
checker = utils.core.ExerciseChecker("SE03")
quizzer = utils.core.QuizManager("SE03")
1. PyTorch workflow
In the previous session we looked at the basics of neural networks and how to train a single-layer perceptron. In this session we will look at the PyTorch framework and how to use it to build and train neural networks.
Most deep learning projects follow a similar workflow. The following figure illustrates the typical workflow of a PyTorch project:

The workflow consists of the following steps:
Step | Description |
---|---|
Obtain Data | Collect and preprocess the data for training and testing |
Prepare Data | Set up the data in PyTorch format |
Pre-process Data | Normalize and augment the data. This may involve data cleaning, normalization, and splitting the data into training, validation, and test sets. |
Activation Function | Choose an activation function for the model. This may involve selecting a suitable activation function for the model, such as ReLU, sigmoid, or tanh. |
Model | Define the model architecture. |
Choose optimiser | Select an optimiser for the model. |
Choose loss function | Select a loss function for the model. |
Create training loop | Define the training steps, including forward pass, backward pass, and parameter updates. |
Fit model | Train the model using the training data. |
Evaluate model | Evaluate the model using the validation and test data |
Improve model | Fine-tune the model by adjusting hyperparameters, adding regularization, or modifying the architecture. |
Save or deploy model | Save the trained model for future use or deploy it in a production environment. |
Step 1: Obtain Data
In this notebook we are going to use the ARKOMA dataset. The dataset is intended as a benchmark for building neural networks that perform inverse kinematics for the arms of a NAO robot. It contains data for two robotic arms: the left arm and the right arm. The data is generated using a physics engine that simulates the movement of the robotic arms in a 3D environment, and it contains 10,000 input-output pairs for each arm. The input data is the end-effector position and orientation of the robotic arm, and the output data is the joint angles of the robotic arm.
The input parameters are:
Notation | Description |
---|---|
$ P_{x} $ | The end-effector position with respect to the torso’s x-axis |
$ P_{y} $ | The end-effector position with respect to the torso’s y-axis |
$ P_{z} $ | The end-effector position with respect to the torso’s z-axis |
$ R_{x} $ | The end-effector orientation relative to the torso’s x-axis |
$ R_{y} $ | The end-effector orientation relative to the torso’s y-axis |
$ R_{z} $ | The end-effector orientation relative to the torso’s z-axis |
The output parameters are:
Notation | Left Arm Joint | Left Arm Range(rad) | Right Arm Joint | Right Arm Range(rad) |
---|---|---|---|---|
$ \theta_{1} $ | LShoulder Pitch | [-2.0857, 2.0857] | RShoulder Pitch | [-2.0857, 2.0857] |
$ \theta_{2} $ | LShoulder Roll | [-0.3142, 1.3265] | RShoulder Roll | [-1.3265, 0.3142] |
$ \theta_{3} $ | LElbow Yaw | [-2.0857, 2.0857] | RElbow Yaw | [-2.0857, 2.0857] |
$ \theta_{4} $ | LElbow Roll | [-1.5446, 0.0349] | RElbow Roll | [-0.0349, 1.5446] |
$ \theta_{5} $ | LWrist Yaw | [-1.8238, 1.8238] | RWrist Yaw | [-1.8238, 1.8238] |
In this notebook, we are going to focus on the right arm. The data is stored in CSV format. To load the data, we will use the pandas library.
data_path = Path(Path.cwd(), 'datasets')
dataset_path = utils.data.download_dataset('ARKOMA',
                                           dest_path=data_path,
                                           extract=True,
                                           remove_compressed=True)
# Set the path to the datasets (already provided above)
right_arm_path = dataset_path / 'Right Arm Dataset'
# Create file paths using a dictionary comprehension and format strings
file_parts = ['Train', 'Val', 'Test']
dataset_files = {
    part: {
        'features': right_arm_path / f'R{part}_x.csv',
        'targets': right_arm_path / f'R{part}_y.csv'
    } for part in file_parts
}
# Unpack into individual variables for compatibility with existing code
feats_train = dataset_files['Train']['features']
targets_train = dataset_files['Train']['targets']
feats_val = dataset_files['Val']['features']
targets_val = dataset_files['Val']['targets']
feats_test = dataset_files['Test']['features']
targets_test = dataset_files['Test']['targets']
Steps 2 and 3: Prepare and Pre-process Data
The next step is to pre-process the data. This involves normalizing the data and splitting it into training, validation, and test sets.
Training, Validation, and Test Sets
One of the crucial steps in machine learning is to split the data into training, validation, and test sets. Each of these sets serves a specific purpose in the model development process:
Dataset | Purpose | Typical Split | Usage | Analogy |
---|---|---|---|---|
Training Set | Used to train the model by adjusting weights and biases through backpropagation | 60-80% | Every training iteration | Like studying materials to learn a subject |
Validation Set | Used to tune hyperparameters and monitor model performance during training to prevent overfitting | 10-20% | During model development | Like practice exams to gauge learning progress |
Test Set | Used only once for final model evaluation; never used for training or tuning | 10-20% | Once, after training | Like a final exam with new, unseen questions |
The ARKOMA dataset has already been split into these three sets for us, which simplifies our workflow.
Note: The test set is our generalisation benchmark. It is important to keep the test set separate from the training and validation sets to ensure that the model’s performance is evaluated on unseen data. This helps us understand how well the model will perform in real-world scenarios.
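The ARKOMA files come pre-split, so we will not need to do this ourselves. For reference only, a common way to produce the three sets from a single dataset is scikit-learn's train_test_split; the sketch below uses made-up arrays and an arbitrary roughly 70/15/15 split, so the array names and proportions are assumptions, not part of this notebook's data.

```python
from sklearn.model_selection import train_test_split
import numpy as np

# Fake data standing in for a feature matrix X and target matrix y
X = np.random.rand(1000, 6)
y = np.random.rand(1000, 5)

# First carve out the test set (15%), then split the remainder into train/validation
X_tmp, X_te, y_tmp, y_te = train_test_split(X, y, test_size=0.15, random_state=42)
X_tr, X_va, y_tr, y_va = train_test_split(X_tmp, y_tmp, test_size=0.15 / 0.85, random_state=42)

print(len(X_tr), len(X_va), len(X_te))  # roughly 700 / 150 / 150
```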
# Load the datasets
# Training set
X_train = pd.read_csv(feats_train)
y_train = pd.read_csv(targets_train)
# Test set
X_test = pd.read_csv(feats_test)
y_test = pd.read_csv(targets_test)
# Validation set
X_val = pd.read_csv(feats_val)
y_val = pd.read_csv(targets_val)
print(f"X_train shape: {X_train.shape} | y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape} | y_test shape: {y_test.shape}")
print(f"X_val shape: {X_val.shape} | y_val shape: {y_val.shape}")
X_train.head()
y_train.head()
Normalisation
Normalisation is a crucial step in the pre-processing of data for machine learning models. It involves scaling the input features to a similar range, which helps improve the convergence speed and performance of the model. In this notebook, we will use Min-Max normalization to scale the input features to a range of [0, 1]. The formula for Min-Max normalization is as follows:
\[X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}}\]

Where:
- $ X_{norm} $ is the normalized value.
- $ X$ is the original value.
- $ X_{min} $ is the minimum value of the feature.
- $ X_{max} $ is the maximum value of the feature.
The normalisation parameters will be computed from the training set and then applied to the validation and test sets. This helps to prevent data leakage and ensures that the model is evaluated on unseen data.
Benefit | Description | Impact on Training |
---|---|---|
Faster Convergence | Normalized inputs lead to better-conditioned optimization | Reduces training time |
Numerical Stability | Prevents extremely large or small values | Reduces risk of gradient explosions/vanishing |
Feature Scaling | Makes all features contribute equally to the model | Prevents certain features from dominating |
Better Generalization | Helps models generalise across different data distributions | Improves performance on unseen data |
Snippet 1: Normalisation using Min-Max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training data
scaler.fit(X_train)
# Transform the training data
X_train_scaled = scaler.transform(X_train)
# Inverse transform the scaled data to get the original values
X_train_original = scaler.inverse_transform(X_train_scaled)
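Snippet 1 only fits and transforms the training set. To avoid the data leakage mentioned above, the validation and test sets must be transformed with the scaler that was fitted on the training data, never re-fitted. A minimal sketch (the `_example` variable names are only for illustration, so they do not clash with Exercise 1 below):

```python
# Reuse the scaler fitted on X_train above -- do NOT call fit() again here,
# otherwise statistics from the validation/test data would leak into training.
X_val_scaled_example = scaler.transform(X_val)
X_test_scaled_example = scaler.transform(X_test)

# Values can fall slightly outside [0, 1] because the min/max come from the training set.
print("Val range: ", X_val_scaled_example.min(), X_val_scaled_example.max())
print("Test range:", X_test_scaled_example.min(), X_test_scaled_example.max())
```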
# Exercise 1: Data Loading and Preprocessing 🎯
# Create PyTorch tensors from the training, validation, and test data
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)
# Create MinMaxScalers for feature and target normalization
x_scaler = # Your code here
y_scaler = # Your code here
# Fit the scalers on training data
x_scaler = # Your code here
y_scaler = # Your code here
# Transform all datasets and put them into tensors
X_train_scaled = # Your code here
X_val_scaled = # Your code here
X_test_scaled = # Your code here
y_train_scaled = # Your code here
y_val_scaled = # Your code here
y_test_scaled = # Your code here
# Check the normalized data range
print(f"X_train normalized range: [{X_train_scaled.min().item():.4f}, {X_train_scaled.max().item():.4f}]")
print(f"y_train normalized range: [{y_train_scaled.min().item():.4f}, {y_train_scaled.max().item():.4f}]")
# ✅ Check your answer
answer = {
'X_train_tensor': X_train_tensor,
'y_train_tensor': y_train_tensor,
'X_train_scaled': X_train_scaled,
'y_train_scaled': y_train_scaled,
'scale_range_min': X_train_scaled.min().item(),
'scale_range_max': X_train_scaled.max().item(),
}
checker.check_exercise(1, answer)
Step 4: Activation Function
The next step is to choose an activation function for the model. The activation function introduces non-linearity to the model, allowing it to learn complex relationships in the data. The following table lists some common activation functions used in neural networks, along with their characteristics and best use cases:
Function | Formula | Range | PyTorch Implementation | Best Used For |
---|---|---|---|---|
ReLU | $\displaystyle f(x) = \max(0, x)$ | $\displaystyle [0, \infty)$ | `torch.nn.ReLU()` | Hidden layers in most networks |
Sigmoid | $\displaystyle f(x) = \frac{1}{1+e^{-x}}$ | $\displaystyle (0, 1)$ | `torch.nn.Sigmoid()` | Binary classification, gates in LSTMs |
Tanh | $\displaystyle f(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$ | $\displaystyle (-1, 1)$ | `torch.nn.Tanh()` | Hidden layers when output normalisation is needed |
Leaky ReLU | $\displaystyle f(x) = \max(\alpha x, x)$ | $\displaystyle (-\infty, \infty)$ | `torch.nn.LeakyReLU(negative_slope=0.01)` | Preventing “dead neurons” problem |
Softmax | $\displaystyle f(x_i) = \frac{e^{x_i}}{\sum_{j} e^{x_j}}$ | $\displaystyle (0, 1)$ | `torch.nn.Softmax(dim=1)` | Multi-class classification output layer |
The choice of activation function depends on the specific problem and the architecture of the neural network.
Tips:
- ReLU is the most commonly used activation function in hidden layers of deep networks due to its simplicity and effectiveness.
- The activation function for the output layer depends on the type of problem being solved (e.g., regression, binary classification, multi-class classification).
Common Mistakes to Avoid:
- Mixing activation functions in the same layer (e.g., using ReLU and sigmoid together) can lead to unexpected behavior.
- Using activation functions that saturate (like sigmoid) in hidden layers can lead to vanishing gradients, making training difficult.
- Forgetting to apply the activation function to the output layer can lead to incorrect predictions (e.g., not using softmax for multi-class classification).
- Not considering the range of the output when choosing the activation function (e.g., using sigmoid for regression tasks).
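To make the ranges in the table above concrete, here is a small sketch that applies a few of the listed activations to the same input values (nothing here is specific to the ARKOMA task):

```python
import torch

x = torch.linspace(-3.0, 3.0, steps=7)

relu = torch.nn.ReLU()
sigmoid = torch.nn.Sigmoid()
tanh = torch.nn.Tanh()

print("x:      ", x)
print("ReLU:   ", relu(x))     # negatives clamped to 0, range [0, inf)
print("Sigmoid:", sigmoid(x))  # squashed into (0, 1)
print("Tanh:   ", tanh(x))     # squashed into (-1, 1)
```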
print("\n🧠 Quiz 1: Choosing the right activation function")
quizzer.run_quiz(1)
print("\n🧠 Quiz 2: Combining activation functions")
quizzer.run_quiz(2)
Step 5: Model
The next step is to define the model architecture. To create a neural network we group multiple neurons into layers and then stack those layers. A layer is a collection of neurons that work together to process the input data. A simple ANN is formed by three types of layers:
- Input Layer: Receives the input data.
- Hidden Layers: Intermediate layers that process the data.
- Output Layer: Produces the final output.

The following table summarises the different types of layers available in PyTorch:
Layer Type | Class | Description | Common Uses |
---|---|---|---|
Fully Connected | `torch.nn.Linear(in_features, out_features)` | Standard dense layer | Classification, regression |
Convolutional | `torch.nn.Conv2d(in_channels, out_channels, kernel_size)` | Spatial feature extraction | Image processing |
Recurrent | `torch.nn.RNN(input_size, hidden_size)` | Sequential data processing | Time series, text |
LSTM | `torch.nn.LSTM(input_size, hidden_size)` | Long-term dependencies | Complex sequences |
Embedding | `torch.nn.Embedding(num_embeddings, embedding_dim)` | Word vector representations | NLP tasks |
BatchNorm | `torch.nn.BatchNorm2d(num_features)` | Normalizes layer inputs | Training stability |
Dropout | `torch.nn.Dropout(p=0.5)` | Randomly zeros elements | Regularization |
The choice of layer type depends on the specific problem and the architecture of the neural network. For example, convolutional layers are commonly used in image processing tasks, while recurrent layers are used for sequential data processing.
Number of Layers and Neurons
The number of layers and neurons in each layer is a hyperparameter that needs to be tuned. The following table summarises the common practices for choosing the number of layers and neurons:
Layer Type | Common Practices |
---|---|
Input Layer | Number of neurons = number of input features |
Hidden Layers | 1-3 hidden layers are common for most tasks. More complex tasks may require more layers. |
Output Layer | Number of neurons = number of output features (e.g., 1 for regression, number of classes for classification) |
Number of Neurons | Powers of two (e.g., 16, 32, 64, 128) are a common choice. Another common practice is to start with a number of neurons comparable to the number of input features and then reduce the number of neurons in each subsequent layer. |
Tips:
- Start with a simple architecture and gradually increase complexity as needed.
- The number of neurons in each layer can be adjusted based on the complexity of the problem.
- Use activation functions after each layer to introduce non-linearity.
- Experiment with different layer types and configurations to find the best architecture for your problem.
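As a concrete (but purely illustrative) example of the input/hidden/output structure above: for this dataset there are 6 end-effector inputs and 5 joint-angle outputs, while the hidden width of 64 below is an arbitrary choice, not a recommendation for the exercises.

```python
import torch

# A minimal sketch of an input -> hidden -> output architecture
mlp = torch.nn.Sequential(
    torch.nn.Linear(6, 64),   # input layer -> hidden layer (6 input features)
    torch.nn.ReLU(),          # non-linearity between layers
    torch.nn.Linear(64, 5),   # hidden layer -> output layer (5 joint angles)
)

dummy_pose = torch.rand(1, 6)   # one fake end-effector pose
print(mlp(dummy_pose).shape)    # torch.Size([1, 5])
```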
# Quiz 3: Network Width
print("\n🧠 Quiz 3: Understanding Network Width for Inverse Kinematics")
quizzer.run_quiz(3)
# Quiz 4: Network Depth
print("\n🧠 Quiz 4: Understanding Network Depth for Inverse Kinematics")
quizzer.run_quiz(4)
# Quiz 5: Regularization Techniques
print("\n🧠 Quiz 5: Regularization Techniques for Kinematics Models")
quizzer.run_quiz(5)
Initialising Weights and Biases
In the previous session we looked at the concept of weights and biases. With our Perceptron we initialised the weights and biases to random values. In PyTorch, we can use different methods to initialise the weights and biases of a neural network.
The importance of initialising weights and biases lies in the fact that they can significantly affect the convergence speed and performance of the neural network. Proper initialisation can help prevent issues such as vanishing or exploding gradients, which can hinder the training process.
Initialisation Method | Formula | PyTorch Code | Description |
---|---|---|---|
Xavier/Glorot Initialisation | $\displaystyle W \sim \mathcal{U}(-\sqrt{\frac{6}{n_{in} + n_{out}}}, \sqrt{\frac{6}{n_{in} + n_{out}}})$ | `torch.nn.init.xavier_uniform_(tensor)` | Suitable for sigmoid and tanh activations. |
He Initialisation | $\displaystyle W \sim \mathcal{U}(-\sqrt{\frac{6}{n_{in}}}, \sqrt{\frac{6}{n_{in}}})$ | `torch.nn.init.kaiming_uniform_(tensor)` | Suitable for ReLU activations. |
Kaiming Normal Initialisation | $\displaystyle W \sim \mathcal{N}(0, \sqrt{\frac{2}{n_{in}}})$ | `torch.nn.init.kaiming_normal_(tensor)` | Suitable for ReLU activations. |
Kaiming Uniform Initialisation | $\displaystyle W \sim \mathcal{U}(-\sqrt{\frac{6}{n_{in}}}, \sqrt{\frac{6}{n_{in}}})$ | `torch.nn.init.kaiming_uniform_(tensor)` | Suitable for ReLU activations. |
Zero Initialisation | $\displaystyle W = 0$ | `torch.nn.init.zeros_(tensor)` | All weights are set to zero. Not recommended. |
Random Initialisation | $\displaystyle W \sim \mathcal{U}(-1, 1)$ | `torch.nn.init.uniform_(tensor, a=-1, b=1)` | Weights are randomly initialised between -1 and 1. |
Tips:
- Use Xavier or He initialisation for most cases, as they are designed to maintain the variance of activations across layers.
- Avoid zero initialisation, as it can lead to symmetry problems where all neurons learn the same features.
- PyTorch uses Kaiming initialisation by default for `torch.nn.Linear` layers, which is suitable for ReLU activations.
- Experiment with different initialisation methods to see their impact on training speed and model performance.
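The exercise below asks you to do this inside a model class; as a standalone illustration of the functions in the table, here is how they can be applied to a single layer's weight and bias tensors (the layer sizes are arbitrary):

```python
import torch

layer = torch.nn.Linear(6, 64)

# He/Kaiming uniform initialisation on the weights (suited to ReLU activations)
torch.nn.init.kaiming_uniform_(layer.weight, nonlinearity='relu')
# Biases are often simply set to zero
torch.nn.init.zeros_(layer.bias)

print(f"weight mean={layer.weight.mean().item():.4f}, std={layer.weight.std().item():.4f}")
```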
# Exercise 2: Model Creation with Weight Initialization 🎯
# In this exercise, you will:
# 1. Create a simple neural network model using PyTorch
# 2. Initialize weights and biases properly
# 3. Define layers with appropriate activation functions
# 4. Implement a forward method
class RobotArmNetwork(torch.nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """Initialize a neural network for robotic arm inverse kinematics
        Args:
            input_size: Number of input features
            hidden_size: Number of neurons in the hidden layer
            output_size: Number of output features
        """
        # Initialize the parent class
        # Your code here
        # Define the layers of your neural network (simple architecture to avoid overfitting)
        self.fc1 = # Your code here
        self.hidden_activation = # Your code here
        self.fc2 = # Your code here
        # Initialize the weights using appropriate initialization techniques
        # He/Kaiming initialization for layers with ReLU activation
        # Your code here
        # Your code here
        # Xavier/Glorot initialization for the output layer
        # Your code here
        # Your code here

    def forward(self, x):
        """Forward pass through the network"""
        # Process input through first fully connected layer and activation function
        x = # Your code here
        # Process through output layer (no activation - we want raw values for regression)
        x = # Your code here
        return x
# Initialize your model with appropriate dimensions
model = # Your code here
# Print your model architecture
print(model)
# Print weight statistics to verify initialization
print("\nWeight initialization validation:")
print(f"First layer weight stats: mean={model.fc1.weight.mean().item():.4f}, std={model.fc1.weight.std().item():.4f}")
print(f"First layer bias: mean={model.fc1.bias.mean().item():.4f}, std={model.fc1.bias.std().item():.4f}")
print(f"Output layer weight stats: mean={model.fc2.weight.mean().item():.4f}, std={model.fc2.weight.std().item():.4f}")
print(f"Output layer bias: mean={model.fc2.bias.mean().item():.4f}, std={model.fc2.bias.std().item():.4f}")
# ✅ Check your answer
answer = {
'model': model,
'input_layer_size': model.fc1.in_features,
'output_layer_size': model.fc2.out_features,
'activation_type': model.hidden_activation.__class__,
'fc1_weight_stats': {
'mean': model.fc1.weight.mean().item(),
'std': model.fc1.weight.std().item()
},
'fc2_weight_stats': {
'mean': model.fc2.weight.mean().item(),
'std': model.fc2.weight.std().item()
},
'fc1_bias_stats': {
'mean': model.fc1.bias.mean().item(),
'std': model.fc1.bias.std().item()
},
'fc2_bias_stats': {
'mean': model.fc2.bias.mean().item(),
'std': model.fc2.bias.std().item()
}
}
checker.check_exercise(2, answer)
Step 6: Choose Optimiser
Definition: Optimisers are algorithms used to update the model parameters during training to minimise the loss function.
The next step is to choose an optimiser for the model.
Most optimisers use a variant of gradient descent: they adjust the weights and biases of the model so as to minimise the loss function. The most commonly used optimisers include:
Optimizer | PyTorch Implementation | Best Used For |
---|---|---|
Stochastic Gradient Descent (SGD) | `torch.optim.SGD(params, lr)` | Simple problems, good with momentum |
Adam | `torch.optim.Adam(params, lr)` | Most deep learning tasks |
RMSProp | `torch.optim.RMSprop(params, lr)` | Deep neural networks |
Adagrad | `torch.optim.Adagrad(params, lr)` | Sparse data tasks |
AdamW | `torch.optim.AdamW(params, lr)` | When regularization is important |
The Adam optimiser is a popular choice for training deep learning models due to its efficiency and effectiveness. It combines the benefits of SGD with momentum and of RMSProp's adaptive learning rates, making it suitable for a wide range of tasks.
Learning Rate
The learning rate is a hyperparameter that determines the step size at each iteration while moving toward a minimum of the loss function. A small learning rate may lead to slow convergence, while a large learning rate may cause the model to diverge. It is important to choose an appropriate learning rate for the optimiser to work effectively.

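To make the idea of "step size" concrete, here is a toy single-parameter example (the numbers are made up purely for illustration): one gradient-descent update moves the parameter by the learning rate times the gradient.

```python
import torch

# Manual update: w <- w - lr * grad
w = torch.tensor([2.0], requires_grad=True)
loss = (w - 5.0) ** 2          # toy quadratic loss, minimum at w = 5
loss.backward()                # dloss/dw = 2 * (w - 5) = -6

lr = 0.1                       # the learning rate (step size)
with torch.no_grad():
    w -= lr * w.grad
print(w)                       # tensor([2.6000], requires_grad=True)

# The same single step via a PyTorch optimiser from the table above
w = torch.tensor([2.0], requires_grad=True)
opt = torch.optim.SGD([w], lr=0.1)
loss = (w - 5.0) ** 2
loss.backward()
opt.step()
print(w)                       # tensor([2.6000], requires_grad=True)
```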
Step 7: Choose Loss Function
The next step is to choose a loss function for the model. The choice of loss function depends on the type of problem being solved. The loss function measures how well the model is performing and guides the optimisation process. The most commonly used loss functions include:
Loss Function | PyTorch Implementation | Best Used For |
---|---|---|
Mean Squared Error (MSE) | `torch.nn.MSELoss()` | Regression tasks |
Mean Absolute Error (MAE) | `torch.nn.L1Loss()` | Regression tasks |
Binary Cross-Entropy | `torch.nn.BCELoss()` | Binary classification tasks |
Categorical Cross-Entropy | `torch.nn.CrossEntropyLoss()` | Multi-class classification tasks |
Hinge Loss | `torch.nn.HingeEmbeddingLoss()` | Support Vector Machines (SVM) |
Kullback-Leibler Divergence | `torch.nn.KLDivLoss()` | Probabilistic models |
The loss function works in conjunction with the optimiser. Several loss functions may be suitable for the same task, but the choice will affect the final performance of the model. For instance, MSE (L2-norm) loss for a regression task penalises larger errors much more heavily than smaller ones, whereas MAE (L1-norm) loss penalises errors in proportion to their magnitude. This can lead to different model behaviour depending on the distribution of the data.

Tips:
- Choose a loss function that is appropriate for the type of problem being solved (e.g., regression, classification).
- Monitor the loss during training to ensure that the model is converging and not overfitting.
- Experiment with different loss functions to see their impact on model performance.
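The MSE vs MAE contrast described above is easy to see on a tiny example with a single large error (made-up numbers, unrelated to the dataset):

```python
import torch

# Identical predictions except for one large error of 2.0
predictions = torch.tensor([0.1, 0.2, 0.3, 2.0])
targets     = torch.tensor([0.1, 0.2, 0.3, 0.0])

mse = torch.nn.MSELoss()
mae = torch.nn.L1Loss()

print("MSE:", mse(predictions, targets).item())  # 1.0 -- the squared error of 4.0 dominates
print("MAE:", mae(predictions, targets).item())  # 0.5 -- the error counts only linearly
```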
# Exercise 3: Optimizer and Loss Function Selection 🎯
# In this exercise, you will:
# 1. Select an appropriate optimizer for your model
# 2. Choose a suitable loss function
# 3. Set the learning rate
# Create an Adam optimizer for your model
optimizer = # Your code here
# Create a Mean Squared Error loss function
loss_function = # Your code here
# Store the optimizer and loss function in the model for easy access
model.optimizer = optimizer
model.loss_function = loss_function
# Print the optimizer and loss function configuration
print(f"Optimizer: {type(model.optimizer).__name__}")
print(f"Learning rate: {model.optimizer.param_groups[0]['lr']}")
print(f"Loss function: {type(model.loss_function).__name__}")
# ✅ Check your answer
answer = {
'optimizer_type': type(optimizer),
'learning_rate': optimizer.param_groups[0]['lr'],
'loss_function_type': type(loss_function)
}
checker.check_exercise(3, answer)
Steps 8 and 9: Create Training Loop and Fit Model
The training loop implements the key steps for training a neural network model:
Step | Description | Code Example |
---|---|---|
1. Forward Pass | Pass input data through model to generate predictions | predictions = model(inputs) |
2. Loss Computation | Calculate loss between predictions and targets | loss = criterion(predictions, targets) |
3. Backward Pass | Compute gradients through backpropagation | loss.backward() |
4. Parameter Updates | Update model parameters using optimizer | optimizer.step() |
5. Gradient Reset | Zero out gradients for next iteration | optimizer.zero_grad() |
The next step is to fit the model using the training data. The model is trained for a specified number of epochs, and the training and validation loss is monitored during training. The number of epochs is a hyperparameter that determines how many times the model will be trained on the entire training dataset.
Snippet 2: Training Loop Structure
```python
for epoch in range(num_epochs):
    # Set model to training mode
    model.train()

    # 1. Forward Pass
    predictions = model(inputs)
    # 2. Loss Computation
    loss = criterion(predictions, targets)
    # 3. Backward Pass
    loss.backward()
    # 4. Parameter Updates
    optimizer.step()
    # 5. Gradient Reset
    optimizer.zero_grad()
```
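One detail worth seeing in isolation: PyTorch accumulates gradients across calls to backward(), which is why the loop has to reset them every iteration. A tiny standalone demonstration:

```python
import torch

w = torch.tensor(1.0, requires_grad=True)

# Two backward passes WITHOUT zeroing: the gradients add up
(w * 3).backward()
(w * 3).backward()
print(w.grad)        # tensor(6.) -- 3 + 3, not 3

# Zeroing between passes leaves only the gradient of the latest loss
w.grad.zero_()
(w * 3).backward()
print(w.grad)        # tensor(3.)
```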
# Exercise 4: Creating a Training Loop 🎯
# In this exercise, you will:
# 1. Create a training loop for your neural network
# 2. Implement forward and backward passes
# 3. Monitor training and validation loss
def train_model(model,
                train_features,
                train_targets,
                val_features,
                val_targets,
                epochs=100):
    """
    Train a neural network model
    Args:
        model: PyTorch model to train
        train_features: Training features
        train_targets: Training targets
        val_features: Validation features
        val_targets: Validation targets
        epochs: Number of training epochs
    Returns:
        train_losses: List of training losses
        val_losses: List of validation losses
    """
    # Initialize lists to store losses
    train_losses = []
    val_losses = []
    # Put model in training mode
    # Your code here
    # Training loop
    for epoch in tqdm(range(epochs), desc="Training"):
        # 1. Zero gradients
        # Your code here
        # 2. Forward pass
        predictions = # Your code here
        # 3. Compute loss
        loss = # Your code here
        # 4. Backward pass
        # Your code here
        # 5. Update weights
        # Your code here
        # 6. Store training loss
        train_losses.append(loss.item())
        # 7. Compute validation loss
        model.eval()  # Set model to evaluation mode
        with torch.no_grad():  # No need to track gradients for validation
            val_predictions = model(val_features)
            val_loss = model.loss_function(val_predictions, val_targets).item()
            val_losses.append(val_loss)
        # Set model back to training mode
        model.train()
    return train_losses, val_losses
# Run training for 300 epochs
train_losses, val_losses = train_model(
model=model,
train_features=X_train_scaled,
train_targets=y_train_scaled,
val_features=X_val_scaled,
val_targets=y_val_scaled,
epochs=300
)
# Plot training and validation loss
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(train_losses, label='Train Loss')
ax.plot(val_losses, label='Validation Loss')
utils.plotting.make_fig_pretty(ax, title='Loss vs Epochs', xlabel='Epochs', ylabel='Loss',ctab=True)
plt.show()
# ✅ Check your answer
answer = {
'train_losses': train_losses[-1],
'val_losses': val_losses[-1],
'loss_trend': train_losses[0] > train_losses[-1],
'overfit_check': val_losses[-1] <= val_losses[0] * 1.5 # Should not have increased much
}
checker.check_exercise(4, answer)
Overfitting, Underfitting, and Early Stopping
Definition: Overfitting occurs when a model learns the training data too well, including noise and outliers, leading to poor generalisation on unseen data. Underfitting occurs when a model is too simple to capture the underlying patterns in the data.
As we can see in the following figure, the training loss decreases over time, while the validation loss follows a similar trend. However, the validation loss starts to slowly deviate from the training loss after a certain number of epochs. This indicates that the model is starting to overfit the training data. The point at which the validation loss starts to increase is known as the “early stopping” point. This is the point at which we should stop training the model to prevent overfitting.

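Early stopping is often implemented with a simple patience counter: keep track of the best validation loss seen so far and stop when it has not improved for a given number of epochs. The sketch below uses a simulated loss curve and an arbitrary patience of 10; in practice val_loss would come from the validation step of a training loop like the one above.

```python
import math

def simulated_val_loss(epoch):
    # A made-up curve that decreases at first and then slowly rises (overfitting)
    return 0.5 * math.exp(-epoch / 30) + 0.001 * max(0, epoch - 120)

best_val_loss = float('inf')
patience = 10                 # epochs tolerated without improvement
epochs_no_improve = 0

for epoch in range(300):
    val_loss = simulated_val_loss(epoch)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0                 # reset: validation loss improved
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch} (best val loss {best_val_loss:.4f})")
            break
```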
Step 10: Evaluate Model
The next step is to evaluate the model using the validation and test data. The model is evaluated on the validation set during training to monitor its performance and prevent overfitting.
Since we are training a model with MSE loss, we can also plot the predicted output against the actual output to see how well the model is performing. The predicted output should be close to the actual output, and the points should be clustered around the diagonal line. If the points are scattered far from the diagonal line, it indicates that the model is not performing well.
We can also compute the R-squared value to quantify the performance of the model. The R-squared value is a statistical measure that represents the proportion of the variance in the dependent variable that is explained by the independent variable(s) in a regression model. It normally ranges from 0 to 1 (and can be negative for a model that fits worse than simply predicting the mean), where 0 indicates that the model explains none of the variance in the data, and 1 indicates that the model explains all of the variance in the data.
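The notebook's utils.ml.r2_score helper computes this for us; purely to make the definition concrete, a hand-rolled NumPy version might look like the sketch below (not necessarily identical to the helper's implementation, especially for multi-output targets):

```python
import numpy as np

def r_squared(y_true, y_pred):
    """R^2 = 1 - SS_res / SS_tot, pooled over all outputs."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)          # residual sum of squares
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)   # total sum of squares
    return 1.0 - ss_res / ss_tot

print(r_squared([1.0, 2.0, 3.0], [1.1, 1.9, 3.2]))   # close to 1 -> good fit
print(r_squared([1.0, 2.0, 3.0], [2.0, 2.0, 2.0]))   # 0.0 -> no better than predicting the mean
```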
For this step we are going to use the test set to evaluate the model.
Snippet 3: Evaluate Model
# Set model to evaluation mode
model.eval()
# Disable gradient calculation
with torch.no_grad():
    # Forward pass through the model
    predictions = model(inputs)
    # Compute loss
    loss = criterion(predictions, targets)
# Exercise 5: Model Evaluation 🎯
# In this exercise, you will:
# 1. Evaluate your trained model on the test set
# 2. Calculate R-squared score to measure model performance
# 3. Visualize actual vs. predicted values for each joint
# Put the model in evaluation mode
model.eval()
# Predict on the test set without computing gradients
with torch.no_grad():
    test_predictions = # Your code here
    # Calculate the test loss
    test_loss = # Your code here
    # Convert predictions and targets back to original scale
    test_predictions_original = # Your code here
    test_targets_original = # Your code here
# Calculate the R-squared score
r2_score = utils.ml.r2_score(test_targets_original, test_predictions_original)
# Print evaluation metrics
print(f"Test Loss: {test_loss.item():.4f}")
print(f"R-squared Score: {r2_score:.4f}")
# Visualize actual vs. predicted values for each joint of the right arm
fig, axes = plt.subplots(figsize=(16, 20), nrows=5)
for ix, joint in enumerate(y_test.columns):
    axes[ix].plot(test_targets_original[:, ix], test_predictions_original[:, ix], 'o', fillstyle='none', markersize=2)
    axes[ix].plot(test_targets_original[:, ix], test_targets_original[:, ix], 'r--')
    utils.plotting.make_fig_pretty(axes[ix], title=f"{joint}", ylabel='Predicted',
                                   xtick_fsize=10, ytick_fsize=10,
                                   title_fsize=12, xlabel_fsize=10)
    if ix == 4:
        axes[ix].set_xlabel('ACTUAL')
# ✅ Check your answer
answer = {
'test_loss': test_loss.item(),
'r2_score': r2_score,
'predictions_shape': test_predictions_original.shape,
'values_match': test_predictions_original.shape == test_targets_original.shape
}
checker.check_exercise(5, answer)