Types of Losses and Optimisation
Loss
Loss, or objective function, is a measure of the model's performance. It is optimised during training to improve the model's performance.
Broadly speaking, loss functions can be grouped into two major categories corresponding to the types of problems we come across in the real world: CLASSIFICATION and REGRESSION. In CLASSIFICATION problems, our task is to predict the respective probabilities of all the classes the problem deals with. In REGRESSION problems, our task is to predict a continuous value from a given set of independent features.
Loss Functions for Regression
Mean Absolute Error Loss
We define the MAE loss function as the average of the absolute differences between the actual and the predicted values. It is the second most commonly used regression loss function. It measures the average magnitude of the errors in a set of predictions, without considering their direction.
\[\mathrm{MAE}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y_i}|\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
The corresponding cost function is the mean of these absolute errors (MAE). It is also known as the \(\ell_1\) loss function.
Mean Squared Error Loss
We define the MSE loss function as the average of the squared differences between the actual and the predicted values. It is the most commonly used regression loss function.
\[\mathrm{MSE}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y_i})^2\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
The corresponding cost function is the mean of these squared errors (MSE). It is also known as the \(\ell_2\) loss function. The MSE loss function penalizes the model for making large errors by squaring them.
Huber Loss
We define the Huber loss function as a combination of MSE and MAE. It is less sensitive to outliers than the MSE loss function and, unlike MAE, it is differentiable at 0.
\[\mathrm{Huber}(\boldsymbol{y}, \boldsymbol{\hat{y}} ; \delta) = \frac{1}{n}\sum_{i=1}^{n}\mathrm{L}_{\delta}(y_i - \hat{y_i})\]
\[\mathrm{L}_{\delta}(y_i - \hat{y_i}) = \begin{cases} \frac{1}{2}(y_i - \hat{y_i})^2 & \text{for } |y_i - \hat{y_i}| \leq \delta \\ \delta|y_i - \hat{y_i}| - \frac{1}{2}\delta^2 & \mathrm{otherwise} \end{cases}\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
The corresponding cost function is the mean of these Huber errors. The Huber loss function is more robust to outliers compared to the MSE loss function.
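To make the difference in outlier sensitivity concrete, here is a small sketch (not part of the implementation below) that plots the three regression losses as a function of the residual \(y - \hat{y}\); the Huber threshold \(\delta = 1\) is an arbitrary choice for this illustration.
import torch
import plotly.graph_objects as go

# Residuals e = y - y_hat on a symmetric grid
e = torch.linspace(-5, 5, 201)
delta = 1.0  # assumed Huber threshold for this sketch

mae_curve = e.abs()
mse_curve = e ** 2
huber_curve = torch.where(e.abs() <= delta, 0.5 * e ** 2, delta * e.abs() - 0.5 * delta ** 2)

fig = go.Figure()
fig.add_trace(go.Scatter(x=e, y=mae_curve, mode='lines', name='MAE'))
fig.add_trace(go.Scatter(x=e, y=mse_curve, mode='lines', name='MSE'))
fig.add_trace(go.Scatter(x=e, y=huber_curve, mode='lines', name='Huber (delta=1)'))
fig.update_layout(title='Loss vs residual', xaxis_title='y - y_hat', yaxis_title='loss')
fig.show()
The MSE curve grows quadratically, so a single large residual can dominate the total loss, while the MAE curve and the Huber tail grow only linearly.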
Example
Let’s take a simple example to understand the above loss functions.
Say that for some data point the actual value is 100 and the predicted value is 110. The above loss functions then evaluate to:
\[\begin{align*} \mathrm{MAE} &= |100 - 110| = 10 \\ \mathrm{MSE} &= (100 - 110)^2 = 100 \\ \mathrm{Huber}(\delta = 5) &= \mathrm{L}_{\delta}(100 - 110) = 5 \times |100 - 110| - \tfrac{1}{2}\times 5^2 = 50 - 12.5 = 37.5 \end{align*}\]
Here, we can see that the MAE loss is the least sensitive to this error and the MSE loss is the most sensitive, since squaring amplifies large errors. The Huber loss sits in between: once the error exceeds \(\delta\) it grows only linearly, like MAE, while remaining differentiable at 0, like MSE.
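As a quick sanity check on the arithmetic above, the same numbers can be reproduced with PyTorch's built-in functional losses (assuming a reasonably recent PyTorch version that provides F.huber_loss):
import torch
import torch.nn.functional as F

y_true = torch.tensor([100.0])
y_hat = torch.tensor([110.0])

print(F.l1_loss(y_hat, y_true))                # tensor(10.)
print(F.mse_loss(y_hat, y_true))               # tensor(100.)
print(F.huber_loss(y_hat, y_true, delta=5.0))  # tensor(37.5000)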
Implementation
Importing Libraries
import plotly.graph_objects as go
import torch
from torch.optim import Adam
import sklearn.metrics as metrics
Defining the loss functions
# MAE loss
def mae(y, y_pred, extra=None):
    assert y.shape == y_pred.shape
    val = torch.abs(y - y_pred)
    return torch.mean(val)

# MSE loss
def mse(y, y_pred, extra=None):
    assert y.shape == y_pred.shape
    val = (y - y_pred) ** 2
    return torch.mean(val)

# Huber loss ('extra' carries the threshold delta, defaulting to 1)
def huber(y, y_pred, extra=None):
    assert y.shape == y_pred.shape
    d = extra if extra else 1
    diff = torch.abs(y - y_pred)
    val = torch.where(diff < d, 0.5 * diff ** 2, d * diff - 0.5 * d ** 2)
    return torch.mean(val)

# Binary Cross-Entropy loss (expects y_pred to be probabilities in (0, 1))
def bce(y, y_pred, extra=None):
    assert y.shape == y_pred.shape
    val = -y * torch.log(y_pred) - (1 - y) * torch.log(1 - y_pred)
    return torch.mean(val)

# Focal loss ('extra' carries the focusing parameter gamma, defaulting to 2)
def focal(y, y_pred, extra=None):
    assert y.shape == y_pred.shape
    g = extra if extra else 2
    case_1 = -y * torch.log(y_pred) * (1 - y_pred) ** g
    case_0 = -(1 - y) * torch.log(1 - y_pred) * y_pred ** g
    val = case_1 + case_0
    return torch.mean(val)

# Dispatch table used by the training loop below
loss_func = {"mae": mae, "mse": mse, "huber": huber, "bce": bce, "focal": focal}
Training Function
# Train function
def train(x, y, w, loss_type, lr=0.01, epochs=100, extra=False):
    if loss_type in ("mae", "mse", "huber"):  # regression
        classes = False
        res = "RMSE"
        res_func = metrics.mean_squared_error
    elif loss_type in ("bce", "focal"):  # classification
        classes = True
        res = "Accuracy"
        res_func = metrics.accuracy_score
    else:
        raise ValueError("Unknown loss function")

    result = []
    loss_fn = loss_func[loss_type]  # get loss function

    opt = Adam([w], lr=lr)

    for i in range(epochs):
        y_pred = torch.matmul(x, w)
        if classes:  # classification
            y_pred = torch.sigmoid(y_pred)

        loss = loss_fn(y, y_pred, extra)  # 'extra' is the Huber delta or focal gamma

        loss.backward()
        opt.step()
        opt.zero_grad()

        if classes:  # classification
            y_pred = torch.where(y_pred > 0.5, 1.0, 0.0)  # threshold
            result.append(res_func(y, y_pred.detach()))
        else:  # regression
            result.append(res_func(y, y_pred.detach(), squared=False))  # RMSE

        if i % (epochs // 10) == 0:
            print(f'Epoch {i}, loss {loss:.4f}, {res} {result[-1]:.4f}')

    return result, y_pred.detach()
Generating the data for regression
x = torch.rand(500, 1)
y = 2 * x + 3 + torch.randn(500, 1) * 0.5

x = torch.concatenate([x, torch.ones((500, 1))], axis=1)

fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y[:, 0], mode='markers', name='data'))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y')
fig.show()
MAE Loss
Training
w = torch.randn(2, 1, requires_grad=True)

lr = 0.1
loss_fn = "mae"

result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=200)
Epoch 0, loss 5.0595, RMSE 5.1217
Epoch 20, loss 2.1006, RMSE 2.1534
Epoch 40, loss 0.5658, RMSE 0.7100
Epoch 60, loss 0.4173, RMSE 0.5248
Epoch 80, loss 0.3829, RMSE 0.4829
Epoch 100, loss 0.3719, RMSE 0.4692
Epoch 120, loss 0.3690, RMSE 0.4648
Epoch 140, loss 0.3684, RMSE 0.4642
Epoch 160, loss 0.3684, RMSE 0.4643
Epoch 180, loss 0.3684, RMSE 0.4642
Plot the loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='loss'))
fig.update_layout(title='Loss', xaxis_title='epoch', yaxis_title='loss')
fig.show()
Plot the data with the regression line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y.ravel(), mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x[:, 0], y=2 * x[:, 0] + 3, mode='lines', name='true line', line=dict(color='green')))
fig.add_trace(go.Scatter(x=x[:, 0], y=y_pred[:, 0], mode='lines', name='regression line', line=dict(color='red')))
fig.show()
MSE Loss
Training
w = torch.randn(2, 1, requires_grad=True)

lr = 0.1
loss_fn = "mse"

result, y_pred = train(x, y, w, "mse", lr=lr, epochs=200)
Epoch 0, loss 33.1759, RMSE 5.7599
Epoch 20, loss 8.3986, RMSE 2.8980
Epoch 40, loss 0.8111, RMSE 0.9006
Epoch 60, loss 0.2259, RMSE 0.4752
Epoch 80, loss 0.2344, RMSE 0.4842
Epoch 100, loss 0.2175, RMSE 0.4664
Epoch 120, loss 0.2172, RMSE 0.4660
Epoch 140, loss 0.2166, RMSE 0.4654
Epoch 160, loss 0.2163, RMSE 0.4651
Epoch 180, loss 0.2161, RMSE 0.4648
Plot the loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='loss'))
fig.update_layout(title='Loss', xaxis_title='epoch', yaxis_title='loss')
fig.show()
Plot the data with the regression line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y.ravel(), mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x[:, 0], y=2 * x[:, 0] + 3, mode='lines', name='true line', line=dict(color='green')))
fig.add_trace(go.Scatter(x=x[:, 0], y=y_pred[:, 0], mode='lines', name='regression line', line=dict(color='red')))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y')
fig.show()
Huber Loss
Training
w = torch.randn(2, 1, requires_grad=True)

lr = 0.1
loss_fn = "huber"

result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=200, extra=0.5)
Epoch 0, loss 0.9997, RMSE 2.3211
Epoch 20, loss 0.1458, RMSE 0.6103
Epoch 40, loss 0.1058, RMSE 0.4996
Epoch 60, loss 0.0954, RMSE 0.4702
Epoch 80, loss 0.0935, RMSE 0.4641
Epoch 100, loss 0.0935, RMSE 0.4640
Epoch 120, loss 0.0934, RMSE 0.4639
Epoch 140, loss 0.0934, RMSE 0.4639
Epoch 160, loss 0.0934, RMSE 0.4639
Epoch 180, loss 0.0934, RMSE 0.4639
Plot the loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='loss'))
fig.update_layout(title='Loss', xaxis_title='epoch', yaxis_title='loss')
fig.show()
Plot the data with the regression line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y.ravel(), mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x[:, 0], y=2 * x[:, 0] + 3, mode='lines', name='true line', line=dict(color='green')))
fig.add_trace(go.Scatter(x=x[:, 0], y=y_pred[:, 0], mode='lines', name='regression line', line=dict(color='red')))
fig.show()
Loss Functions for Classification
Binary Cross-Entropy Loss
This is the most common loss function used in classification problems. The binary cross-entropy loss decreases as the predicted probability converges to the actual label. It measures the performance of a classification model whose predicted output is a probability value between 0 and 1.
\[\mathrm{L}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \begin{cases} -\log(\hat{y_i}) & \text{if } y_i = 1 \\ -\log(1-\hat{y_i}) & \text{if } y_i = 0 \end{cases}\]
\[\mathrm{L}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = - \dfrac{1}{m} \sum_{i=1}^{m} \left[ y_i \log(\hat{y_i}) + (1-y_i) \log(1-\hat{y_i}) \right]\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
Focal Loss
We define the focal loss function as the binary cross-entropy loss multiplied by a modulating factor. The focusing parameter \(\gamma\) reduces the relative loss for well-classified examples and puts more focus on hard, misclassified examples, which is particularly useful when the classes are imbalanced.
\[\mathrm{FL}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \begin{cases} -(1-\hat{y_i})^{\gamma}\log(\hat{y_i}) & \text{if } y_i = 1 \\ -(\hat{y_i})^{\gamma}\log(1-\hat{y_i}) & \text{if } y_i = 0 \end{cases}\]
\[\mathrm{FL}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = - \dfrac{1}{m} \sum_{i=1}^{m} \left[ y_i (1 - \hat{y_i})^{\gamma} \log(\hat{y_i}) + (1-y_i) (\hat{y_i})^{\gamma} \log(1-\hat{y_i}) \right]\]
where \(y_i\) is the actual label and \(\hat{y_i}\) is the predicted probability of the label. Note that for \(\gamma = 0\) the focal loss reduces to the binary cross-entropy loss.
Example
Let’s take a simple example to understand the above loss functions.
Say that for some data point the actual label is 1 and the predicted probability of the label is 0.85. The above loss functions then evaluate to:
\[\begin{align*} \mathrm{BCE} &= -\log(0.85) &\approx 0.162 \\ \mathrm{FL}(\gamma = 2) &= -(1-0.85)^2\log(0.85) &\approx 0.004 \\ \end{align*}\]
And for another data point with actual label 1 and predicted probability 0.55, they evaluate to:
\[\begin{align*} \mathrm{BCE} &= -\log(0.55) &\approx 0.598 \\ \mathrm{FL}(\gamma = 2) &= -(1-0.55)^2\log(0.55) &\approx 0.121 \\ \end{align*}\]
Here, we can see that moving from the confident to the uncertain prediction increases the BCE loss by a factor of roughly 3.7, but increases the FL loss by a factor of roughly 30. Hence, relative to its loss on well-classified examples, the FL loss function penalizes the model far more for the harder, less confident prediction.
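These figures are easy to reproduce; a minimal check with plain PyTorch, assuming \(\gamma = 2\) as above:
import torch

for p in (0.85, 0.55):  # predicted probability for the true label y = 1
    p_t = torch.tensor(p)
    bce = -torch.log(p_t)                   # binary cross-entropy
    fl = (1 - p_t) ** 2 * -torch.log(p_t)   # focal loss, gamma = 2
    print(f"p={p}: BCE={bce:.4f}, FL={fl:.4f}")
# p=0.85: BCE≈0.1625, FL≈0.0037
# p=0.55: BCE≈0.5978, FL≈0.1211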
Implementation
Generating the data for classification
from sklearn.datasets import make_blobs

x, y = make_blobs(n_samples=500, centers=2, cluster_std=2, random_state=42)

x = torch.from_numpy(x).float()
y = torch.from_numpy(y).float().reshape(-1, 1)

x = torch.concatenate([x, torch.ones((500, 1))], axis=1)

color = ['red' if l == 0 else 'blue' for l in y]

# plot data
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=x[:, 1], mode='markers', marker=dict(color=color)))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y')
fig.show()
Binary Cross-Entropy Loss
Training
w = torch.randn(3, 1, requires_grad=True)

lr = 0.1
loss_fn = "bce"

result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=100)
Epoch 0, loss 2.5939, Accuracy 0.4800
Epoch 10, loss 0.2322, Accuracy 0.9520
Epoch 20, loss 0.0507, Accuracy 0.9900
Epoch 30, loss 0.0366, Accuracy 0.9860
Epoch 40, loss 0.0330, Accuracy 0.9880
Epoch 50, loss 0.0317, Accuracy 0.9900
Epoch 60, loss 0.0310, Accuracy 0.9880
Epoch 70, loss 0.0305, Accuracy 0.9880
Epoch 80, loss 0.0301, Accuracy 0.9900
Epoch 90, loss 0.0297, Accuracy 0.9900
Plot the accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='Accuracy'))
fig.update_layout(title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()
Plot the data with the separation line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=x[:, 1], mode='markers', marker=dict(color=color)))
fig.add_trace(go.Scatter(x=x[:, 0], y=(-(w[0] * x[:, 0] + w[2]) / w[1]).detach(), mode='lines', name='separation line', line=dict(color='red')))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y', yaxis_range=(x[:, 1].min() - 5, x[:, 1].max() + 5))
fig.show()
Focal Loss
Training
w = torch.randn(3, 1, requires_grad=True)

lr = 0.1
loss_fn = "focal"

result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=100, extra=5)
Epoch 0, loss 1.3781, Accuracy 0.5020
Epoch 10, loss 0.0047, Accuracy 0.9820
Epoch 20, loss 0.0156, Accuracy 0.9720
Epoch 30, loss 0.0191, Accuracy 0.9720
Epoch 40, loss 0.0163, Accuracy 0.9740
Epoch 50, loss 0.0125, Accuracy 0.9800
Epoch 60, loss 0.0097, Accuracy 0.9860
Epoch 70, loss 0.0078, Accuracy 0.9820
Epoch 80, loss 0.0068, Accuracy 0.9820
Epoch 90, loss 0.0061, Accuracy 0.9820
Plot the accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='Accuracy'))
fig.update_layout(title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()
Plot the data with the separation line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=x[:, 1], mode='markers', marker=dict(color=color)))
fig.add_trace(go.Scatter(x=x[:, 0], y=(-(w[0] * x[:, 0] + w[2]) / w[1]).detach(), mode='lines', name='separation line', line=dict(color='red')))
fig.update_layout(title='Classification', xaxis_title='x', yaxis_title='y', yaxis_range=(x[:, 1].min() - 5, x[:, 1].max() + 5))
fig.show()
Comparison between BCE and FL
def ce(p):
    return -torch.log(p)

def fl(p, gamma=2):
    return (1 - p) ** gamma * -torch.log(p)

# predicted probabilities for the true class
x = torch.arange(0.01, 1, 0.01)

# ratio of focal loss to cross-entropy; gamma = 0 recovers plain cross-entropy
fig = go.Figure()
for gamma in [0, 0.5, 1, 2, 5, 10]:
    fig.add_trace(go.Scatter(x=x, y=fl(x, gamma) / ce(x), mode='lines', name=f'gamma={gamma}'))
fig.update_layout(title='Focal Loss / Cross Entropy', xaxis_title='p', yaxis_title='Focal Loss / Cross Entropy')
fig.show()

# the same comparison on a log scale
fig = go.Figure()
for gamma in [0, 0.5, 1, 2, 5, 10]:
    fig.add_trace(go.Scatter(x=x, y=torch.log(ce(x) / fl(x, gamma)), mode='lines', name=f'gamma={gamma}'))
fig.update_layout(title='log(Cross Entropy / Focal Loss)', xaxis_title='p', yaxis_title='log(Cross Entropy / Focal Loss)')
fig.show()