Types of Losses and Optimisation
Loss
A loss (or objective) function is a measure of the model’s performance; it is the quantity that is optimised during training to improve the model.
Broadly speaking, loss functions fall into two major categories, matching the two kinds of problems we come across in the real world: classification and regression. In classification, the task is to predict the respective probabilities of all classes the problem deals with. In regression, the task is to predict a continuous value from a given set of independent features.
Loss Functions for Regression
Mean Absolute Error Loss
We define the MAE loss function as the average of absolute differences between the actual and the predicted values. It is the second most commonly used regression loss function. It measures the average magnitude of the errors in a set of predictions, without considering their direction.
\[\mathrm{MAE}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y_i}|\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
The corresponding cost function is the mean of these absolute errors (MAE). It is also known as the \(\ell_1\) loss function.
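PyTorch ships this loss as torch.nn.L1Loss / torch.nn.functional.l1_loss; a minimal sketch with made-up values, just to show that the manual and built-in versions agree:
import torch
import torch.nn.functional as F
y = torch.tensor([3.0, -0.5, 2.0, 7.0])      # actual values (illustrative)
y_hat = torch.tensor([2.5, 0.0, 2.0, 8.0])   # predicted values (illustrative)
mae_manual = torch.mean(torch.abs(y - y_hat))
mae_builtin = F.l1_loss(y_hat, y)            # built-in l1 / MAE loss
print(mae_manual.item(), mae_builtin.item())  # both 0.5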
Mean Squared Error Loss
We define the MSE loss function as the average of squared differences between the actual and the predicted values. It is the most commonly used regression loss function.
\[\mathrm{MSE}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y_i})^2\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
The corresponding cost function is the mean of these squared errors (MSE). It is also known as the \(\ell_2\) loss function. The MSE loss function penalizes the model for making large errors by squaring them.
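Likewise, MSE is available as torch.nn.MSELoss / torch.nn.functional.mse_loss; a small sketch with the same illustrative values:
import torch
import torch.nn.functional as F
y = torch.tensor([3.0, -0.5, 2.0, 7.0])      # actual values (illustrative)
y_hat = torch.tensor([2.5, 0.0, 2.0, 8.0])   # predicted values (illustrative)
mse_manual = torch.mean((y - y_hat) ** 2)
mse_builtin = F.mse_loss(y_hat, y)           # built-in MSE loss
print(mse_manual.item(), mse_builtin.item())  # both 0.375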
Huber Loss
We define the Huber loss function as a combination of MSE and MAE: it is quadratic for small errors and linear for large ones. It is less sensitive to outliers than the MSE loss function and, unlike MAE, is differentiable at 0.
\[\mathrm{Huber}(\boldsymbol{y}, \boldsymbol{\hat{y}} ; \delta) = \frac{1}{n}\sum_{i=1}^{n}\mathrm{L}_{\delta}(y_i - \hat{y_i})\]
\[\mathrm{L}_{\delta}(y_i - \hat{y_i}) = \begin{cases} \frac{1}{2}(y_i - \hat{y_i})^2 & \text{for } |y_i - \hat{y_i}| \leq \delta \\ \delta|y_i - \hat{y_i}| - \frac{1}{2}\delta^2 & \mathrm{otherwise} \end{cases}\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
The corresponding cost function is the mean of these Huber errors. The Huber loss function is more robust to outliers compared to the MSE loss function.
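Recent PyTorch releases expose this as torch.nn.HuberLoss / torch.nn.functional.huber_loss with a delta argument; a minimal sketch (assuming a PyTorch version that includes it), with one deliberately large error so both branches are exercised:
import torch
import torch.nn.functional as F
y = torch.tensor([3.0, -0.5, 2.0, 7.0])       # actual values (illustrative)
y_hat = torch.tensor([2.5, 0.0, 2.0, 10.0])   # last prediction is off by 3
delta = 1.0
diff = torch.abs(y - y_hat)
huber_manual = torch.mean(torch.where(diff <= delta, 0.5 * diff ** 2, delta * diff - 0.5 * delta ** 2))
huber_builtin = F.huber_loss(y_hat, y, delta=delta)  # available in recent PyTorch versions
print(huber_manual.item(), huber_builtin.item())     # both 0.6875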
Example
Let’s take a simple example to understand the above loss functions.
Say that for some data point the actual value is 100 and the predicted value is 110. The values of the above loss functions will be:
\[\begin{align*} \mathrm{MAE} &= |100 - 110| &= 10 \\ \mathrm{MSE} &= (100 - 110)^2 &= 100 \\ \mathrm{Huber}(\delta = 5) &= \mathrm{L}_{\delta}(100 - 110)\\ &= 5 \times |100 - 110| - \frac{1}{2}\times5^2 = 50 - 12.5 &= 37.5 \\ \end{align*}\]
Here we can see that the MAE loss is the least sensitive to large errors and the MSE loss the most sensitive, while the Huber loss sits in between: less sensitive to outliers than MSE, yet still differentiable at 0, unlike MAE.
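These numbers are easy to check directly from the formulas above (a quick verification sketch):
import torch
y = torch.tensor([100.0])
y_hat = torch.tensor([110.0])
delta = 5.0
mae = torch.mean(torch.abs(y - y_hat))    # 10.0
mse = torch.mean((y - y_hat) ** 2)        # 100.0
diff = torch.abs(y - y_hat)
huber = torch.mean(torch.where(diff <= delta, 0.5 * diff ** 2, delta * diff - 0.5 * delta ** 2))  # 37.5
print(mae.item(), mse.item(), huber.item())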
Implementation
Importing Libraries
import plotly.graph_objects as go
import torch
from torch.optim import Adam
import sklearn.metrics as metrics
Defining the loss functions
# MAE loss
def mae(y, y_pred, extra=None):
assert y.shape == y_pred.shape
val = torch.abs(y - y_pred)
return torch.mean(val)
# MSE loss
def mse(y, y_pred, extra=None):
assert y.shape == y_pred.shape
val = (y - y_pred) ** 2
return torch.mean(val)
# Huber loss
def huber(y, y_pred, extra=None):
assert y.shape == y_pred.shape
d = extra if extra else 1
diff = torch.abs(y - y_pred)
val = torch.where(diff < d, 0.5 * diff ** 2, d * diff - 0.5 * d ** 2)
return torch.mean(val)
# Binary Cross-Entropy loss
def bce(y, y_pred, extra=None):
assert y.shape == y_pred.shape
val = -y * torch.log(y_pred) - (1 - y) * torch.log(1 - y_pred)
return torch.mean(val)
# Focal loss
def focal(y, y_pred, extra=None):
assert y.shape == y_pred.shape
g = extra if extra else 2
case_1 = -y * torch.log(y_pred) * (1 - y_pred) ** g
case_0 = -(1 - y) * torch.log(1 - y_pred) * y_pred ** g
val = case_1 + case_0
    return torch.mean(val)
loss_func = {"mae": mae, "mse": mse, "huber": huber, "bce": bce, "focal": focal}
Training Function
# Train function
def train(x, y, w, loss_type, lr=0.01, epochs=100, extra=False):
if loss_type in ("mae", "mse", "huber"): # regression
classes = False
res = "RMSE"
res_func = metrics.mean_squared_error
elif loss_type in ("bce", "focal"): # classification
classes = True
res = "Accuracy"
res_func = metrics.accuracy_score
else:
raise ValueError("Unknown loss function")
result = []
loss_fn = loss_func[loss_type] # get loss function
opt = Adam([w], lr=lr)
for i in range(epochs):
y_pred = torch.matmul(x, w)
if classes: # classification
y_pred = torch.sigmoid(y_pred)
loss = loss_fn(y, y_pred, extra)
loss.backward()
opt.step()
opt.zero_grad()
if classes: # classification
y_pred = torch.where(y_pred > 0.5, 1.0, 0.0) # threshold
result.append(res_func(y, y_pred.detach()))
else: # regression
result.append(res_func(y, y_pred.detach(), squared=False))
if i % (epochs // 10) == 0:
print(f'Epoch {i}, loss {loss:.4f}, {res} {result[-1]:.4f}')
    return result, y_pred.detach()
Generating the data for regression
x = torch.rand(500, 1)
y = 2 * x + 3 + torch.randn(500, 1) * 0.5
x = torch.concatenate([x, torch.ones((500, 1))], axis=1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y[:, 0], mode='markers', name='data'))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y')
fig.show()
MAE Loss
Training
w = torch.randn(2, 1, requires_grad=True)
lr = 0.1
loss_fn = "mae"result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=200)Epoch 0, loss 5.0595, RMSE 5.1217
Epoch 20, loss 2.1006, RMSE 2.1534
Epoch 40, loss 0.5658, RMSE 0.7100
Epoch 60, loss 0.4173, RMSE 0.5248
Epoch 80, loss 0.3829, RMSE 0.4829
Epoch 100, loss 0.3719, RMSE 0.4692
Epoch 120, loss 0.3690, RMSE 0.4648
Epoch 140, loss 0.3684, RMSE 0.4642
Epoch 160, loss 0.3684, RMSE 0.4643
Epoch 180, loss 0.3684, RMSE 0.4642
Plot the loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='loss'))
fig.update_layout(title='Loss', xaxis_title='epoch', yaxis_title='loss')
fig.show()
Plot the data with the regression line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y.ravel(), mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x[:, 0], y=2 * x[:, 0] + 3, mode='lines', name='true line', line=dict(color='green')))
fig.add_trace(go.Scatter(x=x[:, 0], y=y_pred[:, 0], mode='lines', name='regression line', line=dict(color='red')))
fig.show()
MSE Loss
Training
w = torch.randn(2, 1, requires_grad=True)
lr = 0.1
loss_fn = "mse"result, y_pred = train(x, y, w, "mse", lr=lr, epochs=200)Epoch 0, loss 33.1759, RMSE 5.7599
Epoch 20, loss 8.3986, RMSE 2.8980
Epoch 40, loss 0.8111, RMSE 0.9006
Epoch 60, loss 0.2259, RMSE 0.4752
Epoch 80, loss 0.2344, RMSE 0.4842
Epoch 100, loss 0.2175, RMSE 0.4664
Epoch 120, loss 0.2172, RMSE 0.4660
Epoch 140, loss 0.2166, RMSE 0.4654
Epoch 160, loss 0.2163, RMSE 0.4651
Epoch 180, loss 0.2161, RMSE 0.4648
Plot the loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='loss'))
fig.update_layout(title='Loss', xaxis_title='epoch', yaxis_title='loss')
fig.show()
Plot the data with the regression line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y.ravel(), mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x[:, 0], y=2 * x[:, 0] + 3, mode='lines', name='true line', line=dict(color='green')))
fig.add_trace(go.Scatter(x=x[:, 0], y=y_pred[:, 0], mode='lines', name='regression line', line=dict(color='red')))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y')
fig.show()
Huber Loss
Training
w = torch.randn(2, 1, requires_grad=True)
lr = 0.1
loss_fn = "huber"result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=200, extra=0.5)Epoch 0, loss 0.9997, RMSE 2.3211
Epoch 20, loss 0.1458, RMSE 0.6103
Epoch 40, loss 0.1058, RMSE 0.4996
Epoch 60, loss 0.0954, RMSE 0.4702
Epoch 80, loss 0.0935, RMSE 0.4641
Epoch 100, loss 0.0935, RMSE 0.4640
Epoch 120, loss 0.0934, RMSE 0.4639
Epoch 140, loss 0.0934, RMSE 0.4639
Epoch 160, loss 0.0934, RMSE 0.4639
Epoch 180, loss 0.0934, RMSE 0.4639
Plot the loss
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='loss'))
fig.update_layout(title='Loss', xaxis_title='epoch', yaxis_title='loss')
fig.show()
Plot the data with the regression line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=y.ravel(), mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x[:, 0], y=2 * x[:, 0] + 3, mode='lines', name='true line', line=dict(color='green')))
fig.add_trace(go.Scatter(x=x[:, 0], y=y_pred[:, 0], mode='lines', name='regression line', line=dict(color='red')))
fig.show()
Loss Functions for Classification
Binary Cross-Entropy Loss
This is the most common loss function used in classification problems. The binary cross-entropy loss decreases as the predicted probability converges to the actual label. It measures the performance of a classification model whose predicted output is a probability value between 0 and 1.
\[\mathrm{L}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \begin{cases} -\log(\hat{y_i}) & \text{if } y_i = 1 \\ -\log(1-\hat{y_i}) & \text{if } y_i = 0 \end{cases}\]
\[\mathrm{L}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = - \dfrac{1}{m} \sum_{i=1}^{m} \left[ y_i \log(\hat{y_i}) + (1-y_i) \log(1-\hat{y_i}) \right]\]
where \(y_i\) is the actual value and \(\hat{y_i}\) is the predicted value.
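PyTorch provides this loss as torch.nn.BCELoss / torch.nn.functional.binary_cross_entropy (on probabilities) and as torch.nn.BCEWithLogitsLoss (on raw logits, which is more numerically stable since it never takes the log of an exact 0 or 1). A minimal sketch with illustrative labels and probabilities:
import torch
import torch.nn.functional as F
y = torch.tensor([1.0, 0.0, 1.0])        # actual labels (illustrative)
p = torch.tensor([0.85, 0.10, 0.55])     # predicted probabilities (illustrative)
bce_manual = torch.mean(-y * torch.log(p) - (1 - y) * torch.log(1 - p))
bce_builtin = F.binary_cross_entropy(p, y)
print(bce_manual.item(), bce_builtin.item())  # both ~0.289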
Focal Loss
We define the Focal loss function as Binary Cross-Entropy scaled by a modulating factor. The modulating factor, controlled by \(\gamma\), reduces the relative loss for well-classified examples and puts more focus on hard, misclassified examples, so easy examples contribute far less to the total loss than they do under plain Binary Cross-Entropy.
\[\mathrm{FL}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = \begin{cases} -(1-\hat{y_i})^{\gamma}\log(\hat{y_i}) & \text{if } y_i = 1 \\ -(\hat{y_i})^{\gamma}\log(1-\hat{y_i}) & \text{if } y_i = 0 \end{cases}\]
\[\mathrm{FL}(\boldsymbol{y}, \boldsymbol{\hat{y}}) = - \dfrac{1}{m} \sum_{i=1}^{m} \left[ y_i (1 - \hat{y_i})^{\gamma} \log(\hat{y_i}) + (1-y_i) (\hat{y_i})^{\gamma} \log(1-\hat{y_i}) \right]\]
where \(y_i\) is the actual label and \(\hat{y_i}\) is the predicted probability of the label.
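If torchvision is available, torchvision.ops.sigmoid_focal_loss implements a closely related variant: it takes raw logits rather than probabilities and adds an extra α class-weighting factor, which the formulation above omits (passing alpha=-1 should disable it). A small comparison sketch, under those assumptions:
import torch
from torchvision.ops import sigmoid_focal_loss  # assumes torchvision is installed
y = torch.tensor([1.0, 0.0, 1.0])               # labels (illustrative)
logits = torch.tensor([1.7, -2.2, 0.2])         # raw scores (illustrative)
p = torch.sigmoid(logits)
gamma = 2.0
# Focal loss exactly as defined above (no alpha weighting)
fl_manual = -y * (1 - p) ** gamma * torch.log(p) - (1 - y) * p ** gamma * torch.log(1 - p)
# torchvision's version works on logits; alpha=-1 turns off its class-weighting term
fl_tv = sigmoid_focal_loss(logits, y, alpha=-1, gamma=gamma, reduction="none")
print(fl_manual, fl_tv)  # the two should agree element-wise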
Example
Let’s take a simple example to understand the above loss functions.
Say that for some data point the actual label is 1 and the predicted probability of the label is 0.85. The values of the above loss functions will be:
\[\begin{align*} \mathrm{BCE} &= -\log(0.85) &\approx 0.162 \\ \mathrm{FL}(\gamma = 2) &= -(1-0.85)^2\log(0.85) &\approx 0.004 \\ \end{align*}\]
And for another data point, where the actual label is 1 and the predicted probability of the label is 0.55, the values will be:
\[\begin{align*} \mathrm{BCE} &= -\log(0.55) &\approx 0.598 \\ \mathrm{FL}(\gamma = 2) &= -(1-0.55)^2\log(0.55) &\approx 0.121 \\ \end{align*}\]
Here, we can see that going from the confident prediction (0.85) to the borderline one (0.55) increases the BCE loss by a factor of roughly 3.7, but increases the FL loss by a factor of roughly 33. Hence the FL loss function penalizes the model comparatively much more for the examples it classifies poorly, while contributing almost nothing for the ones it already classifies well.
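A quick check of these ratios, using γ = 2 and the two probabilities from the example:
import torch
p = torch.tensor([0.85, 0.55])   # predicted probabilities for a positive example
gamma = 2.0
bce = -torch.log(p)
fl = (1 - p) ** gamma * -torch.log(p)
print(bce)   # ~[0.163, 0.598] -> ratio ~3.7
print(fl)    # ~[0.004, 0.121] -> ratio ~33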
Implementation
Generating the data for classification
from sklearn.datasets import make_blobs
x, y = make_blobs(n_samples=500, centers=2, cluster_std=2, random_state=42)
x = torch.from_numpy(x).float()
y = torch.from_numpy(y).float().reshape(-1, 1)
x = torch.concatenate([x, torch.ones((500, 1))], axis=1)
color = ['red' if l == 0 else 'blue' for l in y]
# plot data
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=x[:, 1], mode='markers', marker=dict(color=color)))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y')
fig.show()
Binary Cross-Entropy Loss
Training
w = torch.randn(3, 1, requires_grad=True)
lr = 0.1
loss_fn = "bce"result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=100)Epoch 0, loss 2.5939, Accuracy 0.4800
Epoch 10, loss 0.2322, Accuracy 0.9520
Epoch 20, loss 0.0507, Accuracy 0.9900
Epoch 30, loss 0.0366, Accuracy 0.9860
Epoch 40, loss 0.0330, Accuracy 0.9880
Epoch 50, loss 0.0317, Accuracy 0.9900
Epoch 60, loss 0.0310, Accuracy 0.9880
Epoch 70, loss 0.0305, Accuracy 0.9880
Epoch 80, loss 0.0301, Accuracy 0.9900
Epoch 90, loss 0.0297, Accuracy 0.9900
Plot the accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='Accuracy'))
fig.update_layout(title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()
Plot the data with the separation line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=x[:, 1], mode='markers', marker=dict(color=color)))
fig.add_trace(go.Scatter(x=x[:, 0], y=(-(w[0] * x[:, 0] + w[2]) / w[1]).detach(), mode='lines', name='separation line', line=dict(color='red')))
fig.update_layout(title='Data', xaxis_title='x', yaxis_title='y', yaxis_range=(x[:, 1].min() - 5, x[:, 1].max() + 5))
fig.show()
Focal Loss
Training
w = torch.randn(3, 1, requires_grad=True)
lr = 0.1
loss_fn = "focal"result, y_pred = train(x, y, w, loss_fn, lr=lr, epochs=100, extra=5)Epoch 0, loss 1.3781, Accuracy 0.5020
Epoch 10, loss 0.0047, Accuracy 0.9820
Epoch 20, loss 0.0156, Accuracy 0.9720
Epoch 30, loss 0.0191, Accuracy 0.9720
Epoch 40, loss 0.0163, Accuracy 0.9740
Epoch 50, loss 0.0125, Accuracy 0.9800
Epoch 60, loss 0.0097, Accuracy 0.9860
Epoch 70, loss 0.0078, Accuracy 0.9820
Epoch 80, loss 0.0068, Accuracy 0.9820
Epoch 90, loss 0.0061, Accuracy 0.9820
Plot the accuracy
fig = go.Figure()
fig.add_trace(go.Scatter(x=torch.arange(len(result)), y=result, mode='lines', name='Accuracy'))
fig.update_layout(title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()
Plot the data with the separation line
fig = go.Figure()
fig.add_trace(go.Scatter(x=x[:, 0], y=x[:, 1], mode='markers', marker=dict(color=color)))
fig.add_trace(go.Scatter(x=x[:, 0], y=(-(w[0] * x[:, 0] + w[2]) / w[1]).detach(), mode='lines', name='separation line', line=dict(color='red')))
fig.update_layout(title='Classification', xaxis_title='x', yaxis_title='y', yaxis_range=(x[:, 1].min() - 5, x[:, 1].max() + 5))
fig.show()
Comparison between BCE and FL
def ce(p):
return -torch.log(p)
def fl(p, gamma=2):
    return (1 - p) ** gamma * -torch.log(p)
x = torch.arange(0.01, 1, 0.01)
fig = go.Figure()
for gamma in [0, 0.5, 1, 2, 5, 10]:
fig.add_trace(go.Scatter(x=x, y=fl(x, gamma)/ce(x), mode='lines', name=f'gamma={gamma}'))
fig.update_layout(title='Focal Loss / Cross Entropy', xaxis_title='p', yaxis_title='Focal Loss / Cross Entropy')
fig.show()
fig = go.Figure()
for gamma in [0, 0.5, 1, 2, 5, 10]:
fig.add_trace(go.Scatter(x=x, y=torch.log(ce(x)/fl(x, gamma)), mode='lines', name=f'gamma={gamma}'))
fig.update_layout(title='log(Cross Entropy / Focal Loss)', xaxis_title='p', yaxis_title='log(Cross Entropy / Focal Loss)')
fig.show()