InceptionV3 - Label Smoothing Regularization (LSR) - AIUAI

The InceptionV3 paper points out that hard one-hot encoded labels can lead to overfitting, and that label smoothing improves classification accuracy.

A simple implementation looks like:

smooth_labels = (1.0 - label_smoothing) * one_hot_labels + label_smoothing / num_classes

where label_smoothing can be set to, e.g., 0.1, and num_classes is the number of classes.
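As a quick numeric sketch of the formula (num_classes=5 and label_smoothing=0.1 are illustrative values):

import torch

num_classes = 5
label_smoothing = 0.1

one_hot_labels = torch.zeros(num_classes)   # hard one-hot label for class 2
one_hot_labels[2] = 1.0

smooth_labels = (1.0 - label_smoothing) * one_hot_labels + label_smoothing / num_classes
print(smooth_labels)   # tensor([0.0200, 0.0200, 0.9200, 0.0200, 0.0200])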

Concrete examples are given below.

1. Example 1

From: PyTorch Label Smoothing for CrossEntropyLoss#7455

import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothLoss(nn.Module):
    def __init__(self, smoothing=0.0):
        super(LabelSmoothLoss, self).__init__()
        self.smoothing = smoothing

    def forward(self, input, target):
        log_prob = F.log_softmax(input, dim=-1)
        # uniform mass smoothing/(C-1) on the wrong classes, 1-smoothing on the true class
        weight = input.new_ones(input.size()) * \
            self.smoothing / (input.size(-1) - 1.)
        weight.scatter_(-1, target.unsqueeze(-1), (1. - self.smoothing))
        loss = (-weight * log_prob).sum(dim=-1).mean()
        return loss

The answerer's verification results:

[1] - When smoothing=0.0, the output is the same as nn.CrossEntropyLoss within precision 1e-5.
[2] - When smoothing>0.0, the sums of weights over classes, weight.sum(dim=-1), are always 1.
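A minimal usage sketch that also reproduces check [1] (the shapes and smoothing value are illustrative):

import torch
import torch.nn as nn

criterion = LabelSmoothLoss(smoothing=0.1)
logits = torch.randn(4, 10)            # (batch, num_classes)
target = torch.randint(0, 10, (4,))    # hard class indices
loss = criterion(logits, target)

# check [1]: with smoothing=0.0 the result matches nn.CrossEntropyLoss
plain = LabelSmoothLoss(smoothing=0.0)(logits, target)
print(torch.allclose(plain, nn.CrossEntropyLoss()(logits, target), atol=1e-5))  # True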

2. Example 2

From: PyTorch Label Smoothing for CrossEntropyLoss#7455

This is done in two steps.

[1] - Define a function that builds the smoothed labels:

import torch

def smooth_one_hot(true_labels: torch.Tensor, classes: int, smoothing=0.0):
    """
    if smoothing == 0, it's one-hot method
    if 0 < smoothing < 1, it's smooth method
    """
    assert 0 <= smoothing < 1
    confidence = 1.0 - smoothing
    label_shape = torch.Size((true_labels.size(0), classes))
    with torch.no_grad():
        true_dist = torch.empty(size=label_shape, device=true_labels.device)
        true_dist.fill_(smoothing / (classes - 1))
        true_dist.scatter_(1, true_labels.data.unsqueeze(1), confidence)
    return true_dist

[2] - Use a CrossEntropyLoss that accepts k-hot/smoothed targets (see the sketch after the usage snippet below).

Usage then looks like:

Loss = CrossEntropyLoss(NonSparse=True, ...)
. . .

data = ...
labels = ...

outputs = model(data)

smooth_label = smooth_one_hot(labels, ...)
loss = Loss(outputs, smooth_label)
...
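The CrossEntropyLoss(NonSparse=True, ...) line above is pseudo-code from the forum answer; the stock nn.CrossEntropyLoss only accepts hard class indices until PyTorch 1.10, which added probability targets and a built-in label_smoothing argument. A minimal sketch of a soft-target cross entropy that pairs with smooth_one_hot above (the name soft_cross_entropy and the shapes are illustrative, not part of the original answer):

import torch
import torch.nn.functional as F

def soft_cross_entropy(logits, soft_targets):
    # cross entropy against a full per-sample probability distribution
    log_prob = F.log_softmax(logits, dim=-1)
    return torch.mean(torch.sum(-soft_targets * log_prob, dim=-1))

logits = torch.randn(8, 10)             # model outputs, (N, C)
labels = torch.randint(0, 10, (8,))     # hard class indices, (N,)
smooth_label = smooth_one_hot(labels, classes=10, smoothing=0.1)
loss = soft_cross_entropy(logits, smooth_label)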

The answerer's test results:

| model    | epochs | dtype | batch size*gpus | lr  | tricks          | top1/top5   | improve  |
| -------- | ------ | ----- | --------------- | --- | --------------- | ----------- | -------- |
| resnet50 | 120    | FP16  | 128*8           | 0.4 | -               | 77.35/-     | baseline |
| resnet50 | 120    | FP16  | 128*8           | 0.4 | Label smoothing | 77.78/93.80 | +0.43    |

3. Example 3

From: Github - NVIDIA/DeepLearningExamples/PyTorch/Classification

Label smoothing definition - smoothing.py:

import torch
import torch.nn as nn

class LabelSmoothing(nn.Module):
    """NLL loss with label smoothing."""
    def __init__(self, smoothing=0.0):
        """
        Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
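A quick usage sketch (the shapes and smoothing value are illustrative, not part of the repository):

import torch

criterion = LabelSmoothing(smoothing=0.1)
logits = torch.randn(4, 1000)            # (batch, num_classes)
target = torch.randint(0, 1000, (4,))    # hard class indices
loss = criterion(logits, target)         # scalar tensor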

Usage inside the training script - main.py:

import torch
import torch.nn as nn

from smoothing import LabelSmoothing

def add_parser_arguments(parser):
    parser.add_argument('--label-smoothing', default=0.0, type=float,
                        metavar='S', help='label smoothing')

def main(args):
    loss = nn.CrossEntropyLoss
    if args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)
    criterion = loss()

4. Example 4

From: PyTorch Label Smoothing for CrossEntropyLoss#7455

import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # true_dist = pred.data.clone()
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
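A brief usage sketch (the class count, batch size, and smoothing value are illustrative):

import torch

criterion = LabelSmoothingLoss(classes=10, smoothing=0.1)
pred = torch.randn(4, 10)              # raw logits, (batch, classes)
target = torch.randint(0, 10, (4,))
loss = criterion(pred, target)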

5. Example 5: Github - NJUNMT-pytorch

Github - NJUNMT-pytorch

NJUNMT-pytorch/src/modules/criterions.py#L45

import torch
import torch.nn as nn

# `Criterion` (base class) and `PAD` (padding token id) are defined elsewhere
# in the NJUNMT-pytorch repository.
class NMTCriterion(Criterion):
    """ A commonly used criterion for neural machine translation.

    NMTCriterion is used for MLE training given golden target samples.
    Additional label_smoothing is supported.
    """
    def __init__(self, padding_idx=PAD, label_smoothing=0.0):
        super().__init__()
        self.padding_idx = padding_idx
        self.label_smoothing = label_smoothing

        if label_smoothing > 0:
            self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        else:
            self.criterion = nn.NLLLoss(size_average=False,
                                        ignore_index=padding_idx,
                                        reduce=False)
        self.confidence = 1.0 - label_smoothing

    def _smooth_label(self, num_tokens):
        # When label smoothing is turned on,
        # KL-divergence between q_{smoothed ground truth prob.}(w)
        # and p_{prob. computed by model}(w) is minimized.
        # If label smoothing value is set to zero, the loss
        # is equivalent to NLLLoss or CrossEntropyLoss.
        # All non-true labels are uniformly set to low-confidence.
        one_hot = torch.randn(1, num_tokens)
        one_hot.fill_(self.label_smoothing / (num_tokens - 2))
        one_hot[0][self.padding_idx] = 0
        return one_hot

    def _bottle(self, v):
        return v.view(-1, v.size(2))

    def _compute_loss(self, inputs, labels, **kwargs):
        """
        Args:
            inputs (..., K): Expect logarithm probabilities.
            labels (...,): Index tensor. Should be the same size as inputs
                except the last dimension.
        """
        batch_size = labels.size(0)
        scores = self._bottle(inputs)  # [batch_size * seq_len, d_words]
        num_tokens = scores.size(-1)
        gtruth = labels.view(-1)

        if self.confidence < 1:
            # N: the number of samples
            # M: the number of labels
            tdata = gtruth.detach()
            # mask of PAD
            mask = torch.nonzero(tdata.eq(self.padding_idx)).squeeze()
            # Do label smoothing
            one_hot = self._smooth_label(num_tokens)
            if labels.is_cuda:
                one_hot = one_hot.cuda()
            tmp_ = one_hot.repeat(gtruth.size(0), 1)  # [N, M]
            tmp_.scatter_(1, tdata.unsqueeze(1), self.confidence)
            if mask.numel() > 0:
                tmp_.index_fill_(0, mask, 0)
            gtruth = tmp_.detach()

        loss = self.criterion(scores, gtruth).view((batch_size, -1)).sum(-1)
        return loss
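Note that the smoothing mass is divided by num_tokens - 2 rather than num_tokens - 1: one slot is the true token (which receives confidence) and one is the padding index (which is zeroed out). A tiny sketch of the resulting per-token target distribution, with an illustrative toy vocabulary:

import torch

label_smoothing = 0.1
num_tokens = 6        # toy vocabulary size
padding_idx = 0
true_idx = 3

dist = torch.full((num_tokens,), label_smoothing / (num_tokens - 2))
dist[padding_idx] = 0.0
dist[true_idx] = 1.0 - label_smoothing
print(dist.sum())     # 1.0 = 0.9 + 4 * 0.025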

6. Example 6: Github - pytorch-loss

Github - CoinCheung/pytorch-loss

Github - CoinCheung/pytorch-loss/label_smooth.py

#!/usr/bin/python
# -*- encoding: utf-8 -*-
import torch
import torch.nn as nn


class LabelSmoothSoftmaxCEV1(nn.Module):
    '''
    This is the autograd version, you can also try the LabelSmoothSoftmaxCEV2
    that uses derived gradients
    '''
    def __init__(self, lb_smooth=0.1, reduction='mean', ignore_index=-100):
        super(LabelSmoothSoftmaxCEV1, self).__init__()
        self.lb_smooth = lb_smooth
        self.reduction = reduction
        self.lb_ignore = ignore_index
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, logits, label):
        '''
        args: logits: tensor of shape (N, C, H, W)
        args: label: tensor of shape (N, H, W)
        '''
        # overcome ignored label
        with torch.no_grad():
            num_classes = logits.size(1)
            label = label.clone().detach()
            ignore = label == self.lb_ignore
            n_valid = (ignore == 0).sum()
            label[ignore] = 0
            lb_pos, lb_neg = 1. - self.lb_smooth, self.lb_smooth / num_classes
            label = torch.empty_like(logits).fill_(
                lb_neg).scatter_(1, label.unsqueeze(1), lb_pos).detach()

        logs = self.log_softmax(logits)
        loss = -torch.sum(logs * label, dim=1)
        loss[ignore] = 0
        if self.reduction == 'mean':
            loss = loss.sum() / n_valid
        if self.reduction == 'sum':
            loss = loss.sum()
        return loss


class LSRCrossEntropyFunction(torch.autograd.Function):

    @staticmethod
    def forward(ctx, logits, label, lb_smooth, reduction, lb_ignore):
        # prepare label
        num_classes = logits.size(1)
        label = label.clone().detach()
        ignore = label == lb_ignore
        n_valid = (ignore == 0).sum()
        label[ignore] = 0
        lb_pos, lb_neg = 1. - lb_smooth, lb_smooth / num_classes
        label = torch.empty_like(logits).fill_(
            lb_neg).scatter_(1, label.unsqueeze(1), lb_pos).detach()

        ignore = ignore.nonzero()
        _, M = ignore.size()
        a, *b = ignore.chunk(M, dim=1)
        mask = [a, torch.arange(label.size(1)), *b]
        label[mask] = 0

        coeff = (num_classes - 1) * lb_neg + lb_pos
        ctx.coeff = coeff
        ctx.mask = mask
        ctx.logits = logits
        ctx.label = label
        ctx.reduction = reduction
        ctx.n_valid = n_valid

        loss = torch.log_softmax(logits, dim=1).neg_().mul_(label).sum(dim=1)
        if reduction == 'mean':
            loss = loss.sum().div_(n_valid)
        if reduction == 'sum':
            loss = loss.sum()
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        coeff = ctx.coeff
        mask = ctx.mask
        logits = ctx.logits
        label = ctx.label
        reduction = ctx.reduction
        n_valid = ctx.n_valid

        scores = torch.softmax(logits, dim=1).mul_(coeff)
        scores[mask] = 0
        if reduction == 'none':
            grad = scores.sub_(label).mul_(grad_output.unsqueeze(1))
        elif reduction == 'sum':
            grad = scores.sub_(label).mul_(grad_output)
        elif reduction == 'mean':
            grad = scores.sub_(label).mul_(grad_output.div_(n_valid))
        return grad, None, None, None, None, None


class LabelSmoothSoftmaxCEV2(nn.Module):

    def __init__(self, lb_smooth=0.1, reduction='mean', ignore_index=-100):
        super(LabelSmoothSoftmaxCEV2, self).__init__()
        self.lb_smooth = lb_smooth
        self.reduction = reduction
        self.lb_ignore = ignore_index

    def forward(self, logits, label):
        return LSRCrossEntropyFunction.apply(
            logits, label, self.lb_smooth, self.reduction, self.lb_ignore)
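Both losses expect dense (segmentation-style) inputs of shape (N, C, H, W) with labels of shape (N, H, W). A minimal usage sketch, with illustrative shapes and an ignored label of 255:

import torch

criterion_v1 = LabelSmoothSoftmaxCEV1(lb_smooth=0.1, ignore_index=255)
criterion_v2 = LabelSmoothSoftmaxCEV2(lb_smooth=0.1, ignore_index=255)

logits = torch.randn(2, 19, 32, 32)        # (N, C, H, W)
label = torch.randint(0, 19, (2, 32, 32))  # (N, H, W)
label[0, 0, 0] = 255                       # one ignored pixel

loss_v1 = criterion_v1(logits, label)
loss_v2 = criterion_v2(logits, label)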

7. Example 7: Github - PistonY/torch-toolbox

Github - PistonY/torch-toolbox

import torch
from torch import nn


@torch.no_grad()
def smooth_one_hot(true_labels: torch.Tensor, classes: int, smoothing=0.0):
    """
    if smoothing == 0, it's one-hot method
    if 0 < smoothing < 1, it's smooth method

    Warning: This function has no grad.
    """
    # assert 0 <= smoothing < 1
    confidence = 1.0 - smoothing
    label_shape = torch.Size((true_labels.size(0), classes))
    smooth_label = torch.empty(size=label_shape, device=true_labels.device)
    smooth_label.fill_(smoothing / (classes - 1))
    smooth_label.scatter_(1, true_labels.data.unsqueeze(1), confidence)
    return smooth_label


class LabelSmoothingLoss(nn.Module):
    """This is label smoothing loss function."""

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        true_dist = smooth_one_hot(target, self.cls, self.smoothing)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
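A short sketch of what smooth_one_hot produces and how the loss is applied (the label values and class count are illustrative):

import torch

labels = torch.tensor([1, 3])
print(smooth_one_hot(labels, classes=4, smoothing=0.1))
# tensor([[0.0333, 0.9000, 0.0333, 0.0333],
#         [0.0333, 0.0333, 0.0333, 0.9000]])

criterion = LabelSmoothingLoss(classes=4, smoothing=0.1)
loss = criterion(torch.randn(2, 4), labels)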

Related material

[1] - Deep learning | Network training trick: label smoothing (with Caffe implementation)

[2] - Github - qiu931110/Caffe_label_smooth
