The InceptionV3 paper observes that hard one-hot labels encourage overfitting, and that label smoothing can improve classification accuracy.
A simple implementation is:
smooth_labels = (1.0 - label_smoothing) * one_hot_labels + label_smoothing / num_classes
where label_smoothing can be set to 0.1, for example, and num_classes is the number of classes.
Several concrete implementations are collected below.
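Before the full implementations, the formula can be checked directly on a small one-hot tensor (a minimal sketch; the shapes and smoothing value are illustrative):

```python
import torch
import torch.nn.functional as F

num_classes = 5
label_smoothing = 0.1

labels = torch.tensor([2, 0])                              # integer class indices
one_hot_labels = F.one_hot(labels, num_classes).float()    # hard one-hot targets
smooth_labels = (1.0 - label_smoothing) * one_hot_labels + label_smoothing / num_classes
print(smooth_labels)
# tensor([[0.0200, 0.0200, 0.9200, 0.0200, 0.0200],
#         [0.9200, 0.0200, 0.0200, 0.0200, 0.0200]])
```

Note that this formula spreads label_smoothing / num_classes over all classes, including the true one; several of the implementations below instead put smoothing / (num_classes - 1) on the non-true classes only. Both variants are common.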
1. Example 1
import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothLoss(nn.Module):
    def __init__(self, smoothing=0.0):
        super(LabelSmoothLoss, self).__init__()
        self.smoothing = smoothing

    def forward(self, input, target):
        log_prob = F.log_softmax(input, dim=-1)
        # Uniform weight of smoothing / (C - 1) on every class ...
        weight = input.new_ones(input.size()) * \
            self.smoothing / (input.size(-1) - 1.)
        # ... and (1 - smoothing) on the ground-truth class.
        weight.scatter_(-1, target.unsqueeze(-1), (1. - self.smoothing))
        loss = (-weight * log_prob).sum(dim=-1).mean()
        return loss
Verification by the answer's author:
[1] - When smoothing=0.0, the output matches nn.CrossEntropyLoss to within a precision of 1e-5.
[2] - When smoothing>0.0, the per-sample weights always sum to 1 across classes (weight.sum(dim=-1)).
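Check [1] can be reproduced with a few lines (a minimal sketch, assuming the LabelSmoothLoss class above is in scope; the shapes are arbitrary):

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 10)                 # batch of 4 samples, 10 classes
targets = torch.randint(0, 10, (4,))

# With smoothing=0.0 the custom loss should match nn.CrossEntropyLoss.
custom = LabelSmoothLoss(smoothing=0.0)(logits, targets)
reference = nn.CrossEntropyLoss()(logits, targets)
assert torch.allclose(custom, reference, atol=1e-5)
```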
2. Example 2
This is done in two steps.
[1] - Define a function that produces the smooth label:
def smooth_one_hot(true_labels: torch.Tensor, classes: int, smoothing=0.0):
    """
    if smoothing == 0, it's one-hot method
    if 0 < smoothing < 1, it's smooth method
    """
    assert 0 <= smoothing < 1
    confidence = 1.0 - smoothing
    label_shape = torch.Size((true_labels.size(0), classes))
    with torch.no_grad():
        true_dist = torch.empty(size=label_shape, device=true_labels.device)
        true_dist.fill_(smoothing / (classes - 1))
        true_dist.scatter_(1, true_labels.data.unsqueeze(1), confidence)
    return true_dist
[2] - Make CrossEntropyLoss support k-hot/smoothed targets.
It is then used as follows (pseudocode):
Loss = CrossEntropyLoss(NonSparse=True, ...)
. . .
data = ...
labels = ...
outputs = model(data)
smooth_label = smooth_one_hot(labels, ...)
loss = Loss(outputs, smooth_label)
...
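The NonSparse=True argument above is pseudocode: older PyTorch releases have no such option on nn.CrossEntropyLoss (recent versions, 1.10+, do accept probability targets and provide a built-in label_smoothing argument). A minimal soft-target cross entropy that pairs with smooth_one_hot might look like this (a sketch, not part of the original answer):

```python
import torch
import torch.nn.functional as F

def soft_target_cross_entropy(logits, soft_targets):
    # Cross entropy against a full target distribution instead of class indices.
    log_probs = F.log_softmax(logits, dim=-1)
    return -(soft_targets * log_probs).sum(dim=-1).mean()

# Hypothetical usage, assuming smooth_one_hot from step [1] is in scope.
logits = torch.randn(4, 10)
labels = torch.randint(0, 10, (4,))
smooth_label = smooth_one_hot(labels, classes=10, smoothing=0.1)
loss = soft_target_cross_entropy(logits, smooth_label)
```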
Test results reported by the answer's author:
model | epochs | dtype | batch size* | gpus | lr | tricks | top1/top5 | improve |
---|---|---|---|---|---|---|---|---|
resnet50 | 120 | FP16 | 128 | 8 | 0.4 | - | 77.35/- | baseline |
resnet50 | 120 | FP16 | 128 | 8 | 0.4 | Label smoothing | 77.78/93.80 | +0.43 |
3. Example 3
From: Github - NVIDIA/DeepLearningExamples/PyTorch/Classification
Definition of label smoothing - smoothing.py:
import torch
import torch.nn as nn

class LabelSmoothing(nn.Module):
    """
    NLL loss with label smoothing.
    """
    def __init__(self, smoothing=0.0):
        """
        Constructor for the LabelSmoothing module.
        :param smoothing: label smoothing factor
        """
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = torch.nn.functional.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
How it is called in the training code - main.py:
import torch
import torch.nn as nn
from smoothing import LabelSmoothing

def add_parser_arguments(parser):
    parser.add_argument('--label-smoothing',
                        default=0.0,
                        type=float,
                        metavar='S',
                        help='label smoothing')

def main(args):
    loss = nn.CrossEntropyLoss
    if args.label_smoothing > 0.0:
        loss = lambda: LabelSmoothing(args.label_smoothing)
    criterion = loss()
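Once built, the criterion is used like any other loss module. A minimal sketch (the model, data, and smoothing value here are illustrative, not taken from the NVIDIA repo):

```python
import torch
import torch.nn as nn

criterion = LabelSmoothing(smoothing=0.1)

# Dummy classifier and batch, just to show the call signature.
model = nn.Linear(32, 10)
data = torch.randn(8, 32)
target = torch.randint(0, 10, (8,))

loss = criterion(model(data), target)
loss.backward()
```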
4. Example 4
import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # true_dist = pred.data.clone()
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
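Usage is the same as nn.CrossEntropyLoss, except the number of classes must be passed at construction time (a minimal sketch; the shapes are illustrative):

```python
import torch

criterion = LabelSmoothingLoss(classes=10, smoothing=0.1)
logits = torch.randn(8, 10, requires_grad=True)
target = torch.randint(0, 10, (8,))
loss = criterion(logits, target)
loss.backward()
```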
5. Example 5: Github - NJUNMT-pytorch
import torch
import torch.nn as nn

# Criterion (the repo's loss base class) and PAD (the padding token index)
# are defined elsewhere in the NJUNMT-pytorch codebase.

class NMTCriterion(Criterion):
    """
    A commonly used criterion for neural machine translation.

    NMTCriterion is used for MLE training given golden target sample.
    Additional label_smoothing is supported.
    """
    def __init__(self, padding_idx=PAD, label_smoothing=0.0):
        super().__init__()
        self.padding_idx = padding_idx
        self.label_smoothing = label_smoothing
        if label_smoothing > 0:
            self.criterion = nn.KLDivLoss(size_average=False, reduce=False)
        else:
            self.criterion = nn.NLLLoss(size_average=False, ignore_index=padding_idx, reduce=False)
        self.confidence = 1.0 - label_smoothing

    def _smooth_label(self, num_tokens):
        # When label smoothing is turned on,
        # KL-divergence between q_{smoothed ground truth prob.}(w)
        # and p_{prob. computed by model}(w) is minimized.
        # If label smoothing value is set to zero, the loss
        # is equivalent to NLLLoss or CrossEntropyLoss.
        # All non-true labels are uniformly set to low-confidence.
        # The mass is divided by (num_tokens - 2) because both the gold
        # token and the padding token are excluded from the smoothing.
        one_hot = torch.randn(1, num_tokens)
        one_hot.fill_(self.label_smoothing / (num_tokens - 2))
        one_hot[0][self.padding_idx] = 0
        return one_hot

    def _bottle(self, v):
        return v.view(-1, v.size(2))

    def _compute_loss(self, inputs, labels, **kwargs):
        """
        Args:
            inputs (..., K): Expect logarithm probabilities.
            labels (...,): Index tensor. Should be the same size as inputs except the last dimension.
        """
        batch_size = labels.size(0)
        scores = self._bottle(inputs)  # [batch_size * seq_len, d_words]
        num_tokens = scores.size(-1)
        gtruth = labels.view(-1)
        if self.confidence < 1:
            # N: the number of samples
            # M: the number of labels
            tdata = gtruth.detach()
            # mask of PAD
            mask = torch.nonzero(tdata.eq(self.padding_idx)).squeeze()
            # Do label smoothing
            one_hot = self._smooth_label(num_tokens)
            if labels.is_cuda:
                one_hot = one_hot.cuda()
            tmp_ = one_hot.repeat(gtruth.size(0), 1)  # [N, M]
            tmp_.scatter_(1, tdata.unsqueeze(1), self.confidence)
            if mask.numel() > 0:
                tmp_.index_fill_(0, mask, 0)
            gtruth = tmp_.detach()
        loss = self.criterion(scores, gtruth).view((batch_size, -1)).sum(-1)
        return loss
6. Example 6: Github - pytorch-loss
#!/usr/bin/python
# -*- encoding: utf-8 -*-

import torch
import torch.nn as nn

class LabelSmoothSoftmaxCEV1(nn.Module):
    '''
    This is the autograd version, you can also try the LabelSmoothSoftmaxCEV2 that uses derived gradients
    '''
    def __init__(self, lb_smooth=0.1, reduction='mean', ignore_index=-100):
        super(LabelSmoothSoftmaxCEV1, self).__init__()
        self.lb_smooth = lb_smooth
        self.reduction = reduction
        self.lb_ignore = ignore_index
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, logits, label):
        '''
        args: logits: tensor of shape (N, C, H, W)
        args: label: tensor of shape(N, H, W)
        '''
        # overcome ignored label
        with torch.no_grad():
            num_classes = logits.size(1)
            label = label.clone().detach()
            ignore = label == self.lb_ignore
            n_valid = (ignore == 0).sum()
            label[ignore] = 0
            lb_pos, lb_neg = 1. - self.lb_smooth, self.lb_smooth / num_classes
            label = torch.empty_like(logits).fill_(
                lb_neg).scatter_(1, label.unsqueeze(1), lb_pos).detach()

        logs = self.log_softmax(logits)
        loss = -torch.sum(logs * label, dim=1)
        loss[ignore] = 0
        if self.reduction == 'mean':
            loss = loss.sum() / n_valid
        if self.reduction == 'sum':
            loss = loss.sum()
        return loss


class LSRCrossEntropyFunction(torch.autograd.Function):

    @staticmethod
    def forward(ctx, logits, label, lb_smooth, reduction, lb_ignore):
        # prepare label
        num_classes = logits.size(1)
        label = label.clone().detach()
        ignore = label == lb_ignore
        n_valid = (ignore == 0).sum()
        label[ignore] = 0
        lb_pos, lb_neg = 1. - lb_smooth, lb_smooth / num_classes
        label = torch.empty_like(logits).fill_(
            lb_neg).scatter_(1, label.unsqueeze(1), lb_pos).detach()

        ignore = ignore.nonzero()
        _, M = ignore.size()
        a, *b = ignore.chunk(M, dim=1)
        mask = [a, torch.arange(label.size(1)), *b]
        label[mask] = 0

        coeff = (num_classes - 1) * lb_neg + lb_pos
        ctx.coeff = coeff
        ctx.mask = mask
        ctx.logits = logits
        ctx.label = label
        ctx.reduction = reduction
        ctx.n_valid = n_valid

        loss = torch.log_softmax(logits, dim=1).neg_().mul_(label).sum(dim=1)
        if reduction == 'mean':
            loss = loss.sum().div_(n_valid)
        if reduction == 'sum':
            loss = loss.sum()
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        coeff = ctx.coeff
        mask = ctx.mask
        logits = ctx.logits
        label = ctx.label
        reduction = ctx.reduction
        n_valid = ctx.n_valid

        scores = torch.softmax(logits, dim=1).mul_(coeff)
        scores[mask] = 0
        if reduction == 'none':
            grad = scores.sub_(label).mul_(grad_output.unsqueeze(1))
        elif reduction == 'sum':
            grad = scores.sub_(label).mul_(grad_output)
        elif reduction == 'mean':
            grad = scores.sub_(label).mul_(grad_output.div_(n_valid))
        return grad, None, None, None, None, None


class LabelSmoothSoftmaxCEV2(nn.Module):

    def __init__(self, lb_smooth=0.1, reduction='mean', ignore_index=-100):
        super(LabelSmoothSoftmaxCEV2, self).__init__()
        self.lb_smooth = lb_smooth
        self.reduction = reduction
        self.lb_ignore = ignore_index

    def forward(self, logits, label):
        return LSRCrossEntropyFunction.apply(
            logits, label,
            self.lb_smooth,
            self.reduction,
            self.lb_ignore)
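Both classes take segmentation-style inputs: logits of shape (N, C, H, W) and integer labels of shape (N, H, W), with ignore_index marking positions to skip. A minimal sketch of calling them (the shapes and smoothing value are illustrative):

```python
import torch

logits = torch.randn(2, 19, 8, 8, requires_grad=True)   # N=2, C=19 classes, 8x8 spatial
labels = torch.randint(0, 19, (2, 8, 8))
labels[0, 0, 0] = -100                                    # one ignored position

loss_v1 = LabelSmoothSoftmaxCEV1(lb_smooth=0.1)(logits, labels)
loss_v2 = LabelSmoothSoftmaxCEV2(lb_smooth=0.1)(logits, labels)
print(loss_v1.item(), loss_v2.item())
loss_v1.backward()
```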
7. Example 7: Github - PistonY/torch-toolbox
import torch
from torch import nn

@torch.no_grad()
def smooth_one_hot(true_labels: torch.Tensor, classes: int, smoothing=0.0):
    """
    if smoothing == 0, it's one-hot method
    if 0 < smoothing < 1, it's smooth method
    Warning: This function has no grad.
    """
    # assert 0 <= smoothing < 1
    confidence = 1.0 - smoothing
    label_shape = torch.Size((true_labels.size(0), classes))
    smooth_label = torch.empty(size=label_shape, device=true_labels.device)
    smooth_label.fill_(smoothing / (classes - 1))
    smooth_label.scatter_(1, true_labels.data.unsqueeze(1), confidence)
    return smooth_label


class LabelSmoothingLoss(nn.Module):
    """This is label smoothing loss function.
    """
    def __init__(self, classes, smoothing=0.0, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        true_dist = smooth_one_hot(target, self.cls, self.smoothing)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))
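To see what the smoothed targets actually look like, smooth_one_hot can be called on its own (a small illustrative example):

```python
import torch

labels = torch.tensor([0, 2])
print(smooth_one_hot(labels, classes=4, smoothing=0.1))
# tensor([[0.9000, 0.0333, 0.0333, 0.0333],
#         [0.0333, 0.0333, 0.9000, 0.0333]])
```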