基于 SoftmaxWithLossLayer 实现的 FocalLossLayer.
主要涉及四个文件:
- caffe.proto
- focal_loss_layer.hpp
- focal_loss_layer.cpp
- focal_loss_layer.cu
1. 修改 caffe.proto 文件
在 caffe_root/src/caffe/proto/caffe.proto
中添加如下内容:
// Add the following field inside the existing `message LayerParameter`
// (do not declare a second LayerParameter message).
message LayerParameter {
optional FocalLossParameter focal_loss_param = 151; // pick any field id that is not already used in LayerParameter.
}
// Add the following message to your caffe.proto.
// Message that stores the parameters used by FocalLossLayer.
message FocalLossParameter {
// loss = -alpha * (1 - pk)^gamma * ln(pk)
// where pk is the softmax probability of the labelled class.
// alpha is a factor that scales the loss.
optional float alpha = 1 [default = 0.25];
// gamma is the exponent applied to the (1 - pk) factor.
optional float gamma = 2 [default = 2.00];
}
2. 添加 focal_loss_layer.hpp 文件
将 focal_loss_layer.hpp
文件添加到 caffe_root/include/caffe/layers/
:
// focal_loss_layer.hpp -- implementation of <<Focal Loss for Dense Object Detection>>
// modified from softmax_loss_layer.hpp
#ifndef CAFFE_FOCAL_LOSS_LAYER_HPP_
#define CAFFE_FOCAL_LOSS_LAYER_HPP_
#include <vector>
#include "caffe/blob.hpp"
#include "caffe/layer.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/layers/loss_layer.hpp"
#include "caffe/layers/softmax_layer.hpp"
namespace caffe {
/**
* @brief Computes the focal loss for a one-of-many classification task,
* passing real-valued predictions through a softmax to get a
* probability distribution over classes.
*
* With @f$ \hat{p}_{n,l_n} @f$ the softmax probability of the labelled
* class, every non-ignored prediction contributes
* @f$ -\alpha (1 - \hat{p}_{n,l_n})^\gamma \log(\hat{p}_{n,l_n}) @f$,
* where alpha and gamma are read from focal_loss_param.
* The layer is modelled on SoftmaxWithLossLayer and runs an internal
* SoftmaxLayer on bottom[0].
*
* @param bottom input Blob vector (length 2)
* -# the predictions @f$ x @f$, e.g. @f$ (N \times C \times H \times W) @f$
* with the softmax taken over axis 1, a Blob with values in
* @f$ [-\infty, +\infty] @f$ indicating the predicted score for each of
* the @f$ C @f$ classes. This layer maps these scores to a
* probability distribution over classes using the softmax function
* @f$ \hat{p}_{nk} = \exp(x_{nk}) /
* \left[\sum_{k'} \exp(x_{nk'})\right] @f$ (see SoftmaxLayer).
* -# the labels @f$ l @f$, an integer-valued Blob holding one label per
* prediction (e.g. @f$ N \cdot H \cdot W @f$ values) with values
* @f$ l_n \in [0, 1, 2, ..., C - 1] @f$
* indicating the correct class label among the @f$ C @f$ classes
* @param top output Blob vector (length 1 or 2)
* -# the computed focal loss: the sum of the per-prediction terms above
* divided by the normalizer returned by get_normalizer()
* -# (optional) the softmax class probabilities @f$ \hat{p} @f$, shaped
* like bottom[0]
*/
// Derives from LossLayer.
template <typename Dtype>
class FocalLossLayer : public LossLayer<Dtype> {
public:
/**
* @param param provides LossParameter loss_param, with options:
* - ignore_label (optional)
* Specify a label value that should be ignored when computing the loss.
* - normalization (optional)
* Selects how the summed loss is normalized (FULL, VALID, BATCH_SIZE
* or NONE).
* - normalize (optional, legacy)
* Only consulted when normalization is not set: true selects VALID,
* false selects BATCH_SIZE.
* It also provides FocalLossParameter focal_loss_param, with options:
* - alpha: factor that scales the loss.
* - gamma: exponent of the (1 - p) factor.
*/
explicit FocalLossLayer(const LayerParameter& param)
: LossLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual inline const char* type() const { return "FocalLoss"; }
// -1: no exact top count is imposed; the Min/Max limits below apply instead.
virtual inline int ExactNumTopBlobs() const { return -1; }
virtual inline int MinTopBlobs() const { return 1; }
virtual inline int MaxTopBlobs() const { return 2; }
protected:
/// Computes the focal loss on the CPU and writes it to top[0].
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
/// Computes the focal loss on the GPU and writes it to top[0].
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
/**
* @brief Computes the focal loss error gradient w.r.t. the predictions.
*
* Gradients cannot be computed with respect to the label inputs (bottom[1]),
* so this method ignores bottom[1] and requires !propagate_down[1], crashing
* if propagate_down[1] is set.
*
* @param top output Blob vector (length 1), providing the error gradient with
* respect to the outputs
* -# @f$ (1 \times 1 \times 1 \times 1) @f$
* This Blob's diff will simply contain the loss_weight* @f$ \lambda @f$,
* as @f$ \lambda @f$ is the coefficient of this layer's output
* @f$\ell_i@f$ in the overall Net loss
* @f$ E = \lambda_i \ell_i + \mbox{other loss terms}@f$; hence
* @f$ \frac{\partial E}{\partial \ell_i} = \lambda_i @f$.
* (*Assuming that this top Blob is not used as a bottom (input) by any
* other layer of the Net.)
* @param propagate_down see Layer::Backward.
* propagate_down[1] must be false as we can't compute gradients with
* respect to the labels.
* @param bottom input Blob vector (length 2)
* -# @f$ (N \times C \times H \times W) @f$
* the predictions @f$ x @f$; Backward computes diff
* @f$ \frac{\partial E}{\partial x} @f$
* -# the labels -- ignored as we can't compute their error gradients
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);
/// Read the normalization mode parameter and compute the normalizer based
/// on the blob size. If normalization_mode is VALID, the count of valid
/// outputs will be read from valid_count, unless it is -1 in which case
/// all outputs are assumed to be valid. The result is never below 1.
virtual Dtype get_normalizer(
LossParameter_NormalizationMode normalization_mode, int valid_count);
/// The internal SoftmaxLayer used to map predictions to a distribution.
shared_ptr<Layer<Dtype> > softmax_layer_;
/// prob stores the output probability predictions from the SoftmaxLayer.
Blob<Dtype> prob_;
/// bottom vector holder used in call to the underlying SoftmaxLayer::Forward
vector<Blob<Dtype>*> softmax_bottom_vec_;
/// top vector holder used in call to the underlying SoftmaxLayer::Forward
vector<Blob<Dtype>*> softmax_top_vec_;
/// Whether to ignore instances with a certain label.
bool has_ignore_label_;
/// The label indicating that an instance should be ignored.
/// Only assigned (and only read) when has_ignore_label_ is true.
int ignore_label_;
/// How to normalize the output loss.
LossParameter_NormalizationMode normalization_;
/// Softmax axis, and the number of elements before / after that axis.
int softmax_axis_, outer_num_, inner_num_;
/// Focal loss factors: alpha_ scales the loss, gamma_ is the exponent of
/// the (1 - p) factor.
Dtype alpha_, gamma_;
};
} // namespace caffe
#endif // CAFFE_FOCAL_LOSS_LAYER_HPP_
3. 添加 focal_loss_layer.cpp 文件
将 focal_loss_layer.cpp
添加到 caffe_root/src/caffe/layers/
中:
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <vector>
#include "caffe/layers/softmax_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/layers/focal_loss_layer.hpp"
namespace caffe {
/**
 * @brief One-time setup: creates the internal softmax layer and reads the
 *        loss and focal-loss settings from the layer definition.
 *
 * @param bottom bottom[0] holds the prediction scores; it is wired as the
 *               input of the internal softmax layer.
 * @param top    forwarded to LossLayer::LayerSetUp.
 */
template <typename Dtype>
void FocalLossLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::LayerSetUp(bottom, top);

  // The internal softmax reuses this layer's parameters; only the type is
  // changed so that the registry creates a "Softmax" layer.
  LayerParameter softmax_param(this->layer_param_);
  softmax_param.set_type("Softmax");
  softmax_layer_ = LayerRegistry<Dtype>::CreateLayer(softmax_param);
  softmax_bottom_vec_.clear();
  softmax_bottom_vec_.push_back(bottom[0]);
  softmax_top_vec_.clear();
  softmax_top_vec_.push_back(&prob_);
  softmax_layer_->SetUp(softmax_bottom_vec_, softmax_top_vec_);

  // ignore_label_ is only assigned (and later only read) when a label to
  // ignore has been configured.
  has_ignore_label_ = this->layer_param_.loss_param().has_ignore_label();
  if (has_ignore_label_) {
    ignore_label_ = this->layer_param_.loss_param().ignore_label();
  }

  // The legacy boolean `normalize` is honoured only when the newer
  // `normalization` field is absent.
  if (!this->layer_param_.loss_param().has_normalization() &&
      this->layer_param_.loss_param().has_normalize()) {
    normalization_ = this->layer_param_.loss_param().normalize() ?
                     LossParameter_NormalizationMode_VALID :
                     LossParameter_NormalizationMode_BATCH_SIZE;
  } else {
    normalization_ = this->layer_param_.loss_param().normalization();
  }

  // Focal loss factors: loss = -alpha * (1 - pk)^gamma * ln(pk).
  alpha_ = this->layer_param_.focal_loss_param().alpha();
  gamma_ = this->layer_param_.focal_loss_param().gamma();
}
/**
 * @brief Reshapes the internal softmax and caches the softmax axis together
 *        with the element counts before (outer_num_) and after (inner_num_)
 *        that axis.
 *
 * Fails if the number of labels in bottom[1] differs from
 * outer_num_ * inner_num_. When a second top blob is present it is shaped
 * like bottom[0].
 */
template <typename Dtype>
void FocalLossLayer<Dtype>::Reshape(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  LossLayer<Dtype>::Reshape(bottom, top);
  softmax_layer_->Reshape(softmax_bottom_vec_, softmax_top_vec_);

  Blob<Dtype>* const scores = bottom[0];
  // Axis along which the classes are laid out.
  const int requested_axis = this->layer_param_.softmax_param().axis();
  softmax_axis_ = scores->CanonicalAxisIndex(requested_axis);
  outer_num_ = scores->count(0, softmax_axis_);
  inner_num_ = scores->count(softmax_axis_ + 1);

  CHECK_EQ(outer_num_ * inner_num_, bottom[1]->count())
      << "Number of labels must match number of predictions; "
      << "e.g., if softmax axis == 1 and prediction shape is (N, C, H, W), "
      << "label count (number of labels) must be N*H*W, "
      << "with integer values in {0, 1, ..., C-1}.";

  // Optional second output: the softmax probabilities.
  if (top.size() > 1) {
    top[1]->ReshapeLike(*scores);
  }
}
/**
 * @brief Returns the divisor applied to the summed loss / gradient.
 *
 * @param normalization_mode FULL: outer_num_ * inner_num_;
 *        VALID: valid_count, or outer_num_ * inner_num_ when valid_count
 *        is -1; BATCH_SIZE: outer_num_; NONE: 1. Any other value is fatal.
 * @param valid_count number of non-ignored predictions, or -1.
 * @return the selected normalizer, never smaller than 1.
 */
template <typename Dtype>
Dtype FocalLossLayer<Dtype>::get_normalizer(
    LossParameter_NormalizationMode normalization_mode, int valid_count) {
  Dtype denominator = Dtype(0);
  if (normalization_mode == LossParameter_NormalizationMode_FULL) {
    denominator = Dtype(outer_num_ * inner_num_);
  } else if (normalization_mode == LossParameter_NormalizationMode_VALID) {
    denominator = (valid_count == -1) ? Dtype(outer_num_ * inner_num_)
                                      : Dtype(valid_count);
  } else if (normalization_mode ==
             LossParameter_NormalizationMode_BATCH_SIZE) {
    denominator = Dtype(outer_num_);
  } else if (normalization_mode == LossParameter_NormalizationMode_NONE) {
    denominator = Dtype(1);
  } else {
    LOG(FATAL) << "Unknown normalization mode: "
        << LossParameter_NormalizationMode_Name(normalization_mode);
  }
  // A batch may contain no valid labels at all (e.g. a loss switched off in
  // a multi-task setup); clamping to 1 avoids a division by zero.
  return std::max(Dtype(1.0), denominator);
}
/**
 * @brief CPU forward pass: runs the internal softmax, then accumulates
 *        -alpha * (1 - pk)^gamma * ln(pk) over every non-ignored label,
 *        where pk is the softmax probability of the labelled class.
 *
 * The sum is divided by get_normalizer(normalization_, count) and stored in
 * top[0]. When two top blobs are present, top[1] shares the data of prob_.
 *
 * @param bottom bottom[0]: prediction scores; bottom[1]: labels.
 * @param top    top[0]: scalar loss; top[1] (optional): probabilities.
 */
template <typename Dtype>
void FocalLossLayer<Dtype>::Forward_cpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  // The forward pass computes the softmax prob values.
  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);
  const Dtype* prob_data = prob_.cpu_data();
  const Dtype* label = bottom[1]->cpu_data();
  // Number of probabilities per outer index (channels * inner_num_).
  const int dim = prob_.count() / outer_num_;
  int count = 0;  // number of labels that contributed to the loss
  Dtype loss = 0;
  for (int i = 0; i < outer_num_; ++i) {
    for (int j = 0; j < inner_num_; ++j) {
      const int label_value = static_cast<int>(label[i * inner_num_ + j]);
      if (has_ignore_label_ && label_value == ignore_label_) {
        continue;
      }
      DCHECK_GE(label_value, 0);
      DCHECK_LT(label_value, prob_.shape(softmax_axis_));
      // Clamp once and use the same pk in both factors (as the GPU kernel
      // does) so that ln(pk) stays finite.
      const Dtype pk = std::max(
          prob_data[i * dim + label_value * inner_num_ + j],
          Dtype(FLT_MIN));
      // std::pow / std::log keep the computation in Dtype precision; powf
      // would silently truncate to float when Dtype is double.
      loss -= alpha_ * std::pow(Dtype(1) - pk, gamma_) * std::log(pk);
      ++count;
    }
  }
  top[0]->mutable_cpu_data()[0] = loss / get_normalizer(normalization_, count);
  if (top.size() == 2) {
    top[1]->ShareData(prob_);
  }
}
/**
 * @brief CPU backward pass: writes d(loss)/d(score) into bottom[0]'s diff.
 *
 * With pk the (clamped) probability of the labelled class k and
 *   g = (1 - pk)^gamma - gamma * (1 - pk)^(gamma - 1) * pk * ln(pk),
 * the gradient of -alpha * (1 - pk)^gamma * ln(pk) w.r.t. score j is
 *   j == k : -alpha * g * (1 - pk)
 *   j != k :  alpha * g * pj
 * which is algebraically the same expression the layer used before, with
 * the shared factor g computed once per position instead of per channel.
 * Positions carrying the ignore label get a zero gradient. The result is
 * scaled by top[0]'s diff divided by the normalizer.
 *
 * @param top            top[0]'s diff holds the loss weight.
 * @param propagate_down propagate_down[1] must be false (fatal otherwise).
 * @param bottom         bottom[0]: scores (diff is written);
 *                       bottom[1]: labels.
 */
template <typename Dtype>
void FocalLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (!propagate_down[0]) {
    return;
  }

  Dtype* bottom_diff = bottom[0]->mutable_cpu_diff();
  const Dtype* prob_data = prob_.cpu_data();
  const Dtype* label = bottom[1]->cpu_data();
  const int dim = prob_.count() / outer_num_;
  const int num_channel = bottom[0]->shape(softmax_axis_);
  int count = 0;  // number of non-ignored positions

  for (int i = 0; i < outer_num_; ++i) {
    for (int j = 0; j < inner_num_; ++j) {
      const int label_value = static_cast<int>(label[i * inner_num_ + j]);
      const int base = i * dim + j;  // offset of channel 0 at this position

      if (has_ignore_label_ && label_value == ignore_label_) {
        for (int c = 0; c < num_channel; ++c) {
          bottom_diff[base + c * inner_num_] = 0;
        }
        continue;
      }
      DCHECK_GE(label_value, 0);
      DCHECK_LT(label_value, num_channel);
      ++count;

      const Dtype pk = std::max(prob_data[base + label_value * inner_num_],
                                Dtype(FLT_MIN));
      // Keep (1 - pk) strictly positive: with pk == 1 and gamma < 1,
      // pow(0, gamma - 1) is infinite and inf * ln(1) would produce NaN.
      const Dtype one_minus_pk = std::max(Dtype(1) - pk, Dtype(FLT_MIN));
      const Dtype log_pk = std::log(pk);
      const Dtype focal = std::pow(one_minus_pk, gamma_);
      const Dtype focal_slope =
          gamma_ * std::pow(one_minus_pk, gamma_ - 1) * pk * log_pk;
      // alpha * g, shared by every channel at this position.
      const Dtype scale = alpha_ * (focal - focal_slope);

      for (int c = 0; c < num_channel; ++c) {
        const int idx = base + c * inner_num_;
        if (c == label_value) {
          bottom_diff[idx] = -scale * one_minus_pk;  // j == k
        } else {
          const Dtype pj = std::max(prob_data[idx], Dtype(FLT_MIN));
          bottom_diff[idx] = scale * pj;             // j != k
        }
      }
    }
  }

  // Scale gradient by the loss weight and the normalizer.
  const Dtype loss_weight = top[0]->cpu_diff()[0] /
                            get_normalizer(normalization_, count);
  caffe_scal(prob_.count(), loss_weight, bottom_diff);
}
#ifdef CPU_ONLY
STUB_GPU(FocalLossLayer);
#endif
INSTANTIATE_CLASS(FocalLossLayer);
REGISTER_LAYER_CLASS(FocalLoss);
} // namespace caffe
4. 添加 focal_loss_layer.cu 文件
将 focal_loss_layer.cu
文件添加到路径 caffe_root/src/caffe/layers/
中:
#include <algorithm>
#include <cfloat>
#include <vector>
#include "caffe/layers/focal_loss_layer.hpp"
#include "caffe/util/math_functions.hpp"
namespace caffe {
/**
 * @brief Kernel computing the per-position focal loss.
 *
 * Each index in [0, nthreads) is split into n = index / spatial_dim and
 * s = index % spatial_dim. For that position the kernel writes
 * -alpha * (1 - pk)^gamma * ln(pk) into loss[index] and 1 into
 * counts[index], or 0 into both when the label equals the ignore label.
 *
 * @param nthreads    number of positions to process.
 * @param prob_data   softmax probabilities, indexed as
 *                    n * dim + channel * spatial_dim + s.
 * @param label       one label per position.
 * @param loss        output, one value per position.
 * @param num         unused by the kernel body.
 * @param dim         number of probabilities per n.
 * @param spatial_dim number of positions per n.
 * @param counts      output, 1 for counted positions and 0 for ignored ones.
 * @param alpha_      loss scale.
 * @param gamma_      exponent of the (1 - pk) factor.
 */
template <typename Dtype>
__global__ void FocalLossForwardGPU(const int nthreads,
    const Dtype* prob_data, const Dtype* label, Dtype* loss,
    const int num, const int dim, const int spatial_dim,
    const bool has_ignore_label_, const int ignore_label_,
    Dtype* counts, const Dtype alpha_, const Dtype gamma_) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index / spatial_dim;
    const int s = index % spatial_dim;
    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
    if (has_ignore_label_ && label_value == ignore_label_) {
      loss[index] = 0;
      counts[index] = 0;
    } else {
      // Clamp so that log(pk) stays finite.
      const Dtype pk = max(prob_data[n * dim + label_value * spatial_dim + s],
                           Dtype(FLT_MIN));
      // pow() is overloaded on Dtype; powf() would truncate a double Dtype
      // to float and disagree with the backward kernel, which uses pow().
      loss[index] = -alpha_ * pow(Dtype(1) - pk, gamma_) * log(pk);
      counts[index] = 1;
    }
  }
}
/**
 * @brief GPU forward pass: runs the internal softmax, launches
 *        FocalLossForwardGPU and sums the per-position losses into top[0].
 *
 * The per-position losses are staged in bottom[0]'s diff and the validity
 * flags in prob_'s diff. The valid-label count is only summed when the
 * normalization mode is VALID and an ignore label is configured; otherwise
 * -1 is passed to get_normalizer(). When two top blobs are present, top[1]
 * shares the data of prob_.
 */
template <typename Dtype>
void FocalLossLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  softmax_layer_->Forward(softmax_bottom_vec_, softmax_top_vec_);

  const Dtype* probabilities = prob_.gpu_data();
  const Dtype* labels = bottom[1]->gpu_data();
  const int per_sample_size = prob_.count() / outer_num_;
  const int num_positions = outer_num_ * inner_num_;
  // Scratch buffers borrowed from the diff side of existing blobs.
  Dtype* per_position_loss = bottom[0]->mutable_gpu_diff();
  Dtype* valid_flags = prob_.mutable_gpu_diff();

  // NOLINT_NEXT_LINE(whitespace/operators)
  FocalLossForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(num_positions),
      CAFFE_CUDA_NUM_THREADS>>>(num_positions, probabilities, labels,
      per_position_loss, outer_num_, per_sample_size, inner_num_,
      has_ignore_label_, ignore_label_, valid_flags, alpha_, gamma_);

  Dtype total_loss;
  caffe_gpu_asum(num_positions, per_position_loss, &total_loss);

  Dtype valid_count = -1;
  const bool needs_valid_count =
      has_ignore_label_ &&
      normalization_ == LossParameter_NormalizationMode_VALID;
  if (needs_valid_count) {
    caffe_gpu_asum(num_positions, valid_flags, &valid_count);
  }

  const Dtype normalizer = get_normalizer(normalization_, valid_count);
  top[0]->mutable_cpu_data()[0] = total_loss / normalizer;
  if (top.size() == 2) {
    top[1]->ShareData(prob_);
  }
}
/**
 * @brief Kernel computing the focal loss gradient w.r.t. the scores.
 *
 * Each index in [0, nthreads) is split into n = index / spatial_dim and
 * s = index % spatial_dim. With pk the (clamped) probability of the
 * labelled class k and
 *   g = (1 - pk)^gamma - gamma * (1 - pk)^(gamma - 1) * pk * ln(pk),
 * the kernel writes, for every channel j at that position,
 *   j == k : -alpha * g * (1 - pk)
 *   j != k :  alpha * g * pj
 * and sets counts[index] to 1. Positions carrying the ignore label get a
 * zero gradient and counts[index] = 0.
 *
 * @param nthreads    number of positions to process.
 * @param prob_data   softmax probabilities, indexed as
 *                    n * dim + channel * spatial_dim + s.
 * @param label       one label per position.
 * @param bottom_diff output gradient, same indexing as prob_data.
 * @param num         unused by the kernel body.
 * @param dim         number of probabilities per n.
 * @param spatial_dim number of positions per n.
 * @param counts      output, 1 for counted positions and 0 for ignored ones.
 * @param alpha_      loss scale.
 * @param gamma_      exponent of the (1 - pk) factor.
 */
template <typename Dtype>
__global__ void FocalLossBackwardGPU(const int nthreads, const Dtype* prob_data,
    const Dtype* label, Dtype* bottom_diff, const int num, const int dim,
    const int spatial_dim, const bool has_ignore_label_,
    const int ignore_label_, Dtype* counts, const Dtype alpha_,
    const Dtype gamma_) {
  const int channels = dim / spatial_dim;
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index / spatial_dim;
    const int s = index % spatial_dim;
    const int base = n * dim + s;  // offset of channel 0 at this position
    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
    if (has_ignore_label_ && label_value == ignore_label_) {
      for (int c = 0; c < channels; ++c) {
        bottom_diff[base + c * spatial_dim] = 0;
      }
      counts[index] = 0;
    } else {
      const Dtype pk = max(prob_data[base + label_value * spatial_dim],
                           Dtype(FLT_MIN));
      // Keep (1 - pk) strictly positive: with pk == 1 and gamma < 1,
      // pow(0, gamma - 1) is infinite and inf * log(1) would produce NaN.
      const Dtype one_minus_pk = max(Dtype(1) - pk, Dtype(FLT_MIN));
      const Dtype log_pk = log(pk);
      const Dtype focal = pow(one_minus_pk, gamma_);
      const Dtype focal_slope =
          gamma_ * pow(one_minus_pk, gamma_ - 1) * pk * log_pk;
      // alpha * g, shared by every channel at this position.
      const Dtype scale = alpha_ * (focal - focal_slope);
      for (int c = 0; c < channels; ++c) {
        const int idx = base + c * spatial_dim;
        if (c == label_value) {
          bottom_diff[idx] = -scale * one_minus_pk;  // j == k
        } else {
          const Dtype pj = max(prob_data[idx], Dtype(FLT_MIN));
          bottom_diff[idx] = scale * pj;             // j != k
        }
      }
      counts[index] = 1;
    }
  }
}
/**
 * @brief GPU backward pass: launches FocalLossBackwardGPU to fill
 *        bottom[0]'s diff, then scales it by top[0]'s diff divided by the
 *        normalizer.
 *
 * @param top            top[0]'s diff holds the loss weight.
 * @param propagate_down propagate_down[1] must be false (fatal otherwise);
 *                       nothing is computed unless propagate_down[0] is set.
 * @param bottom         bottom[0]: scores (diff is written);
 *                       bottom[1]: labels.
 */
template <typename Dtype>
void FocalLossLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (propagate_down[0]) {
    Dtype* bottom_diff = bottom[0]->mutable_gpu_diff();
    const Dtype* prob_data = prob_.gpu_data();
    const Dtype* label = bottom[1]->gpu_data();
    const int dim = prob_.count() / outer_num_;
    const int nthreads = outer_num_ * inner_num_;
    // Since this memory is never used for anything else,
    // we use it to avoid allocating new GPU memory.
    Dtype* counts = prob_.mutable_gpu_diff();
    // NOLINT_NEXT_LINE(whitespace/operators)
    FocalLossBackwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
        CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, bottom_diff,
        outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts,
        alpha_, gamma_);
    Dtype valid_count = -1;
    // Only launch another CUDA kernel if we actually need the count of valid
    // outputs.
    if (normalization_ == LossParameter_NormalizationMode_VALID &&
        has_ignore_label_) {
      caffe_gpu_asum(nthreads, counts, &valid_count);
    }
    const Dtype loss_weight = top[0]->cpu_diff()[0] /
                              get_normalizer(normalization_, valid_count);
    caffe_gpu_scal(prob_.count(), loss_weight, bottom_diff);
  }
}
INSTANTIATE_LAYER_GPU_FUNCS(FocalLossLayer);
} // namespace caffe
5. FocalLossLayer 用法
在 caffe 编译完成后,FocalLossLayer 在 prototxt
文件中的用法为:
# Focal loss computed from the class scores "conv_cls" and the labels "label".
layer {
name: "focal_loss"
type: "FocalLoss"
bottom: "conv_cls"
bottom: "label"
top: "loss"
# Include this layer in the TRAIN phase only.
include {
phase: TRAIN
}
loss_param {
# Predictions whose label is 255 are skipped by the loss.
ignore_label: 255
}
# loss = -alpha * (1 - pk)^gamma * ln(pk)
focal_loss_param {
alpha: 0.25
gamma: 2.00
}
}
5 comments
这个focalloss有反向传播吗?请问
有啊,Backward_gpu
总是迭代几步就出现QNAN是什么问题
我是遇到自己的知识盲点,进行记录备忘了. 因为目的就是自己个人的记录和学习,所以更新也是看自己的情况