这里采用从图像直接读取图片数据和标签的方式进行多标签分类: 基于 VGG16 backbone 网络, 修改 ImageDataLayer 以读取图片和多标签数据, 实现 multi-task 输出. 类似的问题还有图像自动标注.

<h2>1. 问题描述</h2>

假定每张图片具有 N 个标签(本文N=3),分别为 label1, label2, label3,...,labelN.
数据集 train.txt 内容格式如下:

img1.jpg 5 6 7 ... 8
img2.jpg 1 2 3 ... 6
img3.jpg 7 8 9 ... 10
......

以 train.txt 第一行为例, img1.jpg 为图像名,5 为在label1 所对应的索引,6 为在label2 所对应的索引,7 为在label3 所对应的索引,8 为 在labelN 所对应的索引(均为从 0 开始).

<h2>2. 解决方案</h2>

由于 Caffe 官方提供的 ImageDataLayer 只能读取单个 label,因此这里参考 ImageDataLayer 添加新的网络数据读取层 —— ImageMultilabelDataLayer.

Caffe 添加网络数据层所涉及主要步骤如下:

  • 添加 hpp 头文件,如 include/caffe/layers/image_multilabel_data_layer.hpp
  • 添加 cpp 实现文件,如 src/caffe/layers/image_multilabel_data_layer.cpp
  • 在 src/caffe/proto/caffe.proto 文件中添加 Layer 对应的参数 message 及其字段

<h3>2.1 添加 image_multilabel_data_layer.hpp</h3>

#ifndef CAFFE_IMAGE_MULTILABEL_DATA_LAYER_HPP_
#define CAFFE_IMAGE_MULTILABEL_DATA_LAYER_HPP_

#include <string>
#include <utility>
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/data_transformer.hpp"
#include "caffe/internal_thread.hpp"
#include "caffe/layer.hpp"
#include "caffe/layers/base_data_layer.hpp"
#include "caffe/proto/caffe.pb.h"

namespace caffe {

/**
 * @brief Provides image data plus a fixed-length vector of integer labels per
 *        image to the Net, reading both from a plain-text list file.
 *
 * Each line of the list file has the form
 *   <image path> <label_1> <label_2> ... <label_N>
 * where N is image_multilabel_data_param.label_dim. top[0] receives the image
 * batch and top[1] receives the labels with shape (batch_size, label_dim).
 */
template <typename Dtype>
class ImageMultilabelDataLayer : public BasePrefetchingDataLayer<Dtype> {
 public:
  explicit ImageMultilabelDataLayer(const LayerParameter& param)
      : BasePrefetchingDataLayer<Dtype>(param) {}
  virtual ~ImageMultilabelDataLayer();
  // Parses the list file and shapes the top blobs (data and labels).
  virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top);

  // Reported layer type; kept identical to the name given to
  // REGISTER_LAYER_CLASS and to the `type:` string used in the prototxt files.
  virtual inline const char* type() const { return "ImageMultilabelData"; }
  // A data layer consumes no bottom blobs.
  virtual inline int ExactNumBottomBlobs() const { return 0; }
  // Two tops: the image batch and the (batch_size, label_dim) label blob.
  virtual inline int ExactNumTopBlobs() const { return 2; }

 protected:
  // Random generator used only for shuffling the list of images.
  shared_ptr<Caffe::RNG> prefetch_rng_;
  // Randomly permutes lines_.
  virtual void ShuffleImages();
  // Fills one prefetch batch with images and labels.
  virtual void load_batch(Batch<Dtype>* batch);

  // One entry per line of the list file: the image path and a pointer to an
  // array of label_dim integer labels (allocated with new[] in
  // DataLayerSetUp). The element type is int* because the implementation
  // stores and indexes a label array, not a single int.
  vector<std::pair<std::string, int*> > lines_;
  // Index into lines_ of the next image to load.
  int lines_id_;
};


}  // namespace caffe

#endif  // CAFFE_IMAGE_MULTILABEL_DATA_LAYER_HPP_

<h3>2.2 添加 image_multilabel_data_layer.cpp</h3>

#ifdef USE_OPENCV
#include <opencv2/core/core.hpp>

#include <fstream>  // NOLINT(readability/streams)
#include <iostream>  // NOLINT(readability/streams)
#include <string>
#include <utility>
#include <vector>

#include "caffe/data_transformer.hpp"
#include "caffe/layers/base_data_layer.hpp"
#include "caffe/layers/image_multilabel_data_layer.hpp"
#include "caffe/util/benchmark.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/rng.hpp"

namespace caffe {

// Stops the prefetch thread and then releases the per-image label arrays.
template <typename Dtype>
ImageMultilabelDataLayer<Dtype>::~ImageMultilabelDataLayer() {
  // The prefetch thread reads lines_, so it must be stopped before the label
  // arrays are released.
  this->StopInternalThread();
  // Every entry's label array was allocated with new[] in DataLayerSetUp and
  // is owned by this layer; without this loop the arrays are leaked.
  for (size_t i = 0; i < lines_.size(); ++i) {
    delete[] lines_[i].second;
    lines_[i].second = NULL;
  }
}

// Reads the "<image> <label_1> ... <label_N>" list file into lines_, optionally
// shuffles / skips entries, and shapes top[0] (images) and top[1] (labels,
// shape batch_size x label_dim) together with the prefetch buffers.
template <typename Dtype>
void ImageMultilabelDataLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const ImageMultilabelDataParameter& param =
      this->layer_param_.image_multilabel_data_param();
  // Target size handed to ReadImageToCVMat; 0 x 0 keeps the image as stored.
  const int new_height = param.new_height();
  const int new_width  = param.new_width();
  const bool is_color  = param.is_color();
  // Prefix prepended to every image path found in the list file.
  const string root_folder = param.root_folder();

  // Either both dimensions are given or neither is; the two values do not
  // have to be equal to each other.
  CHECK((new_height == 0 && new_width == 0) ||
      (new_height > 0 && new_width > 0)) << "Current implementation requires "
      "new_height and new_width to be set at the same time.";

  // Open the list file holding image names and their labels.
  const string& source = param.source();
  LOG(INFO) << "Opening file " << source;
  std::ifstream infile(source.c_str());
  CHECK(infile.good()) << "Could not open list file " << source;

  // label_dim is the number of labels stored per image, e.g. for a person
  // annotated with age / gender / skin colour, label_dim = 3.
  const int label_dim = param.label_dim();
  CHECK_GT(label_dim, 0) << "label_dim must be positive";

  // Every record is an image name followed by exactly label_dim integers.
  string filename;
  while (infile >> filename) {
    int* labels = new int[label_dim];
    for (int i = 0; i < label_dim; ++i) {
      // Fail loudly on a short or malformed record instead of storing
      // uninitialised label values.
      CHECK(infile >> labels[i]) << "Expected " << label_dim
          << " integer labels for " << filename << " in " << source;
    }
    lines_.push_back(std::make_pair(filename, labels));
  }

  CHECK(!lines_.empty()) << "File is empty";

  if (param.shuffle()) {
    // Randomly permute the order of the records.
    LOG(INFO) << "Shuffling data";
    const unsigned int prefetch_rng_seed = caffe_rng_rand();
    prefetch_rng_.reset(new Caffe::RNG(prefetch_rng_seed));
    ShuffleImages();
  } else {
    if (this->phase_ == TRAIN && Caffe::solver_rank() > 0 &&
        param.rand_skip() == 0) {
      LOG(WARNING) << "Shuffling or skipping recommended for multi-GPU";
    }
  }
  LOG(INFO) << "A total of " << lines_.size() << " images.";

  lines_id_ = 0;
  // Check if we would need to randomly skip a few data points
  if (param.rand_skip()) {
    unsigned int skip = caffe_rng_rand() % param.rand_skip();
    LOG(INFO) << "Skipping first " << skip << " data points.";
    CHECK_GT(lines_.size(), skip) << "Not enough points to skip";
    lines_id_ = skip;
  }
  // Read an image, and use it to initialize the top blob.
  cv::Mat cv_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
                                    new_height, new_width, is_color);
  CHECK(cv_img.data) << "Could not load " << lines_[lines_id_].first;
  // Use data_transformer to infer the expected blob shape from a cv_image.
  vector<int> top_shape = this->data_transformer_->InferBlobShape(cv_img);
  this->transformed_data_.Reshape(top_shape);
  // Reshape prefetch_data and top[0] according to the batch_size.
  const int batch_size = param.batch_size();
  CHECK_GT(batch_size, 0) << "Positive batch size required";
  top_shape[0] = batch_size;
  for (int i = 0; i < this->prefetch_.size(); ++i) {
    this->prefetch_[i]->data_.Reshape(top_shape);
  }
  top[0]->Reshape(top_shape);

  LOG(INFO) << "output data size: " << top[0]->num() << ","
      << top[0]->channels() << "," << top[0]->height() << ","
      << top[0]->width();
  // The label blob carries label_dim values per image:
  // shape (batch_size, label_dim).
  vector<int> label_shape(2);
  label_shape[0] = batch_size;
  label_shape[1] = label_dim;
  top[1]->Reshape(label_shape);
  for (int i = 0; i < this->prefetch_.size(); ++i) {
    this->prefetch_[i]->label_.Reshape(label_shape);
  }
}

// Randomly permutes the (image path, labels) records in lines_ using the
// layer's own prefetch random generator.
template <typename Dtype>
void ImageMultilabelDataLayer<Dtype>::ShuffleImages() {
  shuffle(lines_.begin(), lines_.end(),
          static_cast<caffe::rng_t*>(prefetch_rng_->generator()));
}

// Fills one prefetch batch: loads batch_size images starting at lines_id_,
// runs them through the data transformer into batch->data_, and copies each
// image's label_dim labels into batch->label_. Wraps around (and reshuffles if
// requested) when the end of the list is reached.
// This function is called on prefetch thread
template <typename Dtype>
void ImageMultilabelDataLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
  CPUTimer batch_timer;
  batch_timer.Start();
  double read_time = 0;
  double trans_time = 0;
  CPUTimer timer;
  CHECK(batch->data_.count());
  CHECK(this->transformed_data_.count());
  // Bind by reference: copying the whole parameter message per batch is
  // unnecessary work on the prefetch thread.
  const ImageMultilabelDataParameter& param =
      this->layer_param_.image_multilabel_data_param();
  const int batch_size = param.batch_size();
  const int label_dim = param.label_dim();
  const int new_height = param.new_height();
  const int new_width = param.new_width();
  const bool is_color = param.is_color();
  const bool reshuffle = param.shuffle();
  const string& root_folder = param.root_folder();

  // Reshape according to the first image of each batch
  // on single input batches allows for inputs of varying dimension.
  cv::Mat first_img = ReadImageToCVMat(root_folder + lines_[lines_id_].first,
      new_height, new_width, is_color);
  CHECK(first_img.data) << "Could not load " << lines_[lines_id_].first;
  // Use data_transformer to infer the expected blob shape from the image.
  vector<int> top_shape = this->data_transformer_->InferBlobShape(first_img);
  this->transformed_data_.Reshape(top_shape);
  // Reshape batch according to the batch_size.
  top_shape[0] = batch_size;
  batch->data_.Reshape(top_shape);

  // The label buffer is shaped in DataLayerSetUp; guard the raw pointer
  // writes below against a mismatched buffer size.
  CHECK_EQ(batch->label_.count(), batch_size * label_dim)
      << "Label blob does not hold batch_size x label_dim values";

  Dtype* prefetch_data = batch->data_.mutable_cpu_data();
  Dtype* prefetch_label = batch->label_.mutable_cpu_data();

  const int lines_size = lines_.size();
  for (int item_id = 0; item_id < batch_size; ++item_id) {
    // Load the image for this batch slot.
    timer.Start();
    CHECK_GT(lines_size, lines_id_);
    const std::pair<std::string, int*>& record = lines_[lines_id_];
    cv::Mat item_img = ReadImageToCVMat(root_folder + record.first,
        new_height, new_width, is_color);
    CHECK(item_img.data) << "Could not load " << record.first;
    read_time += timer.MicroSeconds();
    timer.Start();
    // Apply transformations (mirror, crop...) to the image, writing straight
    // into this item's slice of the batch buffer.
    const int offset = batch->data_.offset(item_id);
    this->transformed_data_.set_cpu_data(prefetch_data + offset);
    this->data_transformer_->Transform(item_img, &(this->transformed_data_));
    trans_time += timer.MicroSeconds();

    // Copy this image's label vector; record.second points to label_dim ints.
    Dtype* item_label = prefetch_label + item_id * label_dim;
    for (int i = 0; i < label_dim; ++i) {
      item_label[i] = static_cast<Dtype>(record.second[i]);
    }

    // go to the next iter
    lines_id_++;
    if (lines_id_ >= lines_size) {
      // We have reached the end. Restart from the first.
      DLOG(INFO) << "Restarting data prefetching from start.";
      lines_id_ = 0;
      if (reshuffle) {
        ShuffleImages();
      }
    }
  }
  batch_timer.Stop();
  DLOG(INFO) << "Prefetch batch: " << batch_timer.MilliSeconds() << " ms.";
  DLOG(INFO) << "     Read time: " << read_time / 1000 << " ms.";
  DLOG(INFO) << "Transform time: " << trans_time / 1000 << " ms.";
}

// Emit the template instantiations of the layer.
// NOTE(review): presumably for float and double, as with other Caffe layers — confirm.
INSTANTIATE_CLASS(ImageMultilabelDataLayer);
// Register the layer under the name "ImageMultilabelData"; this is the string
// the prototxt files use in their `type:` field.
REGISTER_LAYER_CLASS(ImageMultilabelData);

}  // namespace caffe
#endif  // USE_OPENCV

<h3>2.3 添加 ImageMultilabelDataParameter 到 caffe.proto</h3>

message LayerParameter 中添加 ImageMultilabelDataParameter:

optional ImageMultilabelDataParameter image_multilabel_data_param = 147; //如果冲突,147更改为其它可用ID

新增 ImageMultilabelDataParameter:

// Parameters of ImageMultilabelDataLayer.
message ImageMultilabelDataParameter {
  // Path of the list file; each line is "<image> <label_1> ... <label_N>".
  optional string source = 1;
  // Number of images per batch.
  optional uint32 batch_size = 4 [default = 1];
  // If non-zero, the layer starts at a random offset in [0, rand_skip).
  optional uint32 rand_skip = 7 [default = 0];
  // Shuffle the list at set-up and again each time the end is reached.
  optional bool shuffle = 8 [default = false];
  // Size passed to ReadImageToCVMat; must be both zero or both positive.
  optional uint32 new_height = 9 [default = 0];
  optional uint32 new_width = 10 [default = 0];
  // Passed to ReadImageToCVMat as the is_color flag.
  optional bool is_color = 11 [default = true];
  // NOTE(review): scale, mean_file, crop_size and mirror are never read by
  // ImageMultilabelDataLayer; setting them here has no effect. Put the
  // equivalent settings in the layer's transform_param instead.
  optional float scale = 2 [default = 1];
  optional string mean_file = 3;
  optional uint32 crop_size = 5 [default = 0];
  optional bool mirror = 6 [default = false];
  // Prefix prepended to every image path from the list file.
  optional string root_folder = 12 [default = ""];
  // Number of integer labels stored per image (N in the list-file format).
  optional uint32 label_dim = 13 [default = 1]; 
}

<h3>2.4 编译并测试</h3>

修改完成后,重新编译Caffe.

# Rebuild Caffe from scratch so the new layer and the modified caffe.proto
# are compiled in.
cd /path/to/caffe_multilabel/
make clean && make all -j4
make test -j4
# Build the Python bindings imported by the test scripts below.
make pycaffe

成功后可以定义一个简单的网络数据读取层,验证是否正确读取图片数据标签.

train_multilabel_test.prototxt 定义:

name: "multilabel-datalayer"
layer {
  name: "data"
  type: "ImageMultilabelData"
  top: "data"
  top: "label"
  # Mirroring is applied by the data transformer, so it must be set here;
  # ImageMultilabelDataLayer never reads image_multilabel_data_param.mirror.
  transform_param {
    mirror: true
    mean_value: 128
    mean_value: 128
    mean_value: 128
  }
  image_multilabel_data_param {
    source: "/path/to/train.txt"
    root_folder: "/path/to/images/"
    new_height: 224
    new_width: 224
    batch_size: 32
    shuffle: true
    # Number of labels per image in train.txt.
    label_dim: 3
  }
}
# Split the (batch_size, 3) label blob along axis 1 into three
# (batch_size, 1) blobs, one per label group.
layer {
  name: "slice"
  type: "Slice"
  bottom: "label"
  top: "label1"
  top: "label2"
  top: "label3"
  slice_param {
    axis: 1
    # Cut after column 0 and after column 1.
    slice_point:1
    slice_point:2
  }
}

train_multilabel_solver.prototxt 定义:

# Solver used only to smoke-test the data layer defined in
# train_multilabel_test.prototxt.
net: "/path/to/train_multilabel_test.prototxt"
test_iter: 100
test_interval: 100
base_lr: 0.001
lr_policy: "step"
gamma: 0.1
stepsize: 1000
display: 20
max_iter: 10000
momentum: 0.9
weight_decay: 0.0005
snapshot: 1000
snapshot_prefix: "/path/to/out/caffemodel"
solver_mode: GPU

data_read_test.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Smoke test for ImageMultilabelDataLayer.

Runs a single solver step so the data layer produces one batch, then shows the
first image of that batch together with its label vector.
"""
import numpy as np
import matplotlib.pyplot as plt

import sys
caffe_root = '/path/to/caffe-multilabel/'
sys.path.insert(0, caffe_root + 'python')
import caffe

caffe.set_mode_gpu()
caffe.set_device(0)
# caffe.set_mode_cpu()

# Per-channel mean subtracted by transform_param in train_multilabel_test.prototxt.
MEAN_VALUE = 128.0

print('Start...')
# Solver that points at train_multilabel_test.prototxt.
solver_def = '/path/to/train_multilabel_solver.prototxt'

solver = caffe.SGDSolver(solver_def)
solver.step(1)  # one step is enough to fill the 'data' and 'label' blobs

data = solver.net.blobs['data'].data
labels = solver.net.blobs['label'].data

# Undo the preprocessing for display: CHW -> HWC, add the mean back,
# then reverse the channel order.
# NOTE(review): assumes the layer delivers BGR (OpenCV order) — confirm.
img = np.transpose(data[0], (1, 2, 0)) + MEAN_VALUE
img = np.clip(img[:, :, ::-1], 0, 255).astype(np.uint8)
gt = labels[0]

print('labels of first image: %s' % gt)
plt.imshow(img)
plt.title('labels: %s' % gt)
plt.show()

print('Done.')

<h2>3. 基于VGG16多标签分类</h2>

<h3>3.1 train_val.prototxt</h3>

name: "vgg16-multilabel"
# Training input: images plus a (batch_size, 3) label blob.
layer {
  name: "data"
  type: "ImageMultilabelData"
  top: "data"
  top: "label"
  include {
    phase: TRAIN
  }
  # Mirroring is applied by the data transformer, so it must be set here;
  # ImageMultilabelDataLayer never reads image_multilabel_data_param.mirror.
  transform_param {
    mirror: true
    mean_value: 128
    mean_value: 128
    mean_value: 128
  }
  image_multilabel_data_param {
    source: "/path/to/train.txt"
    root_folder: "/path/to/images/"
    new_height: 224
    new_width: 224
    batch_size: 32
    shuffle: true
    # Number of labels per image in train.txt.
    label_dim: 3
  }
}
# Test input: same format as the training layer, no augmentation, fixed order.
layer {
  name: "data"
  type: "ImageMultilabelData"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  # Augmentation flags belong to the data transformer; the layer never reads
  # image_multilabel_data_param.mirror.
  transform_param {
    mirror: false
    mean_value: 128
    mean_value: 128
    mean_value: 128
  }
  image_multilabel_data_param {
    source: "/path/to/test.txt"
    root_folder: "/path/to/images/"
    new_height: 224
    new_width: 224
    batch_size: 4
    shuffle: false
    # Number of labels per image in test.txt.
    label_dim: 3
  }
}

##### vgg16 layers ####
......
####  labels  ##################
# Split the (batch_size, 3) label blob along axis 1 into three
# (batch_size, 1) blobs, one per classification head.
layer {
  name: "slice"
  type: "Slice"
  bottom: "label"
  top: "label1"
  top: "label2"
  top: "label3"
  slice_param {
    axis: 1
    # Cut after column 0 and after column 1.
    slice_point:1
    slice_point:2
  }
}

####  label1  #################
# Classification head for label1: fc7 -> fc8_label1 -> accuracy (TEST only) and loss.
layer {
  bottom: "fc7"
  top: "fc8_label1"
  name: "fc8_label1"
  type: "InnerProduct"
  # Weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Bias.
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: number1 # placeholder: replace with the number of classes in label1
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "accuracy_label1"
  type: "Accuracy"
  bottom: "fc8_label1"
  bottom: "label1"
  top: "accuracy_label1"
  accuracy_param {
    top_k: 1
    # NOTE(review): label indices start at 0 in train.txt, so ignore_label: 0
    # excludes every sample of class 0. Keep only if index 0 is reserved for
    # "unlabelled" — confirm.
    ignore_label: 0
  }
  include {
    phase: TEST
  }
}
layer {
  bottom: "fc8_label1"
  bottom: "label1"
  top: "loss_label1"
  name: "loss_label1"
  type: "SoftmaxWithLoss"
  loss_param{
    # NOTE(review): same caveat as above — samples whose label1 is 0 add no loss.
    ignore_label: 0
  }
}

####  label2  #################
# Classification head for label2: fc7 -> fc8_label2 -> accuracy (TEST only) and loss.
layer {
  bottom: "fc7"
  top: "fc8_label2"
  name: "fc8_label2"
  type: "InnerProduct"
  # Weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Bias.
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: number2 # placeholder: replace with the number of classes in label2
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "accuracy_label2"
  type: "Accuracy"
  bottom: "fc8_label2"
  bottom: "label2"
  top: "accuracy_label2"
  accuracy_param {
    top_k: 1
    # NOTE(review): label indices start at 0 in train.txt, so ignore_label: 0
    # excludes every sample of class 0. Keep only if index 0 is reserved for
    # "unlabelled" — confirm.
    ignore_label: 0
  }
  include {
    phase: TEST
  }
}
layer {
  bottom: "fc8_label2"
  bottom: "label2"
  top: "loss_label2"
  name: "loss_label2"
  type: "SoftmaxWithLoss"
  loss_param{
    # NOTE(review): same caveat as above — samples whose label2 is 0 add no loss.
    ignore_label: 0
  }
}

####  label3  #################
# Classification head for label3: fc7 -> fc8_label3 -> accuracy (TEST only) and loss.
layer {
  bottom: "fc7"
  top: "fc8_label3"
  name: "fc8_label3"
  type: "InnerProduct"
  # Weights.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  # Bias.
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: number3 # placeholder: replace with the number of classes in label3
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "accuracy_label3"
  type: "Accuracy"
  bottom: "fc8_label3"
  bottom: "label3"
  top: "accuracy_label3"
  accuracy_param {
    top_k: 1
    # NOTE(review): label indices start at 0 in train.txt, so ignore_label: 0
    # excludes every sample of class 0. Keep only if index 0 is reserved for
    # "unlabelled" — confirm.
    ignore_label: 0
  }
  include {
    phase: TEST
  }
}
layer {
  bottom: "fc8_label3"
  bottom: "label3"
  top: "loss_label3"
  name: "loss_label3"
  type: "SoftmaxWithLoss"
  loss_param{
    # NOTE(review): same caveat as above — samples whose label3 is 0 add no loss.
    ignore_label: 0
  }
}

<h3>3.2 solver.prototxt</h3>

# Solver for the VGG16 multi-label network defined in train_val.prototxt.
net: "/path/to/train_val.prototxt"
test_iter: 1000
test_interval: 20000
base_lr: 0.001
lr_policy: "step"
gamma: 0.1
stepsize: 50000
display: 20
max_iter: 200000
momentum: 0.9
weight_decay: 0.0005
snapshot: 10000
# The prefix includes the file-name stem so snapshots are written as
# multilabel_vgg16_iter_<N>.caffemodel, the name deploy.py loads.
snapshot_prefix: "/path/to/out/caffemodel/multilabel_vgg16"
solver_mode: GPU

<h3>3.3 deploy.prototxt</h3>

name: "vgg16-multilabel"
input: "data"
input_dim: 1
input_dim: 3
input_dim: 224
input_dim: 224

##### vgg16 layers ####
......
####  label1  #################
# Inference head for label1: fc7 -> fc8_label1 -> softmax probabilities.
layer {
  bottom: "fc7"
  top: "fc8_label1"
  name: "fc8_label1"
  type: "InnerProduct"
  # The param and filler settings below are the same values as in train_val.prototxt.
  # NOTE(review): presumably irrelevant once weights come from the .caffemodel — confirm.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: number1 # placeholder: number of classes in label1 (must match training)
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
# Output blob read by deploy.py as outputs['prob_label1'].
layer {
  bottom: "fc8_label1"
  top: "prob_label1"
  name: "prob_label1"
  type: "Softmax"
}


####  label2  #################
# Inference head for label2: fc7 -> fc8_label2 -> softmax probabilities.
layer {
  bottom: "fc7"
  top: "fc8_label2"
  name: "fc8_label2"
  type: "InnerProduct"
  # The param and filler settings below are the same values as in train_val.prototxt.
  # NOTE(review): presumably irrelevant once weights come from the .caffemodel — confirm.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: number2 # placeholder: number of classes in label2 (must match training)
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
# Output blob read by deploy.py as outputs['prob_label2'].
layer {
  bottom: "fc8_label2"
  top: "prob_label2"
  name: "prob_label2"
  type: "Softmax"
}


####  label3  #################
# Inference head for label3: fc7 -> fc8_label3 -> softmax probabilities.
layer {
  bottom: "fc7"
  top: "fc8_label3"
  name: "fc8_label3"
  type: "InnerProduct"
  # The param and filler settings below are the same values as in train_val.prototxt.
  # NOTE(review): presumably irrelevant once weights come from the .caffemodel — confirm.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: number3 # placeholder: number of classes in label3 (must match training)
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
# Output blob read by deploy.py as outputs['prob_label3'].
layer {
  bottom: "fc8_label3"
  top: "prob_label3"
  name: "prob_label3"
  type: "Softmax"
}

<h3>3.4 多标签分类部署 deploy.py</h3>

#!/usr/bin/env python
# ---- coding: utf-8 ----
import numpy as np
from PIL import Image
import scipy.misc
import matplotlib.pyplot as plt

import sys
caffe_root = '/path/to/caffe-multilabel/'
sys.path.insert(0, caffe_root + 'python')
import caffe

caffe.set_mode_gpu()
caffe.set_device(0)
# caffe.set_mode_cpu()

class SimpleTransformer(object):
    """Convert images between display form and the network's input form.

    Display form is an H x W x 3 RGB array; network form is a 3 x H x W BGR
    float32 array with the per-channel mean subtracted and a scale applied.
    """

    def __init__(self, mean=(128, 128, 128)):
        """Store the per-channel mean (BGR order) and a unit scale.

        The default is a tuple rather than a list so that no mutable object is
        shared between instances.
        """
        self.mean = np.array(mean, dtype=np.float32)
        self.scale = 1.0

    def set_mean(self, mean):
        """Set the per-channel mean to subtract for centering the data.

        The value is converted to a float32 array, like in __init__, so the
        arithmetic in preprocess()/deprocess() stays in float32.
        """
        self.mean = np.array(mean, dtype=np.float32)

    def set_scale(self, scale):
        """Set the factor the mean-centered data is multiplied by."""
        self.scale = scale

    def preprocess(self, im):
        """Turn an H x W x 3 RGB image into a 3 x H x W BGR float32 blob.

        Emulates the pre-processing in the vgg16 caffe prototxt: mean
        subtraction followed by scaling. The input array is not modified.
        """
        bgr = np.array(im, dtype=np.float32)[:, :, ::-1]  # copy, then RGB -> BGR
        bgr = (bgr - self.mean) * self.scale
        return bgr.transpose((2, 0, 1))  # HWC -> CHW

    def deprocess(self, im):
        """Inverse of preprocess(): 3 x H x W blob -> H x W x 3 uint8 RGB image.

        Works on a copy: transpose() returns a view, so in-place arithmetic on
        it would silently overwrite the caller's blob.
        """
        hwc = np.array(im, dtype=np.float32).transpose(1, 2, 0)  # copy, CHW -> HWC
        hwc = hwc / self.scale + self.mean
        rgb = hwc[:, :, ::-1]  # BGR -> RGB
        # Round and clip so out-of-range values saturate instead of wrapping
        # around in the uint8 cast.
        return np.uint8(np.clip(np.rint(rgb), 0, 255))


def _class_name(names, index):
    """Return names[index], or the bare index when no name table is given."""
    return names[index] if names else index


if __name__ == '__main__':
    print('Start...')

    # Optional index -> class-name tables, one per label group. Fill these in
    # with the class names of your dataset; while left as None the predicted
    # class index is printed instead.
    label1_names = None
    label2_names = None
    label3_names = None

    test_image = '/home/sh/Pictures/upper/10.jpg'
    # convert('RGB') guarantees three channels for grey/RGBA inputs, and the
    # resize matches the 224 x 224 input declared in deploy.prototxt.
    # PIL is used because scipy.misc.imresize has been removed from SciPy.
    pil_im = Image.open(test_image).convert('RGB').resize((224, 224), Image.BILINEAR)
    im = np.asarray(pil_im)

    model_def = '/path/to/deploy.prototxt'
    weight_def = '/path/to/multilabel_vgg16_iter_100000.caffemodel'
    net = caffe.Net(model_def, weight_def, caffe.TEST)

    transformer = SimpleTransformer()
    transformed_image = transformer.preprocess(im)
    net.blobs['data'].data[...] = transformed_image
    outputs = net.forward()

    # Index of the most probable class for each of the three softmax outputs.
    prob_label1 = outputs['prob_label1'].argmax()
    prob_label2 = outputs['prob_label2'].argmax()
    prob_label3 = outputs['prob_label3'].argmax()

    plt.imshow(im)
    plt.axis('off')
    plt.show()
    print(' pred results ')
    print('  label1:  %s' % _class_name(label1_names, prob_label1))
    print('  label2:  %s' % _class_name(label2_names, prob_label2))
    print('  label3:  %s' % _class_name(label3_names, prob_label3))
    print('')
    print ''

<h2>Reference</h2>

[1] - caffe实现多label输入(修改源码版)

Last modification:October 9th, 2018 at 09:31 am