detectron/ops/upsample_wsl_op.cc

#include "upsample_wsl_op.h"

#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <>
bool UpsampleBilinearWSLOp<float, CPUContext>::RunOnDevice() {
  const auto& X = Input(0);
  const auto& Z = Input(1);
  auto* Y = Output(0);

  const int batch_size = X.dim32(0);
  const int num_channels = X.dim32(1);
  const int input_height = X.dim32(2);
  const int input_width = X.dim32(3);
  //int output_width = input_width * width_scale_;
  //int output_height = input_height * height_scale_;
  int output_width = Z.dim32(3);
  int output_height = Z.dim32(2);
  Y->Resize(batch_size, num_channels, output_height, output_width);

  const float* input = X.data<float>();
  float* output = Y->mutable_data<float>();
  int channels = num_channels * batch_size;

  const float rheight = (output_height > 1)
      ? (float)(input_height - 1) / (output_height - 1)
      : 0.f;
  const float rwidth =
      (output_width > 1) ? (float)(input_width - 1) / (output_width - 1) : 0.f;
  for (int h2 = 0; h2 < output_height; ++h2) {
    const float h1r = rheight * h2;
    const int h1 = h1r;
    const int h1p = (h1 < input_height - 1) ? 1 : 0;
    const float h1lambda = h1r - h1;
    const float h0lambda = (float)1. - h1lambda;
    for (int w2 = 0; w2 < output_width; ++w2) {
      const float w1r = rwidth * w2;
      const int w1 = w1r;
      const int w1p = (w1 < input_width - 1) ? 1 : 0;
      const float w1lambda = w1r - w1;
      const float w0lambda = (float)1. - w1lambda;
      const float* Xdata = &input[h1 * input_width + w1];
      float* Ydata = &output[h2 * output_width + w2];
      for (int c = 0; c < channels; ++c) {
        Ydata[0] = h0lambda * (w0lambda * Xdata[0] + w1lambda * Xdata[w1p]) +
            h1lambda *
                (w0lambda * Xdata[h1p * input_width] +
                 w1lambda * Xdata[h1p * input_width + w1p]);
        Xdata += input_width * input_height;
        Ydata += output_width * output_height;
      }
    }
  }

  return true;
}

template <>
bool UpsampleBilinearWSLGradientOp<float, CPUContext>::RunOnDevice() {
  const auto& dY = Input(0);
  const auto& X = Input(1);
  auto* dX = Output(0);

  const auto& inputDims = dY.dims();
  CAFFE_ENFORCE_EQ(4, inputDims.size());
  const int batch_size = dY.dim32(0);
  const int num_channels = dY.dim32(1);
  const int input_height = dY.dim32(2);
  const int input_width = dY.dim32(3);
  const int output_height = X.dim32(2);
  const int output_width = X.dim32(3);
  dX->Resize(batch_size, num_channels, output_height, output_width);
  math::Set<float, CPUContext>(
      dX->numel(), 0.0f, dX->mutable_data<float>(), &context_);

  const float* dYdata = dY.data<float>();
  float* dXdata = dX->mutable_data<float>();
  int channels = num_channels * batch_size;

  const float rheight = (input_height > 1)
      ? (float)(output_height - 1) / (input_height - 1)
      : 0.f;
  const float rwidth =
      (input_width > 1) ? (float)(output_width - 1) / (input_width - 1) : 0.f;

  for (int h2 = 0; h2 < input_height; ++h2) {
    const float h1r = rheight * h2;
    const int h1 = h1r;
    const int h1p = (h1 < output_height - 1) ? 1 : 0;
    const float h1lambda = h1r - h1;
    const float h0lambda = (float)1. - h1lambda;
    for (int w2 = 0; w2 < input_width; ++w2) {
      const float w1r = rwidth * w2;
      const int w1 = w1r;
      const int w1p = (w1 < output_width - 1) ? 1 : 0;
      const float w1lambda = w1r - w1;
      const float w0lambda = (float)1. - w1lambda;
      float* pos1 = &dXdata[h1 * output_width + w1];
      const float* pos2 = &dYdata[h2 * input_width + w2];
      for (int c = 0; c < channels; ++c) {
        pos1[0] += h0lambda * w0lambda * pos2[0];
        pos1[w1p] += h0lambda * w1lambda * pos2[0];
        pos1[h1p * output_width] += h1lambda * w0lambda * pos2[0];
        pos1[h1p * output_width + w1p] += h1lambda * w1lambda * pos2[0];
        pos1 += output_width * output_height;
        pos2 += input_width * input_height;
      }
    }
  }

  return true;
}

REGISTER_CPU_OPERATOR(UpsampleBilinearWSL, UpsampleBilinearWSLOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    UpsampleBilinearWSLGradient,
    UpsampleBilinearWSLGradientOp<float, CPUContext>);

// Input: X, output: Y
OPERATOR_SCHEMA(UpsampleBilinearWSL)
    .NumInputs(2)
    .NumOutputs(1)
    .Arg("width_scale", "Scale along width dimension")
    .Arg("height_scale", "Scale along height dimension")
    .SetDoc(R"DOC(
Resizes the spatial dimensions of the input using bilinear
interpolation. The `width_scale` and `height_scale` arguments
control the size of the output, which is given by:
output_width = floor(input_width * width_scale)
output_height = floor(output_height * height_scale)
)DOC")
    .Input(0, "X", "Input tensor")
    .Input(1, "Z", "Input tensor")
    .Output(0, "Y", "Output tensor");

// Input: dY, output: dX
OPERATOR_SCHEMA(UpsampleBilinearWSLGradient)
    .NumInputs(2)
    .NumOutputs(1)
    .Arg("width_scale", "Scale along width dimension")
    .Arg("height_scale", "Scale along height dimension");

class GetUpsampleBilinearWSLGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "UpsampleBilinearWSLGradient",
        "",
        vector<string>{GO(0), I(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(UpsampleBilinearWSL, GetUpsampleBilinearWSLGradient);

} // namespace caffe2