elemenntwise_rtc_gpu.cc
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/cuda_rtc/common_rtc.h"
namespace caffe2 {
namespace {
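// Builds, names, and compiles a single elementwise kernel at runtime via
// NVRTC; each instance uses a unique kernel name so that separately compiled
// functions do not collide.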
class ElementwiseRTCFunction
: public CudaRTCFunction<ElementwiseRTCFunction> {
public:
ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}
template <typename... Args>
string KernelName(Args... /*args*/) {
return name_;
}
template <typename... Args>
string GetSource(Args... args);
private:
string name_;
};
template<>
string ElementwiseRTCFunction::GetSource(
int input_size, int output_size,
const string command_string) {
std::stringstream ss;
ss << "extern \"C\" __global__ void " << name_ <<
"(const size_t nthreads, \n";
  // Insert the parameter list. Pre-decrement so that a comma follows every
  // parameter except the last one (post-decrement would leave a trailing
  // comma before the closing parenthesis).
  int remain_params = input_size + output_size;
  for (int i = 0; i < input_size; ++i) {
    ss << "const float* in" << i << ((--remain_params) ? ", \n" : "");
  }
  for (int i = 0; i < output_size; ++i) {
    ss << "float* out" << i << ((--remain_params) ? ", \n" : "");
  }
ss << ") {\n"
"for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n"
"index < nthreads; index += blockDim.x * gridDim.x) {\n"
<< command_string << "\n"
<< "}\n}";
return ss.str();
}
} // namespace
/**
* A GPU operator that can generate limited elementwise operations.
*
* ElementwiseRTCOp allows one to do a simple and limited thing: it takes in
* multiple inputs and multiple outputs, as well as a raw string argument
* rtc_src. The runtime then generates the following kernel code:
*
* __global__ void kernel_name(const size_t nthreads, ...) {
* for(int index = blockIdx.x * blockDim.x + threadIdx.x;
* index < nthreads; index += blockDim.x * gridDim.x) {
* rtc_src
* }
* }
 * where the "..." part is auto-generated, so one can refer to the inputs and
 * outputs as in0, in1, ..., out0, out1, ... in the rtc_src string.
 *
 * For example, if one wants to do an elementwise vector multiplication, one
 * can take two inputs and one output, and write rtc_src as
 *   out0[index] = in0[index] * in1[index];
*
* This op is currently highly experimental. We do not have a gradient
* registered for it either.
*/
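// Illustrative usage (blob names are hypothetical, not from this file): a
// NetDef entry that multiplies two vectors elementwise could look like
//
//   op {
//     input: "X"
//     input: "Y"
//     output: "Z"
//     type: "ElementwiseRTC"
//     engine: "NVRTC"
//     arg { name: "rtc_src" s: "out0[index] = in0[index] * in1[index];" }
//   }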
class ElementwiseRTCOp final : public Operator<CUDAContext> {
public:
ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws) {
const string src = OperatorBase::GetSingleArgument<string>(
"rtc_src", "");
CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size.");
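    // Compilation happens once here, at construction time; RunOnDevice only
    // launches the already-compiled function.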
func_.Compile(InputSize(), OutputSize(), src);
}
~ElementwiseRTCOp() override {}
bool RunOnDevice() override {
static_assert(sizeof(void*) == sizeof(size_t),
"The argbuffer relies on the assumption that void* and "
"size_t have the same size.");
vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1);
size_t* argBuffer = argBuffer_vec.data();
CAFFE_ENFORCE(
Input(0).numel() < std::numeric_limits<int>::max(),
"The kernel function currently only supports int index.");
argBuffer[0] = Input(0).numel();
void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1);
for (int i = 0; i < InputSize(); ++i) {
ptr_buffer[i] = const_cast<float*>(Input(i).data<float>());
}
for (int i = 0; i < OutputSize(); ++i) {
Output(i)->ResizeLike(Input(0));
ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>();
}
    // Size of the packed buffer in bytes; sizeof(argBuffer) alone would give
    // the size of the pointer, not of the buffer it points to.
    size_t argBufferSize = argBuffer_vec.size() * sizeof(size_t);
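    // CU_LAUNCH_PARAM_* is the CUDA driver convention for passing a packed
    // argument buffer through cuLaunchKernel's `extra` parameter instead of
    // individual kernel parameters; LaunchEx presumably forwards it there.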
void* config[] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize,
CU_LAUNCH_PARAM_END
};
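    // One-dimensional launch: CAFFE_GET_BLOCKS / CAFFE_CUDA_NUM_THREADS size
    // the grid to Input(0), and the grid-stride loop in the generated kernel
    // covers any remainder.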
func_.LaunchEx(
CAFFE_GET_BLOCKS(Input(0).numel()),
1,
1,
CAFFE_CUDA_NUM_THREADS,
1,
1,
0,
context_.cuda_stream(),
config);
return true;
}
private:
ElementwiseRTCFunction func_;
};
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
} // namespace
} // namespace caffe2