bucketize_op.cu
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/bucketize_op.h"
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
namespace caffe2 {
__global__ void BucketizeOpKernel(
const int N,
const int M,
const float* bounds,
const float* X,
int32_t* out) {
CUDA_1D_KERNEL_LOOP(i, N) {
int32_t low = -1, high = M;
while (high - low > 1) {
const int32_t median = low + (high - low) / 2;
if (bounds[median] < X[i]) {
low = median;
} else {
high = median;
}
}
out[i] = high;
}
}
template <>
bool BucketizeOp<CUDAContext>::RunOnDevice() {
  auto& input = Input(X);
  CAFFE_ENFORCE_GE(input.dim(), 1);
  auto N = input.numel();
  auto* output = Output(INDICES, input.sizes(), at::dtype<int32_t>());
  const auto* input_data = input.template data<float>();
  auto* output_data = output->template mutable_data<int32_t>();

  // Launch one thread per element; boundaries_device_ holds the sorted
  // "boundaries" argument on the device.
  BucketizeOpKernel<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context_.cuda_stream()>>>(
      N,
      boundaries_device_.numel(),
      boundaries_device_.data<float>(),
      input_data,
      output_data);
  C10_CUDA_KERNEL_LAUNCH_CHECK();

  return true;
}

REGISTER_CUDA_OPERATOR(Bucketize, BucketizeOp<CUDAContext>);

} // namespace caffe2
using BucketizeCUDA = caffe2::BucketizeOp<caffe2::CUDAContext>;
C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(
    Bucketize,
    BucketizeCUDA);
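For reference, the kernel's per-element search is the same computation as std::lower_bound over the boundary array: out[i] is the number of boundaries strictly less than X[i]. The following minimal host-side sketch (a hypothetical bucketize_reference helper, not part of this file or of Caffe2) illustrates the semantics:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical host-side mirror of BucketizeOpKernel's per-element search.
int32_t bucketize_reference(const std::vector<float>& bounds, float x) {
  // Index of the first boundary >= x, matching the device loop's `high`.
  return static_cast<int32_t>(
      std::lower_bound(bounds.begin(), bounds.end(), x) - bounds.begin());
}

int main() {
  const std::vector<float> bounds = {1.0f, 2.5f, 4.0f};
  assert(bucketize_reference(bounds, 0.5f) == 0); // below every boundary
  assert(bucketize_reference(bounds, 2.5f) == 1); // a tie maps to that boundary's index
  assert(bucketize_reference(bounds, 3.0f) == 2); // falls in [2.5, 4.0)
  assert(bucketize_reference(bounds, 5.0f) == 3); // above every boundary
  return 0;
}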