Adding conv layer optimizations #169

Open · wants to merge 2 commits into base: pytorch-sparse-workload
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -139,7 +139,7 @@ option(USE_HB "Use HammerBlade" ON)
option(USE_HB_EMUL "Use HammerBlade Emulation Layer" OFF)
option(PROFILE_ATEN "Turn on ATen kernel profiling" ON)
cmake_dependent_option(
- HB_REDISPATCH "Enable conditional redispatch to HB" OFF
+ HB_REDISPATCH "Enable conditional redispatch to HB" ON
"PROFILE_ATEN;USE_HB" OFF)
cmake_dependent_option(
HB_ENABLE_KERNEL_LOG "Enable HammerBlade kernel call logging" OFF
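A note on the change above: cmake_dependent_option only exposes HB_REDISPATCH when its condition "PROFILE_ATEN;USE_HB" evaluates true; otherwise the variable is forced to the trailing OFF, which is why that final OFF stays untouched while the dependent default flips to ON. A rough CMake equivalent of the new behaviour, given as a sketch rather than the macro's actual implementation, is:

if(PROFILE_ATEN AND USE_HB)
  option(HB_REDISPATCH "Enable conditional redispatch to HB" ON)
else()
  set(HB_REDISPATCH OFF CACHE INTERNAL "Enable conditional redispatch to HB")
endif()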
94 changes: 86 additions & 8 deletions aten/src/ATen/native/hammerblade/Convolution.cpp
@@ -2,6 +2,8 @@
#include <ATen/native/hammerblade/HammerBladeTensor.h>
#include <ATen/native/hammerblade/Offload.h>

#define ENABLE_SYSTOLIC 0

namespace at {
namespace native {

@@ -143,7 +145,7 @@ static void convolution_shape_check(
Tensor hb_convolution_forward(
CheckedFrom c,
const TensorArg& input, const TensorArg& weight,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
int64_t groups) {
checkAllSameType(c, {input, weight});
checkAllSameHB(c, {input, weight});
@@ -159,7 +161,7 @@ Tensor hb_convolution_forward(

// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{ output_t, "result", 0 };
convolution_shape_check(c, input, weight, output, padding, stride,
convolution_shape_check(c, input, weight, output, padding, stride,
dilation, groups);

Tensor weight_contig = weight->contiguous();
@@ -174,15 +176,45 @@ Tensor hb_convolution_forward(
device_args.push_back(create_device_vector(padding, true, device_ptrs));
device_args.push_back(create_device_vector(stride, true, device_ptrs));

/*
TORCH_CHECK((*input).size(3) == (*input).size(2), "we only support square imap\n");
TORCH_CHECK((*weight).size(3) == 3, "we only support 3x3 filter\n");
TORCH_CHECK((*weight).size(3) == (*weight).size(2), "we only support square filter\n");
*/

// Use a specialized ResNet kernel for 3x3 or 1x1 filters on the supported
// square input sizes; otherwise fall back to the generic convolution kernel.
std::string kernel_name;
if ((*weight).size(3) != 3 && (*weight).size(3) != 1) {
kernel_name = "tensorlib_convolution_forward";
} else {
switch((*input).size(3)) {
case 32:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32" : "tensorlib_conv_resnet_32_1x1_32x32";
break;
case 16:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16" : "tensorlib_conv_resnet_32_1x1_16x16";
break;
case 8:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8" : "tensorlib_conv_resnet_32_1x1_8x8";
break;
case 4:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4" : "tensorlib_conv_resnet_32_1x1_4x4";
break;
default:
kernel_name = "tensorlib_convolution_forward";
// TORCH_CHECK(false, "we only support 32x32, 16x16, 8x8, and 4x4 imap\n");
break;
}
}

c10::hammerblade::offload_kernel(
"tensorlib_convolution_forward", device_args);
kernel_name.c_str(), device_args);
cleanup_device(device_args, device_ptrs);

return *output;
}

// In-place!
void hb_convolution_add_bias_(CheckedFrom c, const TensorArg& output,
void hb_convolution_add_bias_(CheckedFrom c, const TensorArg& output,
const TensorArg& bias) {
checkAllSameType(c, {output, bias});
checkAllSameHB(c, {output, bias});
@@ -222,8 +254,31 @@ Tensor hb_convolution_backward_input(
device_args.push_back(create_device_vector(padding, true, device_ptrs));
device_args.push_back(create_device_vector(stride, true, device_ptrs));

// Same size-based dispatch as the forward path, keyed on the gradient's
// spatial size and the filter width.
std::string kernel_name;
if (weight_contig.size(3) != 3 && weight_contig.size(3) != 1) {
kernel_name = "tensorlib_convolution_backward_input";
} else {
switch((*grad_output).size(3)) {
case 32:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32_back_input" : "tensorlib_conv_resnet_32_1x1_32x32_back_input";
break;
case 16:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16_back_input" : "tensorlib_conv_resnet_32_1x1_16x16_back_input";
break;
case 8:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8_back_input" : "tensorlib_conv_resnet_32_1x1_8x8_back_input";
break;
case 4:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4_back_input" : "tensorlib_conv_resnet_32_1x1_4x4_back_input";
break;
default:
kernel_name = "tensorlib_convolution_backward_input";
break;
}
}

c10::hammerblade::offload_kernel(
"tensorlib_convolution_backward_input", device_args);
kernel_name.c_str(), device_args);
cleanup_device(device_args, device_ptrs);

return *grad_input;
@@ -249,13 +304,36 @@ Tensor hb_convolution_backward_weight(
std::vector<eva_t> device_args;
std::vector<eva_t> device_ptrs;
device_args.push_back(create_device_tensor(*grad_weight, device_ptrs));
device_args.push_back(create_device_tensor(*grad_output, device_ptrs));
device_args.push_back(create_device_tensor(*input, device_ptrs));
device_args.push_back(create_device_tensor(*grad_output, device_ptrs));
device_args.push_back(create_device_vector(padding, true, device_ptrs));
device_args.push_back(create_device_vector(stride, true, device_ptrs));

// Same size-based dispatch for the weight gradient, keyed on the input's
// spatial size and the filter width.
std::string kernel_name;
if ((*grad_weight).size(3) != 3 && (*grad_weight).size(3) != 1) {
kernel_name = "tensorlib_convolution_backward_weight";
} else {
switch((*input).size(3)) {
case 32:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32_back_weight" : "tensorlib_conv_resnet_32_1x1_32x32_back_weight";
break;
case 16:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16_back_weight" : "tensorlib_conv_resnet_32_1x1_16x16_back_weight";
break;
case 8:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8_back_weight" : "tensorlib_conv_resnet_32_1x1_8x8_back_weight";
break;
case 4:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4_back_weight" : "tensorlib_conv_resnet_32_1x1_4x4_back_weight";
break;
default:
kernel_name = "tensorlib_convolution_backward_weight";
break;
}
}

c10::hammerblade::offload_kernel(
"tensorlib_convolution_backward_weight", device_args);
kernel_name.c_str(), device_args);
cleanup_device(device_args, device_ptrs);

return grad_weight_t;
@@ -275,7 +353,7 @@ Tensor hb_convolution_transpose(

Tensor hb_convolution(
const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t,
IntArrayRef padding, IntArrayRef stride,
IntArrayRef padding, IntArrayRef stride,
IntArrayRef dilation, int64_t groups) {
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 },
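The same size-based kernel selection appears three times above (forward, backward input, backward weight). One possible cleanup, sketched here under the assumption that the kernel names stay exactly as spelled in this diff, is a small helper that assembles the name from the filter width, the spatial size, and a per-direction suffix; resnet_kernel_name below is illustrative and not part of the PR:

// Illustrative only: builds "tensorlib_conv_resnet_32_{3x3|1x1}_{NxN}{suffix}"
// for N in {32, 16, 8, 4}, or returns the generic fallback kernel name.
static std::string resnet_kernel_name(
    int64_t filter_dim, int64_t spatial_dim,
    const std::string& suffix, const std::string& fallback) {
  if ((filter_dim != 3 && filter_dim != 1) ||
      (spatial_dim != 32 && spatial_dim != 16 &&
       spatial_dim != 8 && spatial_dim != 4)) {
    return fallback;
  }
  const std::string size = std::to_string(spatial_dim);
  const std::string filter = (filter_dim == 3) ? "3x3" : "1x1";
  return "tensorlib_conv_resnet_32_" + filter + "_" + size + "x" + size + suffix;
}

With such a helper the forward path would reduce to resnet_kernel_name((*weight).size(3), (*input).size(3), "", "tensorlib_convolution_forward"), and the two backward paths would pass "_back_input" and "_back_weight" as the suffix.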
140 changes: 140 additions & 0 deletions hammerblade/torch/kernel/kernel_conv_baseline.cpp
@@ -0,0 +1,140 @@
//====================================================================
// SPMD 2D Convolution
// The idea is that each tile receives a piece of the output image that
// does not overlap with any other tile's work
// 10/02/2020 Lin Cheng
//====================================================================

#define BLOCK_DIM_X 14
#define BLOCK_DIM_Y 14
#define FILTER_DIM 5
#define NUM_FILTERS 6

#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1)
#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1)

#include <kernel_common.hpp>
#include <kernel_conv_baseline.hpp>


extern "C" {

__attribute__ ((noinline)) int tensorlib_conv_baseline(
hb_tensor_t* output,
hb_tensor_t* input,
hb_tensor_t* weight,
hb_vector_t* padding,
hb_vector_t* strides) {

HBTensor<float, 4> omap(output);
HBTensor<float, 4> imap(input);
HBTensor<float, 4> filter(weight);

// Conv2d parameters
auto N = omap.dim(0); // number of images in batch
auto Cout = omap.dim(1); // number of output channels
auto Hout = omap.dim(2);
auto Wout = omap.dim(3);
auto Cin = imap.dim(1); // number of input channels
auto Hin = imap.dim(2);
auto Win = imap.dim(3);
auto Hk = filter.dim(2);
auto Wk = filter.dim(3);

size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y;
size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X;
if (Hout % BLOCK_DIM_Y != 0) {
h_blocks_per_out_channel++;
}
if (Wout % BLOCK_DIM_X != 0) {
w_blocks_per_out_channel++;
}
size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel;
size_t num_blocks = N * Cout * blocks_per_out_channel;

float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B
float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B
float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B

// cross check
hb_assert(FILTER_DIM == Hk);
hb_assert(FILTER_DIM == Wk);
hb_assert(NUM_FILTERS == Cout);


auto filterDMA = [&](size_t filter_id, size_t channel_id) {
float* filter_src_base = (float*)filter.data_ptr();
uint32_t* filter_src_strides = filter.get_strides();
filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1];
fill_filter_buffer<FILTER_DIM>(filter_src_base, filter_buf);
};

auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) {
size_t imap_x = block_x * BLOCK_DIM_X;
size_t imap_y = block_y * BLOCK_DIM_Y;
float* imap_src_base = (float*)imap.data_ptr();
uint32_t* imap_src_strides = imap.get_strides();
imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1];
imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3];
size_t y_step = imap_src_strides[2];
fill_imap_buffer<IMAP_DIM_X, IMAP_DIM_Y>(imap_src_base, imap_buf, y_step);
};

auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) {
size_t omap_x = block_x * BLOCK_DIM_X;
size_t omap_y = block_y * BLOCK_DIM_Y;
float* omap_src_base = (float*)omap.data_ptr();
uint32_t* omap_src_strides = omap.get_strides();
omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1];
omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3];
size_t y_step = omap_src_strides[2];
drain_omap_buffer<BLOCK_DIM_X, BLOCK_DIM_Y>(omap_buf, omap_src_base, y_step);
};

bsg_cuda_print_stat_kernel_start();

// main loop
for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) {
if (idx < num_blocks) {

// figure out what we are producing
size_t tmp = idx;
size_t image_id = tmp / (Cout * blocks_per_out_channel);
tmp = tmp % (Cout * blocks_per_out_channel);
size_t filter_id = tmp / blocks_per_out_channel;
tmp = tmp % blocks_per_out_channel;
size_t block_y = tmp / w_blocks_per_out_channel;
size_t block_x = tmp % w_blocks_per_out_channel;

// reset output buffer
reset_buffer<BLOCK_DIM_X, BLOCK_DIM_Y>(omap_buf);

for (size_t channel_id = 0; channel_id < Cin; channel_id++) {

// read in the image
imapDMA(image_id, channel_id, block_x, block_y);

// read in the filter
filterDMA(filter_id, channel_id);

// do conv
conv2d_5x5(imap_buf, filter_buf, omap_buf);

} // channel

// write omap back
omapDMA(image_id, filter_id, block_x, block_y);

} // if (idx < num_blocks)
} // main loop

bsg_cuda_print_stat_kernel_end();

g_barrier.sync();
return 0;
}

HB_EMUL_REG_KERNEL(tensorlib_conv_baseline, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*,
hb_vector_t*, hb_vector_t*)

}
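The helpers used above (fill_filter_buffer, fill_imap_buffer, drain_omap_buffer, reset_buffer, conv2d_5x5) live in kernel_conv_baseline.hpp, which is not part of this diff. For orientation, the compute step is an ordinary direct convolution of the buffered 18x18 input block against one 5x5 filter, accumulated into the 14x14 output block; the routine below is a minimal sketch of that idea, not the header's actual implementation:

// Sketch only: accumulate one input channel's contribution into the output block.
// imap_buf is IMAP_DIM_Y x IMAP_DIM_X (18x18), filter_buf is FILTER_DIM x FILTER_DIM (5x5),
// omap_buf is BLOCK_DIM_Y x BLOCK_DIM_X (14x14); all buffers are row-major.
inline void conv2d_5x5_sketch(const float* imap_buf, const float* filter_buf, float* omap_buf) {
  for (int y = 0; y < BLOCK_DIM_Y; y++) {
    for (int x = 0; x < BLOCK_DIM_X; x++) {
      float acc = omap_buf[y * BLOCK_DIM_X + x];
      for (int ky = 0; ky < FILTER_DIM; ky++) {
        for (int kx = 0; kx < FILTER_DIM; kx++) {
          acc += imap_buf[(y + ky) * IMAP_DIM_X + (x + kx)]
               * filter_buf[ky * FILTER_DIM + kx];
        }
      }
      omap_buf[y * BLOCK_DIM_X + x] = acc;
    }
  }
}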