diff --git a/CMakeLists.txt b/CMakeLists.txt index 25fbaf1a4b..cc9787394d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,7 @@ option(USE_HB "Use HammerBlade" ON) option(USE_HB_EMUL "Use HammerBlade Emulation Layer" OFF) option(PROFILE_ATEN "Turn on ATen kernel profiling" ON) cmake_dependent_option( - HB_REDISPATCH "Enable conditional redispatch to HB" OFF + HB_REDISPATCH "Enable conditional redispatch to HB" ON "PROFILE_ATEN;USE_HB" OFF) cmake_dependent_option( HB_ENABLE_KERNEL_LOG "Enable HammerBlade kernel call logging" OFF diff --git a/aten/src/ATen/native/hammerblade/Convolution.cpp b/aten/src/ATen/native/hammerblade/Convolution.cpp index 7f16c5eb1d..d6da7e9add 100644 --- a/aten/src/ATen/native/hammerblade/Convolution.cpp +++ b/aten/src/ATen/native/hammerblade/Convolution.cpp @@ -2,6 +2,8 @@ #include #include +#define ENABLE_SYSTOLIC 0 + namespace at { namespace native { @@ -143,7 +145,7 @@ static void convolution_shape_check( Tensor hb_convolution_forward( CheckedFrom c, const TensorArg& input, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { checkAllSameType(c, {input, weight}); checkAllSameHB(c, {input, weight}); @@ -159,7 +161,7 @@ Tensor hb_convolution_forward( // Avoid ambiguity of "output" when this is being used as backwards TensorArg output{ output_t, "result", 0 }; - convolution_shape_check(c, input, weight, output, padding, stride, + convolution_shape_check(c, input, weight, output, padding, stride, dilation, groups); Tensor weight_contig = weight->contiguous(); @@ -174,15 +176,45 @@ Tensor hb_convolution_forward( device_args.push_back(create_device_vector(padding, true, device_ptrs)); device_args.push_back(create_device_vector(stride, true, device_ptrs)); + /* + TORCH_CHECK((*input).size(3) == (*input).size(2), "we only support square imap\n"); + TORCH_CHECK((*weight).size(3) == 3, "we only support 3x3 filter\n"); + TORCH_CHECK((*weight).size(3) == (*weight).size(2), "we only support square filter\n"); + */ + + string kernel_name; + if ((*weight).size(3) != 3 && (*weight).size(3) != 1) { + kernel_name = "tensorlib_convolution_forward"; + } else { + switch((*input).size(3)) { + case 32: + kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32" : "tensorlib_conv_resnet_32_1x1_32x32"; + break; + case 16: + kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16" : "tensorlib_conv_resnet_32_1x1_16x16"; + break; + case 8: + kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8" : "tensorlib_conv_resnet_32_1x1_8x8"; + break; + case 4: + kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4" : "tensorlib_conv_resnet_32_1x1_4x4"; + break; + default: + kernel_name = "tensorlib_convolution_forward"; + // TORCH_CHECK(false, "we only support 32x32, 16x16, 8x8, and 4x4 imap\n"); + break; + } + } + c10::hammerblade::offload_kernel( - "tensorlib_convolution_forward", device_args); + kernel_name.c_str(), device_args); cleanup_device(device_args, device_ptrs); return *output; } // In-place! 
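The forward, backward-input, and backward-weight paths in this file all pick a shape-specialized kernel when the filter is 3x3 or 1x1 and the spatial size is 32, 16, 8, or 4, and otherwise fall back to the generic tensorlib_convolution_* kernel. The three switch blocks differ only in the kernel-name suffix, so the selection can be written once; the helper below is a minimal sketch of that logic and is not part of the patch (select_conv_kernel and its suffix argument are hypothetical names, and it assumes <string> is available):

// Sketch only: mirrors the dispatch switches in this file.
// select_conv_kernel is a hypothetical helper, not something the patch defines.
static std::string select_conv_kernel(int64_t filter_dim, int64_t imap_dim,
                                      const std::string& generic,
                                      const std::string& suffix = "") {
  if (filter_dim != 3 && filter_dim != 1) {
    return generic;                     // unsupported filter size -> generic kernel
  }
  const char* f = (filter_dim == 3) ? "3x3" : "1x1";
  switch (imap_dim) {                   // specialized kernels exist only for these sizes
    case 32: case 16: case 8: case 4:
      return "tensorlib_conv_resnet_32_" + std::string(f) + "_" +
             std::to_string(imap_dim) + "x" + std::to_string(imap_dim) + suffix;
    default:
      return generic;                   // anything else falls back
  }
}

The forward path would then read kernel_name = select_conv_kernel((*weight).size(3), (*input).size(3), "tensorlib_convolution_forward"), and the backward paths would pass "_back_input" or "_back_weight" as the suffix.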
-void hb_convolution_add_bias_(CheckedFrom c, const TensorArg& output, +void hb_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const TensorArg& bias) { checkAllSameType(c, {output, bias}); checkAllSameHB(c, {output, bias}); @@ -222,8 +254,31 @@ Tensor hb_convolution_backward_input( device_args.push_back(create_device_vector(padding, true, device_ptrs)); device_args.push_back(create_device_vector(stride, true, device_ptrs)); + string kernel_name; + if ((weight_contig).size(3) != 3 && (weight_contig).size(3) != 1) { + kernel_name = "tensorlib_convolution_backward_input"; + } else { + switch((*grad_output).size(3)) { + case 32: + kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32_back_input" : "tensorlib_conv_resnet_32_1x1_32x32_back_input"; + break; + case 16: + kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16_back_input" : "tensorlib_conv_resnet_32_1x1_16x16_back_input"; + break; + case 8: + kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8_back_input" : "tensorlib_conv_resnet_32_1x1_8x8_back_input"; + break; + case 4: + kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4_back_input" : "tensorlib_conv_resnet_32_1x1_4x4_back_input"; + break; + default: + kernel_name = "tensorlib_convolution_backward_input"; + break; + } + } + c10::hammerblade::offload_kernel( - "tensorlib_convolution_backward_input", device_args); + kernel_name.c_str(), device_args); cleanup_device(device_args, device_ptrs); return *grad_input; @@ -249,13 +304,36 @@ Tensor hb_convolution_backward_weight( std::vector device_args; std::vector device_ptrs; device_args.push_back(create_device_tensor(*grad_weight, device_ptrs)); - device_args.push_back(create_device_tensor(*grad_output, device_ptrs)); device_args.push_back(create_device_tensor(*input, device_ptrs)); + device_args.push_back(create_device_tensor(*grad_output, device_ptrs)); device_args.push_back(create_device_vector(padding, true, device_ptrs)); device_args.push_back(create_device_vector(stride, true, device_ptrs)); + string kernel_name; + if ((*grad_weight).size(3) != 3 && (*grad_weight).size(3) != 1) { + kernel_name = "tensorlib_convolution_backward_weight"; + } else { + switch((*input).size(3)) { + case 32: + kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32_back_weight" : "tensorlib_conv_resnet_32_1x1_32x32_back_weight"; + break; + case 16: + kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16_back_weight" : "tensorlib_conv_resnet_32_1x1_16x16_back_weight"; + break; + case 8: + kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8_back_weight" : "tensorlib_conv_resnet_32_1x1_8x8_back_weight"; + break; + case 4: + kernel_name = ((*grad_weight).size(3) == 3) ? 
"tensorlib_conv_resnet_32_3x3_4x4_back_weight" : "tensorlib_conv_resnet_32_1x1_4x4_back_weight"; + break; + default: + kernel_name = "tensorlib_convolution_backward_weight"; + break; + } + } + c10::hammerblade::offload_kernel( - "tensorlib_convolution_backward_weight", device_args); + kernel_name.c_str(), device_args); cleanup_device(device_args, device_ptrs); return grad_weight_t; @@ -275,7 +353,7 @@ Tensor hb_convolution_transpose( Tensor hb_convolution( const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t, - IntArrayRef padding, IntArrayRef stride, + IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups) { TensorArg input { input_t, "input", 1 }, weight { weight_t, "weight", 2 }, diff --git a/hammerblade/torch/kernel/kernel_conv_baseline.cpp b/hammerblade/torch/kernel/kernel_conv_baseline.cpp new file mode 100644 index 0000000000..5663e314a3 --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_baseline.cpp @@ -0,0 +1,140 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define BLOCK_DIM_X 14 +#define BLOCK_DIM_Y 14 +#define FILTER_DIM 5 +#define NUM_FILTERS 6 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_baseline( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + if (Hout % BLOCK_DIM_Y != 0) { + h_blocks_per_out_channel++; + } + if (Wout % BLOCK_DIM_X != 0) { + w_blocks_per_out_channel++; + } + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(NUM_FILTERS == Cout); + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y 
* imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_kernel_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + imapDMA(image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_5x5(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_cuda_print_stat_kernel_end(); + + g_barrier.sync(); + return 0; + } + + HB_EMUL_REG_KERNEL(tensorlib_conv_baseline, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} \ No newline at end of file diff --git a/hammerblade/torch/kernel/kernel_conv_baseline.hpp b/hammerblade/torch/kernel/kernel_conv_baseline.hpp new file mode 100644 index 0000000000..183be96a1a --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_baseline.hpp @@ -0,0 +1,608 @@ +//==================================================================== +// Helper functions for SPMD 2D Conv +// 10/06/2020 Lin Cheng +//==================================================================== + +// templated unrolling code by Krithik Ranjan +template +struct Unroll { + inline static void reset_buffer(T* buf); + inline static void fill_buffer(T* src, T* buf); + inline static void fill_buffer_rotate(T* src, T* buf); + inline static void drain_buffer(T* buf, T* dest); +}; + +template +inline void Unroll::reset_buffer(T* buf) { + buf[N] = 0; + Unroll::reset_buffer(buf); +} + +template +inline void Unroll::fill_buffer(T* src, T* buf) { + buf[N] = ((bsg_attr_remote T*) src)[N]; + Unroll::fill_buffer(src, buf); +} + +template +inline void Unroll::fill_buffer_rotate(T* src, T* buf) { + *buf = ((bsg_attr_remote T*) src)[N]; + Unroll::fill_buffer_rotate(src, buf+1); +} + +template +inline void Unroll::drain_buffer(T* buf, T* dest) { + ((bsg_attr_remote T*) dest)[N] = buf[N]; + Unroll::drain_buffer(buf, dest); +} + +template +struct Unroll<0, T> { + inline static void reset_buffer(T* buf); + inline static void fill_buffer(T* src, T* buf); + inline static void fill_buffer_rotate(T* src, T* buf); + inline static void drain_buffer(T* buf, T* dest); +}; + +template +inline void Unroll<0, T>::reset_buffer(T* buf) { + buf[0] = 0; +} + +template +inline void 
Unroll<0, T>::fill_buffer(T* src, T* buf) { + buf[0] = ((bsg_attr_remote T*) src)[0]; +} + +template +inline void Unroll<0, T>::fill_buffer_rotate(T* src, T* buf) { + *buf = ((bsg_attr_remote T*) src)[0]; +} +template +inline void Unroll<0, T>::drain_buffer(T* buf, T* dest) { + ((bsg_attr_remote T*) dest)[0] = buf[0]; +} + +// conv related helpers + +template +inline void reset_buffer(float* buf) { + for (size_t i = 0; i < DIM_Y; i++) { + Unroll::reset_buffer(buf); + buf += DIM_X; + } +} + +template +inline void fill_filter_buffer(float* src, float* buf) { + Unroll::fill_buffer(src, buf); +} + +template +inline void fill_filter_buffer_rotate(float* src, float* buf) { + Unroll::fill_buffer_rotate(src, buf); +} + +template +inline void fill_imap_buffer(float* src, float* buf, size_t y_step) { + for (size_t i = 0; i < DIM_Y; i++) { + Unroll::fill_buffer(src, buf); + buf += DIM_X; + src += y_step; + } +} + +template +inline void drain_omap_buffer(float* buf, float* dest, size_t y_step) { + for (size_t i = 0; i < DIM_Y; i++) { + Unroll::drain_buffer(buf, dest); + buf += DIM_X; + dest += y_step; + } +} + + +inline void conv2d_5x5(float* imap, float* filter, float* omap) { + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + for (size_t x = 0; x < BLOCK_DIM_X; x += 7) { + register float psum0 = omap[y * BLOCK_DIM_X + x + 0]; + register float psum1 = omap[y * BLOCK_DIM_X + x + 1]; + register float psum2 = omap[y * BLOCK_DIM_X + x + 2]; + register float psum3 = omap[y * BLOCK_DIM_X + x + 3]; + register float psum4 = omap[y * BLOCK_DIM_X + x + 4]; + register float psum5 = omap[y * BLOCK_DIM_X + x + 5]; + register float psum6 = omap[y * BLOCK_DIM_X + x + 6]; + for (size_t yy = 0; yy < FILTER_DIM; yy++) { + register float filter0 = filter[yy * FILTER_DIM + 0]; + register float filter1 = filter[yy * FILTER_DIM + 1]; + register float filter2 = filter[yy * FILTER_DIM + 2]; + register float filter3 = filter[yy * FILTER_DIM + 3]; + register float filter4 = filter[yy * FILTER_DIM + 4]; + register float imap0 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 0]; + register float imap1 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 1]; + register float imap2 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 2]; + register float imap3 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 3]; + register float imap4 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 4]; + register float imap5 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 5]; + register float imap6 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 6]; + register float imap7 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 7]; + register float imap8 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 8]; + register float imap9 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 9]; + register float imap10 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 10]; + asm volatile("": : :"memory"); + +#ifdef HB_EMUL + psum0 += imap0 * filter0; + psum1 += imap1 * filter0; + psum2 += imap2 * filter0; + psum3 += imap3 * filter0; + psum4 += imap4 * filter0; + psum5 += imap5 * filter0; + psum6 += imap6 * filter0; + + psum0 += imap1 * filter1; + psum1 += imap2 * filter1; + psum2 += imap3 * filter1; + psum3 += imap4 * filter1; + psum4 += imap5 * filter1; + psum5 += imap6 * filter1; + psum6 += imap7 * filter1; + + psum0 += imap2 * filter2; + psum1 += imap3 * filter2; + psum2 += imap4 * filter2; + psum3 += imap5 * filter2; + psum4 += imap6 * filter2; + psum5 += imap7 * filter2; + psum6 += imap8 * filter2; + + psum0 += imap3 * filter3; + psum1 += imap4 * filter3; + psum2 += imap5 * filter3; + psum3 += imap6 * 
filter3; + psum4 += imap7 * filter3; + psum5 += imap8 * filter3; + psum6 += imap9 * filter3; + + psum0 += imap4 * filter4; + psum1 += imap5 * filter4; + psum2 += imap6 * filter4; + psum3 += imap7 * filter4; + psum4 += imap8 * filter4; + psum5 += imap9 * filter4; + psum6 += imap10 * filter4; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap3), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap4), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap5), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap6), "f"(filter0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap4), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap5), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap6), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap7), "f"(filter1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap5), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap6), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap7), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap8), "f"(filter2)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap4), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap5), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap6), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap7), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap8), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap9), "f"(filter3)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap5), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap6), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap7), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap8), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap9), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap10), "f"(filter4)); +#endif + } + omap[y * BLOCK_DIM_X + x + 0] = psum0; + omap[y * BLOCK_DIM_X + x + 1] = psum1; + omap[y * BLOCK_DIM_X + x + 2] = psum2; + omap[y * BLOCK_DIM_X + x + 3] = psum3; + omap[y * BLOCK_DIM_X + x + 4] = psum4; + omap[y * BLOCK_DIM_X + x + 5] = psum5; + omap[y * BLOCK_DIM_X + x + 6] = psum6; + } + } +} + +inline void 
conv2d_5x5_back(float* imap, float* filter, float* omap) { + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float psum0 = omap[y * BLOCK_DIM_X + x + 0]; + register float psum1 = omap[y * BLOCK_DIM_X + x + 1]; + register float psum2 = omap[y * BLOCK_DIM_X + x + 2]; + register float psum3 = omap[y * BLOCK_DIM_X + x + 3]; + register float psum4 = omap[y * BLOCK_DIM_X + x + 4]; + register float psum5 = omap[y * BLOCK_DIM_X + x + 5]; + register float psum6 = omap[y * BLOCK_DIM_X + x + 6]; + register float psum7 = omap[y * BLOCK_DIM_X + x + 7]; + for (size_t yy = 0; yy < FILTER_DIM; yy++) { + register float filter0 = filter[yy * FILTER_DIM + 0]; + register float filter1 = filter[yy * FILTER_DIM + 1]; + register float filter2 = filter[yy * FILTER_DIM + 2]; + register float filter3 = filter[yy * FILTER_DIM + 3]; + register float filter4 = filter[yy * FILTER_DIM + 4]; + register float imap0 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 0]; + register float imap1 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 1]; + register float imap2 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 2]; + register float imap3 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 3]; + register float imap4 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 4]; + register float imap5 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 5]; + register float imap6 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 6]; + register float imap7 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 7]; + register float imap8 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 8]; + register float imap9 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 9]; + register float imap10 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 10]; + register float imap11 = imap[y * IMAP_DIM_X + x + yy * IMAP_DIM_X + 11]; + asm volatile("": : :"memory"); + +#ifdef HB_EMUL + psum0 += imap0 * filter0; + psum1 += imap1 * filter0; + psum2 += imap2 * filter0; + psum3 += imap3 * filter0; + psum4 += imap4 * filter0; + psum5 += imap5 * filter0; + psum6 += imap6 * filter0; + psum7 += imap7 * filter0; + + psum0 += imap1 * filter1; + psum1 += imap2 * filter1; + psum2 += imap3 * filter1; + psum3 += imap4 * filter1; + psum4 += imap5 * filter1; + psum5 += imap6 * filter1; + psum6 += imap7 * filter1; + psum7 += imap8 * filter1; + + psum0 += imap2 * filter2; + psum1 += imap3 * filter2; + psum2 += imap4 * filter2; + psum3 += imap5 * filter2; + psum4 += imap6 * filter2; + psum5 += imap7 * filter2; + psum6 += imap8 * filter2; + psum7 += imap9 * filter2; + + psum0 += imap3 * filter3; + psum1 += imap4 * filter3; + psum2 += imap5 * filter3; + psum3 += imap6 * filter3; + psum4 += imap7 * filter3; + psum5 += imap8 * filter3; + psum6 += imap9 * filter3; + psum7 += imap10 * filter3; + + psum0 += imap4 * filter4; + psum1 += imap5 * filter4; + psum2 += imap6 * filter4; + psum3 += imap7 * filter4; + psum4 += imap8 * filter4; + psum5 += imap9 * filter4; + psum6 += imap10 * filter4; + psum7 += imap11 * filter4; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap3), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap4), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap5), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : 
"f"(imap6), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap7), "f"(filter0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap4), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap5), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap6), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap7), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap8), "f"(filter1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap5), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap6), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap7), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap8), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap9), "f"(filter2)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap4), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap5), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap6), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap7), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap8), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap9), "f"(filter3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap10), "f"(filter3)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap5), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap6), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap7), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap8), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap9), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap10), "f"(filter4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap11), "f"(filter4)); +#endif + } + omap[y * BLOCK_DIM_X + x + 0] = psum0; + omap[y * BLOCK_DIM_X + x + 1] = psum1; + omap[y * BLOCK_DIM_X + x + 2] = psum2; + omap[y * BLOCK_DIM_X + x + 3] = psum3; + omap[y * BLOCK_DIM_X + x + 4] = psum4; + omap[y * BLOCK_DIM_X + x + 5] = psum5; + omap[y * BLOCK_DIM_X + x + 6] = psum6; + omap[y * BLOCK_DIM_X + x + 7] = psum7; + } + } +} + + +template +inline void conv2d_3x3_16(float* imap, float* filter, float* omap) { + for (size_t y = 0; y < B_DIM_Y; y++) { + for (size_t x = 0; x < B_DIM_X; x += 8) { + register float psum0 = omap[y * B_DIM_X + x + 0]; + register float psum1 = omap[y * B_DIM_X + x + 1]; + register float psum2 = omap[y * B_DIM_X + x + 2]; + register float psum3 = omap[y * B_DIM_X + x + 3]; + register float psum4 = omap[y * B_DIM_X + x + 
4]; + register float psum5 = omap[y * B_DIM_X + x + 5]; + register float psum6 = omap[y * B_DIM_X + x + 6]; + register float psum7 = omap[y * B_DIM_X + x + 7]; + for (size_t yy = 0; yy < F_DIM; yy++) { + register float filter0 = filter[yy * F_DIM + 0]; + register float filter1 = filter[yy * F_DIM + 1]; + register float filter2 = filter[yy * F_DIM + 2]; + register float imap0 = imap[y * I_DIM_X + x + yy * I_DIM_X + 0]; + register float imap1 = imap[y * I_DIM_X + x + yy * I_DIM_X + 1]; + register float imap2 = imap[y * I_DIM_X + x + yy * I_DIM_X + 2]; + register float imap3 = imap[y * I_DIM_X + x + yy * I_DIM_X + 3]; + register float imap4 = imap[y * I_DIM_X + x + yy * I_DIM_X + 4]; + register float imap5 = imap[y * I_DIM_X + x + yy * I_DIM_X + 5]; + register float imap6 = imap[y * I_DIM_X + x + yy * I_DIM_X + 6]; + register float imap7 = imap[y * I_DIM_X + x + yy * I_DIM_X + 7]; + register float imap8 = imap[y * I_DIM_X + x + yy * I_DIM_X + 8]; + register float imap9 = imap[y * I_DIM_X + x + yy * I_DIM_X + 9]; + asm volatile("": : :"memory"); + +#ifdef HB_EMUL + psum0 += imap0 * filter0; + psum1 += imap1 * filter0; + psum2 += imap2 * filter0; + psum3 += imap3 * filter0; + psum4 += imap4 * filter0; + psum5 += imap5 * filter0; + psum6 += imap6 * filter0; + psum7 += imap7 * filter0; + + psum0 += imap1 * filter1; + psum1 += imap2 * filter1; + psum2 += imap3 * filter1; + psum3 += imap4 * filter1; + psum4 += imap5 * filter1; + psum5 += imap6 * filter1; + psum6 += imap7 * filter1; + psum7 += imap8 * filter1; + + psum0 += imap2 * filter2; + psum1 += imap3 * filter2; + psum2 += imap4 * filter2; + psum3 += imap5 * filter2; + psum4 += imap6 * filter2; + psum5 += imap7 * filter2; + psum6 += imap8 * filter2; + psum7 += imap9 * filter2; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap3), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap4), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap5), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap6), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap7), "f"(filter0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap4), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap5), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap6), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap7), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap8), "f"(filter1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap5), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap6), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : 
"+f"(psum5) : "f"(imap7), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap8), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap9), "f"(filter2)); +#endif + } + omap[y * B_DIM_X + x + 0] = psum0; + omap[y * B_DIM_X + x + 1] = psum1; + omap[y * B_DIM_X + x + 2] = psum2; + omap[y * B_DIM_X + x + 3] = psum3; + omap[y * B_DIM_X + x + 4] = psum4; + omap[y * B_DIM_X + x + 5] = psum5; + omap[y * B_DIM_X + x + 6] = psum6; + omap[y * B_DIM_X + x + 7] = psum7; + } + } +} + +template +inline void conv2d_3x3_4(float* imap, float* filter, float* omap) { + for (size_t y = 0; y < B_DIM_Y; y++) { + for (size_t x = 0; x < B_DIM_X; x += 4) { + register float psum0 = omap[y * B_DIM_X + x + 0]; + register float psum1 = omap[y * B_DIM_X + x + 1]; + register float psum2 = omap[y * B_DIM_X + x + 2]; + register float psum3 = omap[y * B_DIM_X + x + 3]; + for (size_t yy = 0; yy < F_DIM; yy++) { + register float filter0 = filter[yy * F_DIM + 0]; + register float filter1 = filter[yy * F_DIM + 1]; + register float filter2 = filter[yy * F_DIM + 2]; + register float imap0 = imap[y * I_DIM_X + x + yy * I_DIM_X + 0]; + register float imap1 = imap[y * I_DIM_X + x + yy * I_DIM_X + 1]; + register float imap2 = imap[y * I_DIM_X + x + yy * I_DIM_X + 2]; + register float imap3 = imap[y * I_DIM_X + x + yy * I_DIM_X + 3]; + register float imap4 = imap[y * I_DIM_X + x + yy * I_DIM_X + 4]; + register float imap5 = imap[y * I_DIM_X + x + yy * I_DIM_X + 5]; + asm volatile("": : :"memory"); + +#ifdef HB_EMUL + psum0 += imap0 * filter0; + psum1 += imap1 * filter0; + psum2 += imap2 * filter0; + psum3 += imap3 * filter0; + + psum0 += imap1 * filter1; + psum1 += imap2 * filter1; + psum2 += imap3 * filter1; + psum3 += imap4 * filter1; + + psum0 += imap2 * filter2; + psum1 += imap3 * filter2; + psum2 += imap4 * filter2; + psum3 += imap5 * filter2; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap3), "f"(filter0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(filter1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap4), "f"(filter1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(filter2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap5), "f"(filter2)); +#endif + } + omap[y * B_DIM_X + x + 0] = psum0; + omap[y * B_DIM_X + x + 1] = psum1; + omap[y * B_DIM_X + x + 2] = psum2; + omap[y * B_DIM_X + x + 3] = psum3; + } + } +} + +template +inline void conv2d_1x1_16(float* imap, float* filter, float* omap) { + for (size_t y = 0; y < B_DIM_Y; ++y) { + for (size_t x = 0; x < B_DIM_X; x += 8) { + register float psum0 = omap[y * B_DIM_X + x + 0]; + register float psum1 = omap[y * B_DIM_X + x + 1]; + register float psum2 = omap[y * B_DIM_X + x + 2]; + register float psum3 = omap[y * B_DIM_X + x + 3]; + register float psum4 = omap[y * B_DIM_X + x + 4]; + register float psum5 = omap[y * B_DIM_X + x + 5]; + 
register float psum6 = omap[y * B_DIM_X + x + 6]; + register float psum7 = omap[y * B_DIM_X + x + 7]; + + register float filter0 = filter[0]; + register float imap0 = imap[y * B_DIM_X + x + 0]; + register float imap1 = imap[y * B_DIM_X + x + 1]; + register float imap2 = imap[y * B_DIM_X + x + 2]; + register float imap3 = imap[y * B_DIM_X + x + 3]; + register float imap4 = imap[y * B_DIM_X + x + 4]; + register float imap5 = imap[y * B_DIM_X + x + 5]; + register float imap6 = imap[y * B_DIM_X + x + 6]; + register float imap7 = imap[y * B_DIM_X + x + 7]; + asm volatile("": : :"memory"); + +#ifdef HB_EMUL + psum0 += imap0 * filter0; + psum1 += imap1 * filter0; + psum2 += imap2 * filter0; + psum3 += imap3 * filter0; + psum4 += imap4 * filter0; + psum5 += imap5 * filter0; + psum6 += imap6 * filter0; + psum7 += imap7 * filter0; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap3), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum4) : "f"(imap4), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum5) : "f"(imap5), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum6) : "f"(imap6), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum7) : "f"(imap7), "f"(filter0)); +#endif + omap[y * B_DIM_X + x + 0] = psum0; + omap[y * B_DIM_X + x + 1] = psum1; + omap[y * B_DIM_X + x + 2] = psum2; + omap[y * B_DIM_X + x + 3] = psum3; + omap[y * B_DIM_X + x + 4] = psum4; + omap[y * B_DIM_X + x + 5] = psum5; + omap[y * B_DIM_X + x + 6] = psum6; + omap[y * B_DIM_X + x + 7] = psum7; + } + } +} + +template +inline void conv2d_1x1_4(float* imap, float* filter, float* omap) { + for (size_t y = 0; y < B_DIM_Y; ++y) { + for (size_t x = 0; x < B_DIM_X; x += 4) { + register float psum0 = omap[y * B_DIM_X + x + 0]; + register float psum1 = omap[y * B_DIM_X + x + 1]; + register float psum2 = omap[y * B_DIM_X + x + 2]; + register float psum3 = omap[y * B_DIM_X + x + 3]; + + register float filter0 = filter[0]; + register float imap0 = imap[y * B_DIM_X + x + 0]; + register float imap1 = imap[y * B_DIM_X + x + 1]; + register float imap2 = imap[y * B_DIM_X + x + 2]; + register float imap3 = imap[y * B_DIM_X + x + 3]; + asm volatile("": : :"memory"); + +#ifdef HB_EMUL + psum0 += imap0 * filter0; + psum1 += imap1 * filter0; + psum2 += imap2 * filter0; + psum3 += imap3 * filter0; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(filter0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum3) : "f"(imap3), "f"(filter0)); +#endif + omap[y * B_DIM_X + x + 0] = psum0; + omap[y * B_DIM_X + x + 1] = psum1; + omap[y * B_DIM_X + x + 2] = psum2; + omap[y * B_DIM_X + x + 3] = psum3; + } + } +} + diff --git a/hammerblade/torch/kernel/kernel_conv_resnet.cpp b/hammerblade/torch/kernel/kernel_conv_resnet.cpp new file mode 100644 index 0000000000..1b185cc6ee --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet.cpp @@ -0,0 +1,714 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any 
other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 32 +#define BLOCK_DIM_X 16 +#define BLOCK_DIM_Y 16 +#define FILTER_DIM 3 +#define PADDING 1 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + // this is used to correct the padding output offset + imap_x = imap_x == 0 ? 0 : imap_x - PADDING; + imap_y = imap_y == 0 ? 0 : imap_y - PADDING; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X-PADDING; + size_t read_y = IMAP_DIM_Y-PADDING; + size_t block_id = block_y * 2 + block_x; + size_t W_pad = -1; + // see if we need to add padding + switch (block_id) { + case 0: + W_pad = 0; + addPaddingH_1(0); + logical_start = PADDING*IMAP_DIM_X+PADDING; + break; + case 1: + W_pad = 0; + addPaddingH_1(IMAP_DIM_X-PADDING); + logical_start = PADDING*IMAP_DIM_X; + break; + case 2: + W_pad = (IMAP_DIM_Y-PADDING)*IMAP_DIM_X; + addPaddingH_1(0); + logical_start = PADDING; + break; + case 3: + W_pad = (IMAP_DIM_Y-PADDING)*IMAP_DIM_X; + addPaddingH_1(IMAP_DIM_X-PADDING); + logical_start = 0; + break; + default: + hb_assert(false); + } + addPaddingW_1(W_pad); // top / bot padding + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + for (size_t r = 0; r < read_y; r++) { + + // unroll by IMAP_DIM_X-PADDING == 17 + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = *(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + register float tmp08 = *(imap_src_base + 8); + register float tmp09 = *(imap_src_base + 9); + register float tmp10 = *(imap_src_base + 10); + register float tmp11 = *(imap_src_base + 11); + register float tmp12 = *(imap_src_base + 12); + register float tmp13 = *(imap_src_base + 13); + register float tmp14 = *(imap_src_base + 14); + register float tmp15 = *(imap_src_base + 15); + register float tmp16 = *(imap_src_base + 16); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + imap_buf[buf_offset + 4] = tmp04; + imap_buf[buf_offset + 5] = tmp05; + imap_buf[buf_offset + 6] = tmp06; + imap_buf[buf_offset + 7] = tmp07; + imap_buf[buf_offset + 8] = tmp08; + imap_buf[buf_offset + 9] = tmp09; + 
imap_buf[buf_offset + 10] = tmp10; + imap_buf[buf_offset + 11] = tmp11; + imap_buf[buf_offset + 12] = tmp12; + imap_buf[buf_offset + 13] = tmp13; + imap_buf[buf_offset + 14] = tmp14; + imap_buf[buf_offset + 15] = tmp15; + imap_buf[buf_offset + 16] = tmp16; + + buf_offset += IMAP_DIM_X; + imap_src_base += y_step; + } + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_32x32( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id / 2; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)/2) { + if (bsg_id % 2 == 0 && idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % 
blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_3x3_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_32x32_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... 
+ // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id / 2; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)/2) { + if (bsg_id % 2 == 0 && idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_3x3_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + 
bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_32x32_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + size_t buf_offset = 0; + for (size_t row = 0; row < BLOCK_DIM_Y; row++) { + // unroll by BLOCK_DIM_X == 16 + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base 
+ 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + register float tmp08 = *(grad_src_base + 8); + register float tmp09 = *(grad_src_base + 9); + register float tmp10 = *(grad_src_base + 10); + register float tmp11 = *(grad_src_base + 11); + register float tmp12 = *(grad_src_base + 12); + register float tmp13 = *(grad_src_base + 13); + register float tmp14 = *(grad_src_base + 14); + register float tmp15 = *(grad_src_base + 15); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + grad_buf[buf_offset + 8] = tmp08; + grad_buf[buf_offset + 9] = tmp09; + grad_buf[buf_offset + 10] = tmp10; + grad_buf[buf_offset + 11] = tmp11; + grad_buf[buf_offset + 12] = tmp12; + grad_buf[buf_offset + 13] = tmp13; + grad_buf[buf_offset + 14] = tmp14; + grad_buf[buf_offset + 15] = tmp15; + + buf_offset += BLOCK_DIM_X; + grad_src_base += y_step; + } + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id / 2; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)/2) { + if (bsg_id % 2 == 0 && idx < num_blocks) { + + // figure out what we are producing + size_t filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + register float psum1 = 0; + register float psum2 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + register float grad4 = grad_row[x+4]; + register float grad5 = grad_row[x+5]; + register float grad6 = grad_row[x+6]; + register float grad7 = grad_row[x+7]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + register float imap6 = imap_row[x+6]; + register float imap7 = imap_row[x+7]; + 
register float imap8 = imap_row[x+8]; + register float imap9 = imap_row[x+9]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum1 += imap1 * grad0; + psum2 += imap2 * grad0; + + psum0 += imap1 * grad1; + psum1 += imap2 * grad1; + psum2 += imap3 * grad1; + + psum0 += imap2 * grad2; + psum1 += imap3 * grad2; + psum2 += imap4 * grad2; + + psum0 += imap3 * grad3; + psum1 += imap4 * grad3; + psum2 += imap5 * grad3; + + psum0 += imap4 * grad4; + psum1 += imap5 * grad4; + psum2 += imap6 * grad4; + + psum0 += imap5 * grad5; + psum1 += imap6 * grad5; + psum2 += imap7 * grad5; + + psum0 += imap6 * grad6; + psum1 += imap7 * grad6; + psum2 += imap8 * grad6; + + psum0 += imap7 * grad7; + psum1 += imap8 * grad7; + psum2 += imap9 * grad7; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(grad0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(grad1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(grad2)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap4), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap5), "f"(grad3)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap5), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap6), "f"(grad4)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap5), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap6), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap7), "f"(grad5)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap6), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap7), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap8), "f"(grad6)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap7), "f"(grad7)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap8), "f"(grad7)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap9), "f"(grad7)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + output[1] += psum1; + output[2] += psum2; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_32x32, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_32x32_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_32x32_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} diff --git 
a/hammerblade/torch/kernel/kernel_conv_resnet_16x16.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_16x16.cpp new file mode 100644 index 0000000000..0a871da0c9 --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_16x16.cpp @@ -0,0 +1,689 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 16 +#define BLOCK_DIM_X 16 +#define BLOCK_DIM_Y 16 +#define FILTER_DIM 3 +#define PADDING 1 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + size_t imap_x = 0; + size_t imap_y = 0; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X-2*PADDING; + size_t read_y = IMAP_DIM_Y-2*PADDING; + size_t block_id = block_y * 2 + block_x; + + addPaddingW_1(0); + addPaddingW_1((IMAP_DIM_Y-PADDING)*IMAP_DIM_X); + addPaddingH_1(0); + addPaddingH_1(IMAP_DIM_X-PADDING); + logical_start = PADDING*IMAP_DIM_X+PADDING; + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + for (size_t r = 0; r < read_y; r++) { + + // unroll by IMAP_DIM_X-2*PADDING == 16 + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = *(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + register float tmp08 = *(imap_src_base + 8); + register float tmp09 = *(imap_src_base + 9); + register float tmp10 = *(imap_src_base + 10); + register float tmp11 = *(imap_src_base + 11); + register float tmp12 = *(imap_src_base + 12); + register float tmp13 = *(imap_src_base + 13); + register float tmp14 = *(imap_src_base + 14); + register float tmp15 = *(imap_src_base + 15); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + imap_buf[buf_offset + 4] = tmp04; + imap_buf[buf_offset + 5] = tmp05; + imap_buf[buf_offset + 6] = tmp06; + imap_buf[buf_offset + 7] = tmp07; + imap_buf[buf_offset + 8] = tmp08; + imap_buf[buf_offset + 9] = tmp09; + imap_buf[buf_offset + 10] = tmp10; + imap_buf[buf_offset + 11] = tmp11; + imap_buf[buf_offset + 12] = tmp12; + imap_buf[buf_offset + 13] = tmp13; + imap_buf[buf_offset + 14] = tmp14; + imap_buf[buf_offset + 15] = 
tmp15; + + buf_offset += IMAP_DIM_X; + imap_src_base += y_step; + } + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_16x16( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + 
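+          // (imapDMA_padding fills the 18x18 imap_buf: a zero halo of width
+          // PADDING around the 16x16 block, so the 3x3 window below never
+          // reads outside the image)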
//imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_3x3_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_16x16_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... + // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + 
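+    // write-back DMA: drains the accumulated BLOCK_DIM_X x BLOCK_DIM_Y block
+    // from omap_buf into the output tensor (the input gradient), at the offset
+    // selected by the image, channel, and block indices and the tensor strides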
auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_3x3_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_16x16_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * 
BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + // fill_imap_buffer(grad_src_base, grad_buf, y_step); + size_t buf_offset = 0; + for (size_t row = 0; row < BLOCK_DIM_Y; row++) { + // unroll by BLOCK_DIM_X == 16 + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base + 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + register float tmp08 = *(grad_src_base + 8); + register float tmp09 = *(grad_src_base + 9); + register float tmp10 = *(grad_src_base + 10); + register float tmp11 = *(grad_src_base + 11); + register float tmp12 = *(grad_src_base + 12); + register float tmp13 = *(grad_src_base + 13); + register float tmp14 = *(grad_src_base + 14); + register float tmp15 = *(grad_src_base + 15); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + grad_buf[buf_offset + 8] = tmp08; + grad_buf[buf_offset + 9] = tmp09; + grad_buf[buf_offset + 10] = tmp10; + grad_buf[buf_offset + 11] = tmp11; + grad_buf[buf_offset + 12] = tmp12; + grad_buf[buf_offset + 13] = tmp13; + grad_buf[buf_offset + 14] = tmp14; + grad_buf[buf_offset + 15] = tmp15; + + buf_offset += BLOCK_DIM_X; + grad_src_base += y_step; + } + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t 
filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + register float psum1 = 0; + register float psum2 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + register float grad4 = grad_row[x+4]; + register float grad5 = grad_row[x+5]; + register float grad6 = grad_row[x+6]; + register float grad7 = grad_row[x+7]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + register float imap6 = imap_row[x+6]; + register float imap7 = imap_row[x+7]; + register float imap8 = imap_row[x+8]; + register float imap9 = imap_row[x+9]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum1 += imap1 * grad0; + psum2 += imap2 * grad0; + + psum0 += imap1 * grad1; + psum1 += imap2 * grad1; + psum2 += imap3 * grad1; + + psum0 += imap2 * grad2; + psum1 += imap3 * grad2; + psum2 += imap4 * grad2; + + psum0 += imap3 * grad3; + psum1 += imap4 * grad3; + psum2 += imap5 * grad3; + + psum0 += imap4 * grad4; + psum1 += imap5 * grad4; + psum2 += imap6 * grad4; + + psum0 += imap5 * grad5; + psum1 += imap6 * grad5; + psum2 += imap7 * grad5; + + psum0 += imap6 * grad6; + psum1 += imap7 * grad6; + psum2 += imap8 * grad6; + + psum0 += imap7 * grad7; + psum1 += imap8 * grad7; + psum2 += imap9 * grad7; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(grad0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(grad1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(grad2)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap4), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap5), "f"(grad3)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap5), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, 
%0" : "+f"(psum2) : "f"(imap6), "f"(grad4)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap5), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap6), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap7), "f"(grad5)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap6), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap7), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap8), "f"(grad6)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap7), "f"(grad7)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap8), "f"(grad7)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap9), "f"(grad7)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + output[1] += psum1; + output[2] += psum2; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_16x16, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_16x16_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_16x16_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} \ No newline at end of file diff --git a/hammerblade/torch/kernel/kernel_conv_resnet_16x16_kernel1x1.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_16x16_kernel1x1.cpp new file mode 100644 index 0000000000..b4fc3b956a --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_16x16_kernel1x1.cpp @@ -0,0 +1,562 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 16 +#define BLOCK_DIM_X 16 +#define BLOCK_DIM_Y 16 +#define FILTER_DIM 1 +#define PADDING 0 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + + size_t imap_x = 0; + size_t imap_y = 0; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X; + size_t read_y = IMAP_DIM_Y; + size_t block_id = block_y * 2 + block_x; + + logical_start = 0; + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + for (size_t r = 0; r < read_y; r++) { + + // unroll by IMAP_DIM_X-2*PADDING == 16 + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = 
*(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + register float tmp08 = *(imap_src_base + 8); + register float tmp09 = *(imap_src_base + 9); + register float tmp10 = *(imap_src_base + 10); + register float tmp11 = *(imap_src_base + 11); + register float tmp12 = *(imap_src_base + 12); + register float tmp13 = *(imap_src_base + 13); + register float tmp14 = *(imap_src_base + 14); + register float tmp15 = *(imap_src_base + 15); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + imap_buf[buf_offset + 4] = tmp04; + imap_buf[buf_offset + 5] = tmp05; + imap_buf[buf_offset + 6] = tmp06; + imap_buf[buf_offset + 7] = tmp07; + imap_buf[buf_offset + 8] = tmp08; + imap_buf[buf_offset + 9] = tmp09; + imap_buf[buf_offset + 10] = tmp10; + imap_buf[buf_offset + 11] = tmp11; + imap_buf[buf_offset + 12] = tmp12; + imap_buf[buf_offset + 13] = tmp13; + imap_buf[buf_offset + 14] = tmp14; + imap_buf[buf_offset + 15] = tmp15; + + buf_offset += IMAP_DIM_X; + imap_src_base += y_step; + } + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_16x16( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t 
omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_1x1_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_16x16_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... 
+ // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_1x1_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_16x16_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); 
// assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + // fill_imap_buffer(grad_src_base, grad_buf, y_step); + size_t buf_offset = 0; + for (size_t row = 0; row < BLOCK_DIM_Y; row++) { + // unroll by BLOCK_DIM_X == 16 + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base + 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + register float tmp08 = *(grad_src_base + 8); + register float tmp09 = *(grad_src_base + 9); + register float tmp10 = *(grad_src_base + 10); + register float tmp11 = *(grad_src_base + 11); + register float tmp12 = *(grad_src_base + 12); + register float tmp13 = *(grad_src_base + 13); + register float tmp14 = *(grad_src_base + 14); + register float tmp15 = *(grad_src_base + 15); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + grad_buf[buf_offset + 8] = tmp08; + grad_buf[buf_offset + 9] = tmp09; + grad_buf[buf_offset + 10] = tmp10; + grad_buf[buf_offset + 11] = tmp11; + grad_buf[buf_offset + 12] = tmp12; + grad_buf[buf_offset + 13] = tmp13; + grad_buf[buf_offset + 14] = tmp14; + grad_buf[buf_offset + 15] = tmp15; + + buf_offset += BLOCK_DIM_X; + grad_src_base += y_step; + } + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * 
BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + register float grad4 = grad_row[x+4]; + register float grad5 = grad_row[x+5]; + register float grad6 = grad_row[x+6]; + register float grad7 = grad_row[x+7]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + register float imap6 = imap_row[x+6]; + register float imap7 = imap_row[x+7]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum0 += imap1 * grad1; + psum0 += imap2 * grad2; + psum0 += imap3 * grad3; + psum0 += imap4 * grad4; + psum0 += imap5 * grad5; + psum0 += imap6 * grad6; + psum0 += imap7 * grad7; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap5), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap6), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap7), "f"(grad7)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_16x16, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_16x16_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_16x16_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} diff --git a/hammerblade/torch/kernel/kernel_conv_resnet_4x4.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_4x4.cpp new file mode 100644 index 0000000000..a26ac744a0 --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_4x4.cpp @@ -0,0 
+1,643 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 4 +#define BLOCK_DIM_X 4 +#define BLOCK_DIM_Y 4 +#define FILTER_DIM 3 +#define PADDING 1 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + size_t imap_x = 0; + size_t imap_y = 0; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X-2*PADDING; + size_t read_y = IMAP_DIM_Y-2*PADDING; + size_t block_id = block_y * 2 + block_x; + + addPaddingW_1(0); + addPaddingW_1((IMAP_DIM_Y-PADDING)*IMAP_DIM_X); + addPaddingH_1(0); + addPaddingH_1(IMAP_DIM_X-PADDING); + logical_start = PADDING*IMAP_DIM_X+PADDING; + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + + // 4x4 case there are 16 elements -- fully unroll + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = *(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + register float tmp08 = *(imap_src_base + 8); + register float tmp09 = *(imap_src_base + 9); + register float tmp10 = *(imap_src_base + 10); + register float tmp11 = *(imap_src_base + 11); + register float tmp12 = *(imap_src_base + 12); + register float tmp13 = *(imap_src_base + 13); + register float tmp14 = *(imap_src_base + 14); + register float tmp15 = *(imap_src_base + 15); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + buf_offset += IMAP_DIM_X; + imap_buf[buf_offset + 0] = tmp04; + imap_buf[buf_offset + 1] = tmp05; + imap_buf[buf_offset + 2] = tmp06; + imap_buf[buf_offset + 3] = tmp07; + buf_offset += IMAP_DIM_X; + imap_buf[buf_offset + 0] = tmp08; + imap_buf[buf_offset + 1] = tmp09; + imap_buf[buf_offset + 2] = tmp10; + imap_buf[buf_offset + 3] = tmp11; + buf_offset += IMAP_DIM_X; + imap_buf[buf_offset + 0] = tmp12; + imap_buf[buf_offset + 1] = tmp13; + imap_buf[buf_offset + 2] = tmp14; + imap_buf[buf_offset + 3] = tmp15; + + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + 
debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_4x4( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_3x3_4(imap_buf, filter_buf, omap_buf); + + } 
// channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_4x4_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... + // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = 
omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_3x3_4(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_4x4_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base 
+= imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + // fill_imap_buffer(grad_src_base, grad_buf, y_step); + size_t buf_offset = 0; + // 4x4 case there are 16 elements -- fully unroll + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base + 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + register float tmp08 = *(grad_src_base + 8); + register float tmp09 = *(grad_src_base + 9); + register float tmp10 = *(grad_src_base + 10); + register float tmp11 = *(grad_src_base + 11); + register float tmp12 = *(grad_src_base + 12); + register float tmp13 = *(grad_src_base + 13); + register float tmp14 = *(grad_src_base + 14); + register float tmp15 = *(grad_src_base + 15); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + grad_buf[buf_offset + 8] = tmp08; + grad_buf[buf_offset + 9] = tmp09; + grad_buf[buf_offset + 10] = tmp10; + grad_buf[buf_offset + 11] = tmp11; + grad_buf[buf_offset + 12] = tmp12; + grad_buf[buf_offset + 13] = tmp13; + grad_buf[buf_offset + 14] = tmp14; + grad_buf[buf_offset + 15] = tmp15; + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + 
imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + register float psum1 = 0; + register float psum2 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 4) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum1 += imap1 * grad0; + psum2 += imap2 * grad0; + + psum0 += imap1 * grad1; + psum1 += imap2 * grad1; + psum2 += imap3 * grad1; + + psum0 += imap2 * grad2; + psum1 += imap3 * grad2; + psum2 += imap4 * grad2; + + psum0 += imap3 * grad3; + psum1 += imap4 * grad3; + psum2 += imap5 * grad3; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(grad0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(grad1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(grad2)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap4), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap5), "f"(grad3)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + output[1] += psum1; + output[2] += psum2; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_4x4, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_4x4_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_4x4_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} diff --git a/hammerblade/torch/kernel/kernel_conv_resnet_4x4_kernel1x1.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_4x4_kernel1x1.cpp new file mode 100644 index 0000000000..08e949085e --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_4x4_kernel1x1.cpp @@ -0,0 +1,541 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile 
will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 4 +#define BLOCK_DIM_X 4 +#define BLOCK_DIM_Y 4 +#define FILTER_DIM 1 +#define PADDING 0 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + size_t imap_x = 0; + size_t imap_y = 0; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X-2*PADDING; + size_t read_y = IMAP_DIM_Y-2*PADDING; + size_t block_id = block_y * 2 + block_x; + + logical_start = 0; + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + + // 4x4 case there are 16 elements -- fully unroll + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = *(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + register float tmp08 = *(imap_src_base + 8); + register float tmp09 = *(imap_src_base + 9); + register float tmp10 = *(imap_src_base + 10); + register float tmp11 = *(imap_src_base + 11); + register float tmp12 = *(imap_src_base + 12); + register float tmp13 = *(imap_src_base + 13); + register float tmp14 = *(imap_src_base + 14); + register float tmp15 = *(imap_src_base + 15); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + buf_offset += IMAP_DIM_X; + imap_buf[buf_offset + 0] = tmp04; + imap_buf[buf_offset + 1] = tmp05; + imap_buf[buf_offset + 2] = tmp06; + imap_buf[buf_offset + 3] = tmp07; + buf_offset += IMAP_DIM_X; + imap_buf[buf_offset + 0] = tmp08; + imap_buf[buf_offset + 1] = tmp09; + imap_buf[buf_offset + 2] = tmp10; + imap_buf[buf_offset + 3] = tmp11; + buf_offset += IMAP_DIM_X; + imap_buf[buf_offset + 0] = tmp12; + imap_buf[buf_offset + 1] = tmp13; + imap_buf[buf_offset + 2] = tmp14; + imap_buf[buf_offset + 3] = tmp15; + + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_4x4( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = 
imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_1x1_4(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_4x4_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // 
number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... + // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_1x1_4(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_4x4_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + 
hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + // fill_imap_buffer(grad_src_base, grad_buf, y_step); + size_t buf_offset = 0; + // 4x4 case there are 16 elements -- fully unroll + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base + 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + register float tmp08 = *(grad_src_base + 8); + register float tmp09 = *(grad_src_base + 9); + register float tmp10 = *(grad_src_base + 10); + register float tmp11 = *(grad_src_base + 11); + register float tmp12 = *(grad_src_base + 12); + register float tmp13 = *(grad_src_base + 13); + register float tmp14 = *(grad_src_base + 14); + register float tmp15 = *(grad_src_base + 15); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + grad_buf[buf_offset + 8] = tmp08; + grad_buf[buf_offset + 9] = tmp09; + grad_buf[buf_offset + 10] = tmp10; + grad_buf[buf_offset + 11] = tmp11; + grad_buf[buf_offset + 12] = tmp12; + grad_buf[buf_offset + 13] = tmp13; + grad_buf[buf_offset + 14] = tmp14; 
+ grad_buf[buf_offset + 15] = tmp15; + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 4) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum0 += imap1 * grad1; + psum0 += imap2 * grad2; + psum0 += imap3 * grad3; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_4x4, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_4x4_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_4x4_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} diff --git a/hammerblade/torch/kernel/kernel_conv_resnet_8x8.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_8x8.cpp new file mode 100644 index 0000000000..e8886d826a --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_8x8.cpp @@ -0,0 +1,656 @@ 
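As a reading aid for the kernels below: a minimal scalar sketch of the block convolution that the forward 3x3, padding-1, stride-1 kernels in these files compute for one (image, output-channel, block) assignment. The function and parameter names here are illustrative only; the real kernels call hand-unrolled routines (conv2d_3x3_4, conv2d_3x3_16) defined in headers not shown in this diff, so this is an approximation of their behavior, not their code.

// Scalar reference for one output block of a 3x3, padding-1, stride-1 kernel.
// imap_buf: (block_dim_y + 2) x (block_dim_x + 2) scratchpad block with a zero
//           border (filled by imapDMA_padding in the kernels)
// filter_buf: 3x3 weights for one (output channel, input channel) pair
// omap_buf: block_dim_y x block_dim_x partial sums, reset once per block and
//           accumulated across input channels
static void conv_block_3x3_reference(const float* imap_buf,
                                     const float* filter_buf,
                                     float* omap_buf,
                                     int block_dim_x, int block_dim_y) {
  const int imap_dim_x = block_dim_x + 2;  // IMAP_DIM_X = BLOCK_DIM_X + FILTER_DIM - 1
  for (int y = 0; y < block_dim_y; y++) {
    for (int x = 0; x < block_dim_x; x++) {
      float acc = omap_buf[y * block_dim_x + x];  // running sum over input channels
      for (int fy = 0; fy < 3; fy++) {
        for (int fx = 0; fx < 3; fx++) {
          acc += imap_buf[(y + fy) * imap_dim_x + (x + fx)] * filter_buf[fy * 3 + fx];
        }
      }
      omap_buf[y * block_dim_x + x] = acc;
    }
  }
}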
+//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 8 +#define BLOCK_DIM_X 8 +#define BLOCK_DIM_Y 8 +#define FILTER_DIM 3 +#define PADDING 1 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + size_t imap_x = 0; + size_t imap_y = 0; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X-2*PADDING; + size_t read_y = IMAP_DIM_Y-2*PADDING; + size_t block_id = block_y * 2 + block_x; + + addPaddingW_1(0); + addPaddingW_1((IMAP_DIM_Y-PADDING)*IMAP_DIM_X); + addPaddingH_1(0); + addPaddingH_1(IMAP_DIM_X-PADDING); + logical_start = PADDING*IMAP_DIM_X+PADDING; + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + for (size_t r = 0; r < read_y; r++) { + + // unroll by IMAP_DIM_X-2*PADDING == 8 + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = *(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + imap_buf[buf_offset + 4] = tmp04; + imap_buf[buf_offset + 5] = tmp05; + imap_buf[buf_offset + 6] = tmp06; + imap_buf[buf_offset + 7] = tmp07; + + buf_offset += IMAP_DIM_X; + imap_src_base += y_step; + } + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_8x8( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of 
input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_3x3_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_8x8_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels 
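+    // note: the gradient w.r.t. the input is itself a 3x3 convolution: the
+    // incoming gradient is convolved with the 180-degree-rotated filter (see
+    // filterDMA_rotate below), and because FILTER_DIM - 1 - PADDING == 1 the
+    // required padding matches the forward pass, so the same blocking and
+    // imapDMA_padding routine are reused here.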
+ auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... + // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / 
blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_3x3_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_3x3_8x8_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(PADDING == s[0]); // assume stride == 1 + hb_assert(PADDING == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + + auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t imap_x = block_x * BLOCK_DIM_X; + size_t imap_y = block_y * BLOCK_DIM_Y; + float* imap_src_base = (float*)imap.data_ptr(); + uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + fill_imap_buffer(imap_src_base, imap_buf, y_step); + }; + + // add 1 col of zeros + auto addPaddingH_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_Y) + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + imap_buf[start] = 0; + start += IMAP_DIM_X; + } + }; + + // add 1 row of zeros + auto addPaddingW_1 = [&](size_t start) { + bsg_unroll(IMAP_DIM_X) + for (size_t c = 0; c < IMAP_DIM_X; c++) { + imap_buf[start + c] = 0; + } + }; + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t 
grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + size_t buf_offset = 0; + for (size_t row = 0; row < BLOCK_DIM_Y; row++) { + // unroll by BLOCK_DIM_X == 8 + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base + 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + + buf_offset += BLOCK_DIM_X; + grad_src_base += y_step; + } + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + register float psum1 = 0; + register float psum2 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + register float grad4 = grad_row[x+4]; + register float grad5 = grad_row[x+5]; + register float grad6 = grad_row[x+6]; + register float grad7 = grad_row[x+7]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + register float imap6 = imap_row[x+6]; + register float imap7 = imap_row[x+7]; + register float imap8 = 
imap_row[x+8]; + register float imap9 = imap_row[x+9]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum1 += imap1 * grad0; + psum2 += imap2 * grad0; + + psum0 += imap1 * grad1; + psum1 += imap2 * grad1; + psum2 += imap3 * grad1; + + psum0 += imap2 * grad2; + psum1 += imap3 * grad2; + psum2 += imap4 * grad2; + + psum0 += imap3 * grad3; + psum1 += imap4 * grad3; + psum2 += imap5 * grad3; + + psum0 += imap4 * grad4; + psum1 += imap5 * grad4; + psum2 += imap6 * grad4; + + psum0 += imap5 * grad5; + psum1 += imap6 * grad5; + psum2 += imap7 * grad5; + + psum0 += imap6 * grad6; + psum1 += imap7 * grad6; + psum2 += imap8 * grad6; + + psum0 += imap7 * grad7; + psum1 += imap8 * grad7; + psum2 += imap9 * grad7; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap1), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap2), "f"(grad0)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap2), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap3), "f"(grad1)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap3), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap4), "f"(grad2)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap4), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap5), "f"(grad3)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap5), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap6), "f"(grad4)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap5), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap6), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap7), "f"(grad5)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap6), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap7), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap8), "f"(grad6)); + + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap7), "f"(grad7)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum1) : "f"(imap8), "f"(grad7)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum2) : "f"(imap9), "f"(grad7)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + output[1] += psum1; + output[2] += psum2; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_8x8, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_8x8_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_3x3_8x8_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} \ No newline at end of file diff --git 
a/hammerblade/torch/kernel/kernel_conv_resnet_8x8_kernel1x1.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_8x8_kernel1x1.cpp new file mode 100644 index 0000000000..4e90991a71 --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_8x8_kernel1x1.cpp @@ -0,0 +1,528 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng +//==================================================================== + +#define RAW_DIM 8 +#define BLOCK_DIM_X 8 +#define BLOCK_DIM_Y 8 +#define FILTER_DIM 1 +#define PADDING 0 +#define STRIDE 1 + +#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1) +#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1) + +#include +#include + +namespace { + +inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + + // add 1 col of zeros + size_t imap_x = 0; + size_t imap_y = 0; + size_t logical_start = 0; // starting offset of imap buffer writting + size_t read_x = IMAP_DIM_X-2*PADDING; + size_t read_y = IMAP_DIM_Y-2*PADDING; + size_t block_id = block_y * 2 + block_x; + + logical_start = 0; + + float* imap_src_base = (float*)imap.data_ptr(); + const uint32_t* imap_src_strides = imap.get_strides(); + imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1]; + imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3]; + size_t y_step = imap_src_strides[2]; + + size_t buf_offset = logical_start; + for (size_t r = 0; r < read_y; r++) { + + // unroll by IMAP_DIM_X-2*PADDING == 8 + register float tmp00 = *(imap_src_base + 0); + register float tmp01 = *(imap_src_base + 1); + register float tmp02 = *(imap_src_base + 2); + register float tmp03 = *(imap_src_base + 3); + register float tmp04 = *(imap_src_base + 4); + register float tmp05 = *(imap_src_base + 5); + register float tmp06 = *(imap_src_base + 6); + register float tmp07 = *(imap_src_base + 7); + asm volatile("": : :"memory"); + imap_buf[buf_offset + 0] = tmp00; + imap_buf[buf_offset + 1] = tmp01; + imap_buf[buf_offset + 2] = tmp02; + imap_buf[buf_offset + 3] = tmp03; + imap_buf[buf_offset + 4] = tmp04; + imap_buf[buf_offset + 5] = tmp05; + imap_buf[buf_offset + 6] = tmp06; + imap_buf[buf_offset + 7] = tmp07; + + buf_offset += IMAP_DIM_X; + imap_src_base += y_step; + } + // debug + /* + size_t debug_offset = 0; + for (size_t r = 0; r < IMAP_DIM_Y; r++) { + for (size_t c = 0; c < IMAP_DIM_X; c++) { + std::cout << imap_buf[debug_offset] << " "; + debug_offset++; + } + std::cout << std::endl; + } + std::cout << std::endl; + std::cout << std::endl; + */ +} + +} // namespace + +extern "C" { + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_8x8( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM 
== Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_1x1_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_8x8_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // 
cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... + // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) { + if (idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_1x1_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_8x8_back_weight( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters 
to calculate grad for
+    auto Cout = filter.dim(1); // number of channels in the images
+    auto Hout = filter.dim(2);
+    auto Wout = filter.dim(3); // filter dimensions
+    auto N_imap = imap.dim(0); // number of images
+    auto Hin = imap.dim(2); // image dimensions
+    auto Win = imap.dim(3);
+    auto Hk = grad.dim(2); // grad dimensions
+    auto Wk = grad.dim(3);
+
+    // cross check
+    hb_assert(FILTER_DIM == Hout);
+    hb_assert(FILTER_DIM == Wout);
+    hb_assert(RAW_DIM == Hin); // assume we are doing 8x8 -> 8x8
+    hb_assert(RAW_DIM == Win);
+    hb_assert(RAW_DIM == Hk);
+    hb_assert(RAW_DIM == Wk);
+    hb_assert(PADDING == p[0]); // assume padding == 0
+    hb_assert(PADDING == p[1]);
+    hb_assert(STRIDE == s[0]); // assume stride == 1
+    hb_assert(STRIDE == s[1]);
+
+    hb_assert(Hk % BLOCK_DIM_Y == 0); // we don't have partial blocks
+    hb_assert(Wk % BLOCK_DIM_X == 0);
+
+    // Here we break grad into blocks
+    size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y;
+    size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X;
+
+    size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel;
+    size_t num_blocks = N * Cout; // parallel over filter x channel
+
+    float filter_buf[FILTER_DIM * FILTER_DIM]; // 1x1 * 4 = 4B
+    float imap_buf[IMAP_DIM_X * IMAP_DIM_Y];   // 8x8 * 4 = 256B
+    float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 8x8 * 4 = 256B
+
+    auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) {
+      size_t grad_x = block_x * BLOCK_DIM_X;
+      size_t grad_y = block_y * BLOCK_DIM_Y;
+      float* grad_src_base = (float*)grad.data_ptr();
+      uint32_t* grad_src_strides = grad.get_strides();
+      grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1];
+      grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3];
+      size_t y_step = grad_src_strides[2];
+      size_t buf_offset = 0;
+      for (size_t row = 0; row < BLOCK_DIM_Y; row++) {
+        // unroll by BLOCK_DIM_X == 8
+        register float tmp00 = *(grad_src_base + 0);
+        register float tmp01 = *(grad_src_base + 1);
+        register float tmp02 = *(grad_src_base + 2);
+        register float tmp03 = *(grad_src_base + 3);
+        register float tmp04 = *(grad_src_base + 4);
+        register float tmp05 = *(grad_src_base + 5);
+        register float tmp06 = *(grad_src_base + 6);
+        register float tmp07 = *(grad_src_base + 7);
+        asm volatile("": : :"memory");
+        grad_buf[buf_offset + 0] = tmp00;
+        grad_buf[buf_offset + 1] = tmp01;
+        grad_buf[buf_offset + 2] = tmp02;
+        grad_buf[buf_offset + 3] = tmp03;
+        grad_buf[buf_offset + 4] = tmp04;
+        grad_buf[buf_offset + 5] = tmp05;
+        grad_buf[buf_offset + 6] = tmp06;
+        grad_buf[buf_offset + 7] = tmp07;
+
+        buf_offset += BLOCK_DIM_X;
+        grad_src_base += y_step;
+      }
+    };
+
+    auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) {
+      bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr();
+      uint32_t* filter_dst_strides = filter.get_strides();
+      filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1];
+      bsg_unroll(FILTER_DIM * FILTER_DIM)
+      for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) {
+        filter_dst_base[i] = filter_buf[i];
+      }
+    };
+
+
+    bsg_cuda_print_stat_start(6);
+    bsg_saif_start();
+
+    // main loop
+    for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) {
+      if (idx < num_blocks) {
+
+        // figure out what we are producing
+        size_t filter_id = idx / Cout;
+        size_t channel_id = idx % Cout;
+
+        // reset output buffer
+        reset_buffer(filter_buf);
+
+        for (size_t image_id = 0; image_id < N_imap;
image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum0 = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + register float grad4 = grad_row[x+4]; + register float grad5 = grad_row[x+5]; + register float grad6 = grad_row[x+6]; + register float grad7 = grad_row[x+7]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + register float imap6 = imap_row[x+6]; + register float imap7 = imap_row[x+7]; + +#ifdef HB_EMUL + psum0 += imap0 * grad0; + psum0 += imap1 * grad1; + psum0 += imap2 * grad2; + psum0 += imap3 * grad3; + psum0 += imap4 * grad4; + psum0 += imap5 * grad5; + psum0 += imap6 * grad6; + psum0 += imap7 * grad7; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap4), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap5), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap6), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum0) : "f"(imap7), "f"(grad7)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum0; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_8x8, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_8x8_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_8x8_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +} diff --git a/hammerblade/torch/kernel/kernel_conv_resnet_kernel1x1.cpp b/hammerblade/torch/kernel/kernel_conv_resnet_kernel1x1.cpp new file mode 100644 index 0000000000..33ef5b35ac --- /dev/null +++ b/hammerblade/torch/kernel/kernel_conv_resnet_kernel1x1.cpp @@ -0,0 +1,558 @@ +//==================================================================== +// SPMD 2D Convolution +// Idea is that each tile will receive a piece of output image that +// does not overlap with any other tile to work on +// 10/02/2020 Lin Cheng 
+//====================================================================
+
+#define RAW_DIM 32
+#define BLOCK_DIM_X 16
+#define BLOCK_DIM_Y 16
+#define FILTER_DIM 1
+#define PADDING 0
+#define STRIDE 1
+
+#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1)
+#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1)
+
+#include 
+#include 
+
+namespace {
+
+// With FILTER_DIM == 1 and PADDING == 0 there is no halo to fill, so this
+// "padding" DMA is a straight 16x16 block copy from DRAM into the scratchpad.
+inline void imapDMA_padding(HBTensor& imap, float* imap_buf, size_t image_id, size_t channel_id, size_t block_x, size_t block_y) {
+
+  size_t imap_x = block_x * BLOCK_DIM_X;
+  size_t imap_y = block_y * BLOCK_DIM_Y;
+  size_t logical_start = 0; // starting offset of imap buffer writing
+  size_t read_x = IMAP_DIM_X - PADDING;
+  size_t read_y = IMAP_DIM_Y - PADDING;
+  size_t block_id = block_y * 2 + block_x;
+
+  float* imap_src_base = (float*)imap.data_ptr();
+  const uint32_t* imap_src_strides = imap.get_strides();
+  imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1];
+  imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3];
+  size_t y_step = imap_src_strides[2];
+
+  size_t buf_offset = logical_start;
+  for (size_t r = 0; r < read_y; r++) {
+
+    // unroll by IMAP_DIM_X - PADDING == 16
+    register float tmp00 = *(imap_src_base + 0);
+    register float tmp01 = *(imap_src_base + 1);
+    register float tmp02 = *(imap_src_base + 2);
+    register float tmp03 = *(imap_src_base + 3);
+    register float tmp04 = *(imap_src_base + 4);
+    register float tmp05 = *(imap_src_base + 5);
+    register float tmp06 = *(imap_src_base + 6);
+    register float tmp07 = *(imap_src_base + 7);
+    register float tmp08 = *(imap_src_base + 8);
+    register float tmp09 = *(imap_src_base + 9);
+    register float tmp10 = *(imap_src_base + 10);
+    register float tmp11 = *(imap_src_base + 11);
+    register float tmp12 = *(imap_src_base + 12);
+    register float tmp13 = *(imap_src_base + 13);
+    register float tmp14 = *(imap_src_base + 14);
+    register float tmp15 = *(imap_src_base + 15);
+    asm volatile("": : :"memory");
+    imap_buf[buf_offset + 0] = tmp00;
+    imap_buf[buf_offset + 1] = tmp01;
+    imap_buf[buf_offset + 2] = tmp02;
+    imap_buf[buf_offset + 3] = tmp03;
+    imap_buf[buf_offset + 4] = tmp04;
+    imap_buf[buf_offset + 5] = tmp05;
+    imap_buf[buf_offset + 6] = tmp06;
+    imap_buf[buf_offset + 7] = tmp07;
+    imap_buf[buf_offset + 8] = tmp08;
+    imap_buf[buf_offset + 9] = tmp09;
+    imap_buf[buf_offset + 10] = tmp10;
+    imap_buf[buf_offset + 11] = tmp11;
+    imap_buf[buf_offset + 12] = tmp12;
+    imap_buf[buf_offset + 13] = tmp13;
+    imap_buf[buf_offset + 14] = tmp14;
+    imap_buf[buf_offset + 15] = tmp15;
+
+    buf_offset += IMAP_DIM_X;
+    imap_src_base += y_step;
+  }
+  // debug
+  /*
+  size_t debug_offset = 0;
+  for (size_t r = 0; r < IMAP_DIM_Y; r++) {
+    for (size_t c = 0; c < IMAP_DIM_X; c++) {
+      std::cout << imap_buf[debug_offset] << " ";
+      debug_offset++;
+    }
+    std::cout << std::endl;
+  }
+  std::cout << std::endl;
+  std::cout << std::endl;
+  */
+}
+
+} // namespace
+
+extern "C" {
+
+  __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_32x32(
+          hb_tensor_t* output,
+          hb_tensor_t* input,
+          hb_tensor_t* weight,
+          hb_vector_t* padding,
+          hb_vector_t* strides) {
+
+    HBTensor omap(output);
+    HBTensor imap(input);
+    HBTensor filter(weight);
+    HBVector p(padding);
+    HBVector s(strides);
+
+    // Conv2d parameters
+    auto N = omap.dim(0); // number of images in batch
+    auto Cout = omap.dim(1); // number of output channels
+    auto Hout = omap.dim(2);
+    auto Wout = omap.dim(3);
+    auto Cin =
imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + auto filterDMA = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(4); + + bsg_saif_start(); + // main loop + for (size_t idx = bsg_id / 2; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)/2) { + if (bsg_id % 2 == 0 && idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t filter_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t channel_id = 0; channel_id < Cin; channel_id++) { + + // read in the image + //imapDMA(image_id, channel_id, block_x, block_y); + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the filter + filterDMA(filter_id, channel_id); + + // do conv + conv2d_1x1_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, filter_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(4); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_32x32_back_input( + hb_tensor_t* output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor omap(output); + HBTensor imap(input); + HBTensor filter(weight); + HBVector p(padding); + HBVector s(strides); + + // Conv2d parameters + auto N = omap.dim(0); // number of images in batch + 
auto Cout = omap.dim(1); // number of output channels + auto Hout = omap.dim(2); + auto Wout = omap.dim(3); + auto Cin = imap.dim(1); // number of input channels + auto Hin = imap.dim(2); + auto Win = imap.dim(3); + auto Hk = filter.dim(2); + auto Wk = filter.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hk); + hb_assert(FILTER_DIM == Wk); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hout); + hb_assert(RAW_DIM == Wout); + hb_assert(PADDING == p[0]); // assume padding == 0 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + // XXX: in the backward pass for input, we need to pad 3 - 1 - 1 = 1 ... + // so it happens to be the same as forward pass + + hb_assert(Hout % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wout % BLOCK_DIM_X == 0); + + size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout * blocks_per_out_channel; + + // allocate buffers + float filter_buf[FILTER_DIM * FILTER_DIM]; + float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; + + + // this one reads the filter in forward order + // then write it into SPM is rotated order + auto filterDMA_rotate = [&](size_t filter_id, size_t channel_id) { + float* filter_src_base = (float*)filter.data_ptr(); + uint32_t* filter_src_strides = filter.get_strides(); + filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1]; + fill_filter_buffer_rotate(filter_src_base, filter_buf); + }; + + + auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) { + size_t omap_x = block_x * BLOCK_DIM_X; + size_t omap_y = block_y * BLOCK_DIM_Y; + float* omap_src_base = (float*)omap.data_ptr(); + uint32_t* omap_src_strides = omap.get_strides(); + omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1]; + omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3]; + size_t y_step = omap_src_strides[2]; + drain_omap_buffer(omap_buf, omap_src_base, y_step); + }; + + bsg_cuda_print_stat_start(5); + bsg_saif_start(); + // main loop + for (size_t idx = bsg_id / 2; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)/2) { + if (bsg_id % 2 == 0 && idx < num_blocks) { + + // figure out what we are producing + size_t tmp = idx; + size_t image_id = tmp / (Cout * blocks_per_out_channel); + tmp = tmp % (Cout * blocks_per_out_channel); + size_t channel_id = tmp / blocks_per_out_channel; + tmp = tmp % blocks_per_out_channel; + size_t block_y = tmp / w_blocks_per_out_channel; + size_t block_x = tmp % w_blocks_per_out_channel; + + // reset output buffer + reset_buffer(omap_buf); + + for (size_t filter_id = 0; filter_id < Cin; filter_id++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, filter_id, block_x, block_y); + + // read in the filter + filterDMA_rotate(filter_id, channel_id); + + // do conv + conv2d_1x1_16(imap_buf, filter_buf, omap_buf); + + } // channel + + // write omap back + omapDMA(image_id, channel_id, block_x, block_y); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + bsg_cuda_print_stat_end(5); + + g_barrier.sync(); + return 0; + } + + + + + __attribute__ ((noinline)) int tensorlib_conv_resnet_32_1x1_32x32_back_weight( + hb_tensor_t* 
output, + hb_tensor_t* input, + hb_tensor_t* weight, + hb_vector_t* padding, + hb_vector_t* strides) { + + HBTensor filter(output); + HBTensor imap(input); + HBTensor grad(weight); + HBVector p(padding); + HBVector s(strides); + + // extract parameters + auto N = filter.dim(0); // number of filters to calculate grad for + auto Cout = filter.dim(1); // number of channels in the images + auto Hout = filter.dim(2); + auto Wout = filter.dim(3); // filter dimensions + auto N_imap = imap.dim(0); // number of images + auto Hin = imap.dim(2); // image dimensions + auto Win = imap.dim(3); + auto Hk = grad.dim(2); // grad dimensions + auto Wk = grad.dim(3); + + // cross check + hb_assert(FILTER_DIM == Hout); + hb_assert(FILTER_DIM == Wout); + hb_assert(RAW_DIM == Hin); // assume we are doing 32x32 -> 32x32 + hb_assert(RAW_DIM == Win); + hb_assert(RAW_DIM == Hk); + hb_assert(RAW_DIM == Wk); + hb_assert(PADDING == p[0]); // assume padding == 1 + hb_assert(PADDING == p[1]); + hb_assert(STRIDE == s[0]); // assume stride == 1 + hb_assert(STRIDE == s[1]); + + hb_assert(Hk % BLOCK_DIM_Y == 0); // we dont have partial blocks + hb_assert(Wk % BLOCK_DIM_X == 0); + + // Here we break grad into blocks + size_t h_blocks_per_out_channel = Hk / BLOCK_DIM_Y; + size_t w_blocks_per_out_channel = Wk / BLOCK_DIM_X; + + size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel; + size_t num_blocks = N * Cout; // parallel over filter x channel + + float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B + float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B + float grad_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B + + auto gradDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) { + size_t grad_x = block_x * BLOCK_DIM_X; + size_t grad_y = block_y * BLOCK_DIM_Y; + float* grad_src_base = (float*)grad.data_ptr(); + uint32_t* grad_src_strides = grad.get_strides(); + grad_src_base += image_id * grad_src_strides[0] + channel_id * grad_src_strides[1]; + grad_src_base += grad_y * grad_src_strides[2] + grad_x * grad_src_strides[3]; + size_t y_step = grad_src_strides[2]; + size_t buf_offset = 0; + for (size_t row = 0; row < BLOCK_DIM_Y; row++) { + // unroll by BLOCK_DIM_X == 16 + register float tmp00 = *(grad_src_base + 0); + register float tmp01 = *(grad_src_base + 1); + register float tmp02 = *(grad_src_base + 2); + register float tmp03 = *(grad_src_base + 3); + register float tmp04 = *(grad_src_base + 4); + register float tmp05 = *(grad_src_base + 5); + register float tmp06 = *(grad_src_base + 6); + register float tmp07 = *(grad_src_base + 7); + register float tmp08 = *(grad_src_base + 8); + register float tmp09 = *(grad_src_base + 9); + register float tmp10 = *(grad_src_base + 10); + register float tmp11 = *(grad_src_base + 11); + register float tmp12 = *(grad_src_base + 12); + register float tmp13 = *(grad_src_base + 13); + register float tmp14 = *(grad_src_base + 14); + register float tmp15 = *(grad_src_base + 15); + asm volatile("": : :"memory"); + grad_buf[buf_offset + 0] = tmp00; + grad_buf[buf_offset + 1] = tmp01; + grad_buf[buf_offset + 2] = tmp02; + grad_buf[buf_offset + 3] = tmp03; + grad_buf[buf_offset + 4] = tmp04; + grad_buf[buf_offset + 5] = tmp05; + grad_buf[buf_offset + 6] = tmp06; + grad_buf[buf_offset + 7] = tmp07; + grad_buf[buf_offset + 8] = tmp08; + grad_buf[buf_offset + 9] = tmp09; + grad_buf[buf_offset + 10] = tmp10; + grad_buf[buf_offset + 11] = tmp11; + grad_buf[buf_offset + 12] = tmp12; + grad_buf[buf_offset + 13] = tmp13; + 
grad_buf[buf_offset + 14] = tmp14; + grad_buf[buf_offset + 15] = tmp15; + + buf_offset += BLOCK_DIM_X; + grad_src_base += y_step; + } + }; + + auto filterDMA_wb = [&](size_t filter_id, size_t channel_id) { + bsg_attr_remote float* filter_dst_base = (float*)filter.data_ptr(); + uint32_t* filter_dst_strides = filter.get_strides(); + filter_dst_base += filter_id * filter_dst_strides[0] + channel_id * filter_dst_strides[1]; + bsg_unroll(FILTER_DIM * FILTER_DIM) + for (size_t i = 0; i < FILTER_DIM * FILTER_DIM; i++) { + filter_dst_base[i] = filter_buf[i]; + } + }; + + + bsg_cuda_print_stat_start(6); + bsg_saif_start(); + + // main loop + for (size_t idx = bsg_id / 2; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)/2) { + if (bsg_id % 2 == 0 && idx < num_blocks) { + + // figure out what we are producing + size_t filter_id = idx / Cout; + size_t channel_id = idx % Cout; + + // reset output buffer + reset_buffer(filter_buf); + + for (size_t image_id = 0; image_id < N_imap; image_id++) { + for (size_t block_y = 0; block_y < h_blocks_per_out_channel; block_y++) { + for (size_t block_x = 0; block_x < w_blocks_per_out_channel; block_x++) { + + // read in the image + imapDMA_padding(imap, imap_buf, image_id, channel_id, block_x, block_y); + + // read in the grad + gradDMA(image_id, filter_id, block_x, block_y); + + // do conv + // 5x5 is too large to unroll in both x and y directions + for (size_t f_y = 0; f_y < FILTER_DIM; f_y++) { + register float psum = 0; + float* imap_ptr = imap_buf + f_y * IMAP_DIM_X; + float* grad_ptr = grad_buf; + float* output = filter_buf + f_y * FILTER_DIM; + for (size_t y = 0; y < BLOCK_DIM_Y; y++) { + float *imap_row = imap_ptr; + float *grad_row = grad_ptr; + for (size_t x = 0; x < BLOCK_DIM_X; x += 8) { + register float grad0 = grad_row[x+0]; + register float grad1 = grad_row[x+1]; + register float grad2 = grad_row[x+2]; + register float grad3 = grad_row[x+3]; + register float grad4 = grad_row[x+4]; + register float grad5 = grad_row[x+5]; + register float grad6 = grad_row[x+6]; + register float grad7 = grad_row[x+7]; + + register float imap0 = imap_row[x+0]; + register float imap1 = imap_row[x+1]; + register float imap2 = imap_row[x+2]; + register float imap3 = imap_row[x+3]; + register float imap4 = imap_row[x+4]; + register float imap5 = imap_row[x+5]; + register float imap6 = imap_row[x+6]; + register float imap7 = imap_row[x+7]; + +#ifdef HB_EMUL + psum += imap0 * grad0; + psum += imap1 * grad1; + psum += imap2 * grad2; + psum += imap3 * grad3; + psum += imap4 * grad4; + psum += imap5 * grad5; + psum += imap6 * grad6; + psum += imap7 * grad7; +#else + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap0), "f"(grad0)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap1), "f"(grad1)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap2), "f"(grad2)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap3), "f"(grad3)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap4), "f"(grad4)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap5), "f"(grad5)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap6), "f"(grad6)); + asm volatile("fmadd.s %0, %1, %2, %0" : "+f"(psum) : "f"(imap7), "f"(grad7)); +#endif + } + imap_ptr += IMAP_DIM_X; + grad_ptr += BLOCK_DIM_X; + } + output[0] += psum; + } + } // block + } + } // image + + // write omap back + filterDMA_wb(filter_id, channel_id); + + } // if (idx < num_blocks) + } // main loop + + bsg_saif_end(); + 
bsg_cuda_print_stat_end(6); + + g_barrier.sync(); + return 0; + } + + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_32x32, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_32x32_back_input, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + + HB_EMUL_REG_KERNEL(tensorlib_conv_resnet_32_1x1_32x32_back_weight, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*, + hb_vector_t*, hb_vector_t*) + +}
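Reviewer note (not part of the patch): every kernel in this file walks a flat block index and decodes it back into (image, channel/filter, block_y, block_x), and the per-channel work then calls conv2d_1x1_16, which is defined in the shared conv kernel header rather than in this diff and presumably reduces, for a 1x1 filter, to omap[i] += filter[0] * imap[i] over the 16x16 block. The standalone host-side sketch below shows only the index decode with a brute-force round-trip check; the names (decode, BlockCoord) and the test sizes are illustrative assumptions, not code from the patch.

// Standalone sketch of the block-index decode used by the kernels above.
// Illustrative only; constants mirror the 1x1/32x32 case (16x16 blocks).
#include <cassert>
#include <cstddef>
#include <cstdio>

struct BlockCoord { std::size_t image, channel, block_y, block_x; };

// Mirrors the kernels' decode:
//   image_id = idx / (Cout * blocks_per_out_channel); ...
static BlockCoord decode(std::size_t idx, std::size_t Cout,
                         std::size_t h_blocks, std::size_t w_blocks) {
  const std::size_t blocks_per_out_channel = h_blocks * w_blocks;
  BlockCoord c;
  c.image   = idx / (Cout * blocks_per_out_channel);
  idx       = idx % (Cout * blocks_per_out_channel);
  c.channel = idx / blocks_per_out_channel;
  idx       = idx % blocks_per_out_channel;
  c.block_y = idx / w_blocks;
  c.block_x = idx % w_blocks;
  return c;
}

int main() {
  // A 32x32 output split into 16x16 blocks gives 2x2 blocks per channel.
  const std::size_t N = 4, Cout = 8, h_blocks = 2, w_blocks = 2;
  const std::size_t num_blocks = N * Cout * h_blocks * w_blocks;
  for (std::size_t idx = 0; idx < num_blocks; idx++) {
    BlockCoord c = decode(idx, Cout, h_blocks, w_blocks);
    // Re-encode and confirm the mapping is a bijection over [0, num_blocks).
    std::size_t back =
        ((c.image * Cout + c.channel) * h_blocks + c.block_y) * w_blocks + c.block_x;
    assert(back == idx);
  }
  std::printf("decode/encode round-trip OK for %zu blocks\n", num_blocks);
  return 0;
}

The same decode also explains the even-tile-only loops (idx = bsg_id / 2, stride of half the tile group): only tiles with bsg_id % 2 == 0 claim blocks, so the flat index space is unchanged while half the tiles stay idle.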