Adding conv layer optimizations #169

Open · wants to merge 2 commits into base: pytorch-sparse-workload
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -139,7 +139,7 @@ option(USE_HB "Use HammerBlade" ON)
option(USE_HB_EMUL "Use HammerBlade Emulation Layer" OFF)
option(PROFILE_ATEN "Turn on ATen kernel profiling" ON)
cmake_dependent_option(
- HB_REDISPATCH "Enable conditional redispatch to HB" OFF
+ HB_REDISPATCH "Enable conditional redispatch to HB" ON
"PROFILE_ATEN;USE_HB" OFF)
cmake_dependent_option(
HB_ENABLE_KERNEL_LOG "Enable HammerBlade kernel call logging" OFF
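A note on the change above: cmake_dependent_option only exposes HB_REDISPATCH when its condition "PROFILE_ATEN;USE_HB" evaluates true; otherwise the variable is forced to the trailing OFF, which is why that final OFF stays untouched while the dependent default flips to ON. A rough CMake equivalent of the new behaviour, given as a sketch rather than the macro's actual implementation, is:

if(PROFILE_ATEN AND USE_HB)
  option(HB_REDISPATCH "Enable conditional redispatch to HB" ON)
else()
  set(HB_REDISPATCH OFF CACHE INTERNAL "Enable conditional redispatch to HB")
endif()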
94 changes: 86 additions & 8 deletions aten/src/ATen/native/hammerblade/Convolution.cpp
@@ -2,6 +2,8 @@
#include <ATen/native/hammerblade/HammerBladeTensor.h>
#include <ATen/native/hammerblade/Offload.h>

#define ENABLE_SYSTOLIC 0

namespace at {
namespace native {

@@ -143,7 +145,7 @@ static void convolution_shape_check(
Tensor hb_convolution_forward(
CheckedFrom c,
const TensorArg& input, const TensorArg& weight,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation,
int64_t groups) {
checkAllSameType(c, {input, weight});
checkAllSameHB(c, {input, weight});
@@ -159,7 +161,7 @@ Tensor hb_convolution_forward(

// Avoid ambiguity of "output" when this is being used as backwards
TensorArg output{ output_t, "result", 0 };
convolution_shape_check(c, input, weight, output, padding, stride,
convolution_shape_check(c, input, weight, output, padding, stride,
dilation, groups);

Tensor weight_contig = weight->contiguous();
@@ -174,15 +176,45 @@ Tensor hb_convolution_forward(
device_args.push_back(create_device_vector(padding, true, device_ptrs));
device_args.push_back(create_device_vector(stride, true, device_ptrs));

/*
TORCH_CHECK((*input).size(3) == (*input).size(2), "we only support square imap\n");
TORCH_CHECK((*weight).size(3) == 3, "we only support 3x3 filter\n");
TORCH_CHECK((*weight).size(3) == (*weight).size(2), "we only support square filter\n");
*/

// Use a specialized ResNet kernel for 3x3 or 1x1 filters on the supported
// square input sizes; otherwise fall back to the generic convolution kernel.
std::string kernel_name;
if ((*weight).size(3) != 3 && (*weight).size(3) != 1) {
kernel_name = "tensorlib_convolution_forward";
} else {
switch((*input).size(3)) {
case 32:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32" : "tensorlib_conv_resnet_32_1x1_32x32";
break;
case 16:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16" : "tensorlib_conv_resnet_32_1x1_16x16";
break;
case 8:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8" : "tensorlib_conv_resnet_32_1x1_8x8";
break;
case 4:
kernel_name = ((*weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4" : "tensorlib_conv_resnet_32_1x1_4x4";
break;
default:
kernel_name = "tensorlib_convolution_forward";
// TORCH_CHECK(false, "we only support 32x32, 16x16, 8x8, and 4x4 imap\n");
break;
}
}

c10::hammerblade::offload_kernel(
"tensorlib_convolution_forward", device_args);
kernel_name.c_str(), device_args);
cleanup_device(device_args, device_ptrs);

return *output;
}

// In-place!
void hb_convolution_add_bias_(CheckedFrom c, const TensorArg& output,
void hb_convolution_add_bias_(CheckedFrom c, const TensorArg& output,
const TensorArg& bias) {
checkAllSameType(c, {output, bias});
checkAllSameHB(c, {output, bias});
@@ -222,8 +254,31 @@ Tensor hb_convolution_backward_input(
device_args.push_back(create_device_vector(padding, true, device_ptrs));
device_args.push_back(create_device_vector(stride, true, device_ptrs));

// Same size-based dispatch as the forward path, keyed on the gradient's
// spatial size and the filter width.
std::string kernel_name;
if (weight_contig.size(3) != 3 && weight_contig.size(3) != 1) {
kernel_name = "tensorlib_convolution_backward_input";
} else {
switch((*grad_output).size(3)) {
case 32:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32_back_input" : "tensorlib_conv_resnet_32_1x1_32x32_back_input";
break;
case 16:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16_back_input" : "tensorlib_conv_resnet_32_1x1_16x16_back_input";
break;
case 8:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8_back_input" : "tensorlib_conv_resnet_32_1x1_8x8_back_input";
break;
case 4:
kernel_name = ((weight_contig).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4_back_input" : "tensorlib_conv_resnet_32_1x1_4x4_back_input";
break;
default:
kernel_name = "tensorlib_convolution_backward_input";
break;
}
}

c10::hammerblade::offload_kernel(
"tensorlib_convolution_backward_input", device_args);
kernel_name.c_str(), device_args);
cleanup_device(device_args, device_ptrs);

return *grad_input;
@@ -249,13 +304,36 @@ Tensor hb_convolution_backward_weight(
std::vector<eva_t> device_args;
std::vector<eva_t> device_ptrs;
device_args.push_back(create_device_tensor(*grad_weight, device_ptrs));
device_args.push_back(create_device_tensor(*grad_output, device_ptrs));
device_args.push_back(create_device_tensor(*input, device_ptrs));
device_args.push_back(create_device_tensor(*grad_output, device_ptrs));
device_args.push_back(create_device_vector(padding, true, device_ptrs));
device_args.push_back(create_device_vector(stride, true, device_ptrs));

// Same size-based dispatch for the weight gradient, keyed on the input's
// spatial size and the filter width.
std::string kernel_name;
if ((*grad_weight).size(3) != 3 && (*grad_weight).size(3) != 1) {
kernel_name = "tensorlib_convolution_backward_weight";
} else {
switch((*input).size(3)) {
case 32:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_32x32_back_weight" : "tensorlib_conv_resnet_32_1x1_32x32_back_weight";
break;
case 16:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_16x16_back_weight" : "tensorlib_conv_resnet_32_1x1_16x16_back_weight";
break;
case 8:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_8x8_back_weight" : "tensorlib_conv_resnet_32_1x1_8x8_back_weight";
break;
case 4:
kernel_name = ((*grad_weight).size(3) == 3) ? "tensorlib_conv_resnet_32_3x3_4x4_back_weight" : "tensorlib_conv_resnet_32_1x1_4x4_back_weight";
break;
default:
kernel_name = "tensorlib_convolution_backward_weight";
break;
}
}

c10::hammerblade::offload_kernel(
"tensorlib_convolution_backward_weight", device_args);
kernel_name.c_str(), device_args);
cleanup_device(device_args, device_ptrs);

return grad_weight_t;
@@ -275,7 +353,7 @@ Tensor hb_convolution_transpose(

Tensor hb_convolution(
const Tensor& input_t, const Tensor& weight_t, const Tensor& bias_t,
IntArrayRef padding, IntArrayRef stride,
IntArrayRef padding, IntArrayRef stride,
IntArrayRef dilation, int64_t groups) {
TensorArg input { input_t, "input", 1 },
weight { weight_t, "weight", 2 },
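The same size-based kernel selection appears three times above (forward, backward input, backward weight). One possible cleanup, sketched here under the assumption that the kernel names stay exactly as spelled in this diff, is a small helper that assembles the name from the filter width, the spatial size, and a per-direction suffix; resnet_kernel_name below is illustrative and not part of the PR:

// Illustrative only: builds "tensorlib_conv_resnet_32_{3x3|1x1}_{NxN}{suffix}"
// for N in {32, 16, 8, 4}, or returns the generic fallback kernel name.
static std::string resnet_kernel_name(
    int64_t filter_dim, int64_t spatial_dim,
    const std::string& suffix, const std::string& fallback) {
  if ((filter_dim != 3 && filter_dim != 1) ||
      (spatial_dim != 32 && spatial_dim != 16 &&
       spatial_dim != 8 && spatial_dim != 4)) {
    return fallback;
  }
  const std::string size = std::to_string(spatial_dim);
  const std::string filter = (filter_dim == 3) ? "3x3" : "1x1";
  return "tensorlib_conv_resnet_32_" + filter + "_" + size + "x" + size + suffix;
}

With such a helper the forward path would reduce to resnet_kernel_name((*weight).size(3), (*input).size(3), "", "tensorlib_convolution_forward"), and the two backward paths would pass "_back_input" and "_back_weight" as the suffix.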
140 changes: 140 additions & 0 deletions hammerblade/torch/kernel/kernel_conv_baseline.cpp
@@ -0,0 +1,140 @@
//====================================================================
// SPMD 2D Convolution
// The idea is that each tile receives a piece of the output image that
// does not overlap with any other tile's work
// 10/02/2020 Lin Cheng
//====================================================================

#define BLOCK_DIM_X 14
#define BLOCK_DIM_Y 14
#define FILTER_DIM 5
#define NUM_FILTERS 6

#define IMAP_DIM_X (BLOCK_DIM_X + FILTER_DIM - 1)
#define IMAP_DIM_Y (BLOCK_DIM_Y + FILTER_DIM - 1)

#include <kernel_common.hpp>
#include <kernel_conv_baseline.hpp>


extern "C" {

__attribute__ ((noinline)) int tensorlib_conv_baseline(
hb_tensor_t* output,
hb_tensor_t* input,
hb_tensor_t* weight,
hb_vector_t* padding,
hb_vector_t* strides) {

HBTensor<float, 4> omap(output);
HBTensor<float, 4> imap(input);
HBTensor<float, 4> filter(weight);

// Conv2d parameters
auto N = omap.dim(0); // number of images in batch
auto Cout = omap.dim(1); // number of output channels
auto Hout = omap.dim(2);
auto Wout = omap.dim(3);
auto Cin = imap.dim(1); // number of input channels
auto Hin = imap.dim(2);
auto Win = imap.dim(3);
auto Hk = filter.dim(2);
auto Wk = filter.dim(3);

size_t h_blocks_per_out_channel = Hout / BLOCK_DIM_Y;
size_t w_blocks_per_out_channel = Wout / BLOCK_DIM_X;
if (Hout % BLOCK_DIM_Y != 0) {
h_blocks_per_out_channel++;
}
if (Wout % BLOCK_DIM_X != 0) {
w_blocks_per_out_channel++;
}
size_t blocks_per_out_channel = h_blocks_per_out_channel * w_blocks_per_out_channel;
size_t num_blocks = N * Cout * blocks_per_out_channel;

float filter_buf[FILTER_DIM * FILTER_DIM]; // 5x5 * 4 = 100B
float omap_buf[BLOCK_DIM_X * BLOCK_DIM_Y]; // 14x14 * 4 = 784B
float imap_buf[IMAP_DIM_X * IMAP_DIM_Y]; // 18x18 * 4 = 1296B

// cross check
hb_assert(FILTER_DIM == Hk);
hb_assert(FILTER_DIM == Wk);
hb_assert(NUM_FILTERS == Cout);


auto filterDMA = [&](size_t filter_id, size_t channel_id) {
float* filter_src_base = (float*)filter.data_ptr();
uint32_t* filter_src_strides = filter.get_strides();
filter_src_base += filter_id * filter_src_strides[0] + channel_id * filter_src_strides[1];
fill_filter_buffer<FILTER_DIM>(filter_src_base, filter_buf);
};

auto imapDMA = [&](size_t image_id, size_t channel_id, size_t block_x, size_t block_y) {
size_t imap_x = block_x * BLOCK_DIM_X;
size_t imap_y = block_y * BLOCK_DIM_Y;
float* imap_src_base = (float*)imap.data_ptr();
uint32_t* imap_src_strides = imap.get_strides();
imap_src_base += image_id * imap_src_strides[0] + channel_id * imap_src_strides[1];
imap_src_base += imap_y * imap_src_strides[2] + imap_x * imap_src_strides[3];
size_t y_step = imap_src_strides[2];
fill_imap_buffer<IMAP_DIM_X, IMAP_DIM_Y>(imap_src_base, imap_buf, y_step);
};

auto omapDMA = [&](size_t image_id, size_t filter_id, size_t block_x, size_t block_y) {
size_t omap_x = block_x * BLOCK_DIM_X;
size_t omap_y = block_y * BLOCK_DIM_Y;
float* omap_src_base = (float*)omap.data_ptr();
uint32_t* omap_src_strides = omap.get_strides();
omap_src_base += image_id * omap_src_strides[0] + filter_id * omap_src_strides[1];
omap_src_base += omap_y * omap_src_strides[2] + omap_x * omap_src_strides[3];
size_t y_step = omap_src_strides[2];
drain_omap_buffer<BLOCK_DIM_X, BLOCK_DIM_Y>(omap_buf, omap_src_base, y_step);
};

bsg_cuda_print_stat_kernel_start();

// main loop
for (size_t idx = bsg_id; idx < num_blocks; idx += (BSG_TILE_GROUP_X_DIM * BSG_TILE_GROUP_Y_DIM)) {
if (idx < num_blocks) {

// figure out what we are producing
size_t tmp = idx;
size_t image_id = tmp / (Cout * blocks_per_out_channel);
tmp = tmp % (Cout * blocks_per_out_channel);
size_t filter_id = tmp / blocks_per_out_channel;
tmp = tmp % blocks_per_out_channel;
size_t block_y = tmp / w_blocks_per_out_channel;
size_t block_x = tmp % w_blocks_per_out_channel;

// reset output buffer
reset_buffer<BLOCK_DIM_X, BLOCK_DIM_Y>(omap_buf);

for (size_t channel_id = 0; channel_id < Cin; channel_id++) {

// read in the image
imapDMA(image_id, channel_id, block_x, block_y);

// read in the filter
filterDMA(filter_id, channel_id);

// do conv
conv2d_5x5(imap_buf, filter_buf, omap_buf);

} // channel

// write omap back
omapDMA(image_id, filter_id, block_x, block_y);

} // if (idx < num_blocks)
} // main loop

bsg_cuda_print_stat_kernel_end();

g_barrier.sync();
return 0;
}

HB_EMUL_REG_KERNEL(tensorlib_conv_baseline, hb_tensor_t*, hb_tensor_t*, hb_tensor_t*,
hb_vector_t*, hb_vector_t*)

}
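The helpers used above (fill_filter_buffer, fill_imap_buffer, drain_omap_buffer, reset_buffer, conv2d_5x5) live in kernel_conv_baseline.hpp, which is not part of this diff. For orientation, the compute step is an ordinary direct convolution of the buffered 18x18 input block against one 5x5 filter, accumulated into the 14x14 output block; the routine below is a minimal sketch of that idea, not the header's actual implementation:

// Sketch only: accumulate one input channel's contribution into the output block.
// imap_buf is IMAP_DIM_Y x IMAP_DIM_X (18x18), filter_buf is FILTER_DIM x FILTER_DIM (5x5),
// omap_buf is BLOCK_DIM_Y x BLOCK_DIM_X (14x14); all buffers are row-major.
inline void conv2d_5x5_sketch(const float* imap_buf, const float* filter_buf, float* omap_buf) {
  for (int y = 0; y < BLOCK_DIM_Y; y++) {
    for (int x = 0; x < BLOCK_DIM_X; x++) {
      float acc = omap_buf[y * BLOCK_DIM_X + x];
      for (int ky = 0; ky < FILTER_DIM; ky++) {
        for (int kx = 0; kx < FILTER_DIM; kx++) {
          acc += imap_buf[(y + ky) * IMAP_DIM_X + (x + kx)]
               * filter_buf[ky * FILTER_DIM + kx];
        }
      }
      omap_buf[y * BLOCK_DIM_X + x] = acc;
    }
  }
}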