Skip to content

Commit

Permalink
Add dynamic shared memory allocation
Browse files Browse the repository at this point in the history
  • Loading branch information
michael-kenzel committed Aug 24, 2023
1 parent 17b1fb7 commit d8902e2
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 11 deletions.
14 changes: 10 additions & 4 deletions platforms/artic/intrinsics_thorin.impala
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
#[import(cc = "thorin")] fn cmpxchg_weak[T](_addr: &mut T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool); // only for integer data types
#[import(cc = "thorin")] fn fence(_order: u32, _scope: &[u8]) -> ();
#[import(cc = "thorin")] fn pe_info[T](_src: &[u8], _val: T) -> ();
#[import(cc = "thorin")] fn cuda(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn nvvm(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn opencl(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn amdgpu(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _body: fn() -> ()) -> ();
// Kernel launch intrinsics that take an additional `_lmem: i32` parameter between the
// block dimensions and the kernel body. Each one is imported under the plain thorin
// intrinsic name ("cuda", "nvvm", "opencl", "amdgpu") through the `name = ...` attribute.
// NOTE(review): `_lmem` is presumably the dynamic local/shared memory size in bytes — confirm against the runtime.
#[import(cc = "thorin", name = "cuda")] fn cuda_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin", name = "nvvm")] fn nvvm_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin", name = "opencl")] fn opencl_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin", name = "amdgpu")] fn amdgpu_with_lmem(_dev: i32, _grid: (i32, i32, i32), _block: (i32, i32, i32), _lmem: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin")] fn local_memory() -> &mut addrspace(3)[u8];
#[import(cc = "thorin")] fn reserve_shared[T](_size: i32) -> &mut addrspace(3)[T];
#[import(cc = "thorin")] fn hls(_dev: i32, _body: fn() -> ()) -> ();
#[import(cc = "thorin", name = "pipeline")] fn thorin_pipeline(_initiation_interval: i32, _lower: i32, _upper: i32, _body: fn(i32) -> ()) -> (); // only for HLS/OpenCL backend
Expand All @@ -35,6 +36,11 @@
#[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p1[T](_addr: &mut addrspace(1)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);
#[import(cc = "thorin", name = "cmpxchg_weak")] fn cmpxchg_weak_p3[T](_addr: &mut addrspace(3)T, _cmp: T, _new: T, _success_order: u32, _failure_order: u32, _scope: &[u8]) -> (T, bool);

// Four-argument launchers (device, grid, block, body). Each one forwards to the
// matching `*_with_lmem` intrinsic and passes 0 for its `_lmem` argument.
fn @cuda(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = cuda_with_lmem(dev, grid, block, 0, body);
fn @nvvm(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = nvvm_with_lmem(dev, grid, block, 0, body);
fn @opencl(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = opencl_with_lmem(dev, grid, block, 0, body);
fn @amdgpu(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) = amdgpu_with_lmem(dev, grid, block, 0, body);

// Wrappers that take the body first and return a function over the remaining
// parameters; that function passes its arguments plus `body` to the thorin_* intrinsic.
fn @pipeline(body: fn(i32) -> ()) = @|initiation_interval: i32, lower: i32, upper: i32| thorin_pipeline(initiation_interval, lower, upper, body);
fn @parallel(body: fn(i32) -> ()) = @|num_threads: i32, lower: i32, upper: i32| thorin_parallel(num_threads, lower, upper, body);
fn @spawn(body: fn() -> ()) = @|| thorin_spawn(body);
14 changes: 10 additions & 4 deletions platforms/impala/intrinsics_thorin.impala
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ extern "thorin" {
fn insert[T, U](T, i32, U) -> T;
//fn shuffle[T](T, T, T) -> T;

fn cuda(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
fn nvvm(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
fn opencl(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
fn amdgpu(i32, (i32, i32, i32), (i32, i32, i32), fn() -> ()) -> ();
// Launch intrinsics bound to the thorin names "cuda", "nvvm", "opencl" and "amdgpu".
// Arguments: device, grid dimensions, block dimensions, an extra i32, and the kernel body.
// NOTE(review): the extra i32 is presumably the dynamic local/shared memory size in bytes — confirm against the runtime.
fn "cuda" cuda_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> ();
fn "nvvm" nvvm_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> ();
fn "opencl" opencl_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> ();
fn "amdgpu" amdgpu_with_lmem(i32, (i32, i32, i32), (i32, i32, i32), i32, fn() -> ()) -> ();
fn local_memory() -> &mut[3][u8];
fn reserve_shared[T](i32) -> &mut[3][T];

fn hls(dev: i32, body: fn() -> ()) -> ();
Expand Down Expand Up @@ -42,3 +43,8 @@ extern "thorin" {

fn vectorize(vector_length: i32, body: fn(i32) -> ()) -> ();
}

// Four-argument launchers (device, grid, block, body). Each one calls the matching
// *_with_lmem intrinsic and passes 0 as its extra i32 argument.
fn @@cuda(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { cuda_with_lmem(dev, grid, block, 0, body) }
fn @@nvvm(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { nvvm_with_lmem(dev, grid, block, 0, body) }
fn @@opencl(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { opencl_with_lmem(dev, grid, block, 0, body) }
fn @@amdgpu(dev: i32, grid: (i32, i32, i32), block: (i32, i32, i32), body: fn() -> ()) { amdgpu_with_lmem(dev, grid, block, 0, body) }
2 changes: 2 additions & 0 deletions src/anydsl_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ void anydsl_copy(
void anydsl_launch_kernel(
int32_t mask, const char* file_name, const char* kernel_name,
const uint32_t* grid, const uint32_t* block,
uint32_t lmem,
void** arg_data,
const uint32_t* arg_sizes,
const uint32_t* arg_aligns,
Expand All @@ -128,6 +129,7 @@ void anydsl_launch_kernel(
kernel_name,
grid,
block,
lmem,
{
arg_data,
arg_sizes,
Expand Down
1 change: 1 addition & 0 deletions src/anydsl_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ AnyDSL_runtime_API void anydsl_copy(int32_t, const void*, int64_t, int32_t, void
// Launches a kernel. The parameters are unnamed here; the definition names the leading
// ones mask, file_name, kernel_name, grid, block, lmem, arg_data, arg_sizes, arg_aligns.
AnyDSL_runtime_API void anydsl_launch_kernel(
int32_t, const char*, const char*,
const uint32_t*, const uint32_t*,
uint32_t, // lmem
void**, const uint32_t*, const uint32_t*, const uint32_t*, const uint8_t*,
uint32_t);
AnyDSL_runtime_API void anydsl_synchronize(int32_t);
Expand Down
4 changes: 3 additions & 1 deletion src/cuda_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,9 @@ void CudaPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_params
launch_params.grid[1] / launch_params.block[1],
launch_params.grid[2] / launch_params.block[2],
launch_params.block[0], launch_params.block[1], launch_params.block[2],
0, nullptr, launch_params.args.data, nullptr);
launch_params.lmem,
nullptr,
launch_params.args.data, nullptr);
CHECK_CUDA(err, "cuLaunchKernel()");

if (runtime_->profiling_enabled()) {
Expand Down
2 changes: 1 addition & 1 deletion src/hsa_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ void HSAPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_params)
// Copy the per-kernel values from kernel_info into the packet fields of `aql`.
aql.kernel_object = kernel_info.kernel;
aql.kernarg_address = kernel_info.kernarg_segment;
aql.private_segment_size = kernel_info.private_segment_size;
// Group segment size = the kernel's own group segment size rounded up to the next
// multiple of 16 bytes, plus the lmem bytes requested for this launch.
// The round-up must multiply by 16; multiplying by the size itself would make the
// request grow quadratically instead of aligning it.
aql.group_segment_size = (kernel_info.group_segment_size + 15) / 16 * 16 + launch_params.lmem;

// write to command queue
const uint64_t index = hsa_queue_load_write_index_relaxed(queue);
Expand Down
8 changes: 7 additions & 1 deletion src/opencl_platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,8 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para
cl_mem struct_buf = clCreateBuffer(devices_[dev].ctx, flags, launch_params.args.sizes[i], launch_params.args.data[i], &err);
CHECK_OPENCL(err, "clCreateBuffer()");
kernel_structs[i] = struct_buf;
clSetKernelArg(kernel, i, sizeof(cl_mem), &kernel_structs[i]);
err = clSetKernelArg(kernel, i, sizeof(cl_mem), &kernel_structs[i]);
CHECK_OPENCL(err, "clSetKernelArg()");
} else {
#ifdef CL_VERSION_2_0
if (launch_params.args.types[i] == KernelArgType::Ptr && devices_[dev].version_major == 2) {
Expand All @@ -391,6 +392,11 @@ void OpenCLPlatform::launch_kernel(DeviceId dev, const LaunchParams& launch_para
}
}

// When lmem is non-zero, set one extra kernel argument at index num_args with
// arg_size = lmem and a null arg_value. In the OpenCL API this form (null value,
// non-zero size) is how the byte size of a __local pointer argument is specified.
// NOTE(review): assumes the kernel declares such a trailing local argument whenever
// lmem != 0, and does not declare it when lmem == 0 — confirm with the code generator.
if (launch_params.lmem != 0) {
cl_int err = clSetKernelArg(kernel, launch_params.num_args, launch_params.lmem, nullptr);
CHECK_OPENCL(err, "clSetKernelArg()");
}

size_t global_work_size[] = {launch_params.grid [0], launch_params.grid [1], launch_params.grid [2]};
size_t local_work_size[] = {launch_params.block[0], launch_params.block[1], launch_params.block[2]};

Expand Down
1 change: 1 addition & 0 deletions src/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct LaunchParams {
const char* kernel_name;
const uint32_t* grid;
const uint32_t* block;
uint32_t lmem;
struct {
void** data;
const uint32_t* sizes;
Expand Down

0 comments on commit d8902e2

Please sign in to comment.