From 5c0328422235823ff2b4f78fe703c0b2a06bdd66 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Tue, 15 Oct 2024 10:38:49 +0300 Subject: [PATCH] Device-side malloc --- bitcode/CMakeLists.txt | 2 +- bitcode/devicelib.cl | 2 - bitcode/malloc.cl | 170 ++++++++++++++++++ include/hip/spirv_hip.hh | 3 + include/hip/spirv_hip_devicelib.hh | 14 +- samples/CMakeLists.txt | 1 + samples/hipDeviceMalloc/CMakeLists.txt | 8 + samples/hipDeviceMalloc/hipDeviceMalloc.hip | 183 ++++++++++++++++++++ src/CHIPBackend.cc | 20 +++ src/SPVRegister.cc | 7 +- src/common.hh | 3 + 11 files changed, 403 insertions(+), 10 deletions(-) create mode 100644 bitcode/malloc.cl create mode 100644 samples/hipDeviceMalloc/CMakeLists.txt create mode 100644 samples/hipDeviceMalloc/hipDeviceMalloc.hip diff --git a/bitcode/CMakeLists.txt b/bitcode/CMakeLists.txt index f9ba078c3..ff2b45d82 100644 --- a/bitcode/CMakeLists.txt +++ b/bitcode/CMakeLists.txt @@ -57,7 +57,7 @@ set(BITCODE_C_COMPILE_FLAGS -emit-llvm ${EXTRA_FLAGS}) # non-OCML sources -set(NON_OCML_SOURCES "devicelib" "_cl_print_str" "texture") # "printf_support" +set(NON_OCML_SOURCES "devicelib" "_cl_print_str" "texture" "malloc") # "printf_support" # Compiles SOURCE treated as OpenCL to LLVM bitcode. function(add_opencl_bitcode SOURCE OUTPUT) diff --git a/bitcode/devicelib.cl b/bitcode/devicelib.cl index a24abd49d..b43561b3f 100644 --- a/bitcode/devicelib.cl +++ b/bitcode/devicelib.cl @@ -43,8 +43,6 @@ #error __opencl_c_generic_address_space needed! #endif -NOOPT void* device_malloc(unsigned int size) {return (void*)0;}; -NOOPT void device_free(void* ptr) {}; // Given a 32/64-bit value exec mask and an integer value base (between 0 and WAVEFRONT_SIZE), // find the n-th (given by offset) set bit in the exec mask from the base bit, and return the bit position. diff --git a/bitcode/malloc.cl b/bitcode/malloc.cl new file mode 100644 index 000000000..acb760b89 --- /dev/null +++ b/bitcode/malloc.cl @@ -0,0 +1,170 @@ +#define CL_TARGET_OPENCL_VERSION 200 +#include + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable + +#define ALIGNMENT 16 +#define ALIGN_SIZE(size) (((size) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1)) +#define DEVICE_HEAP_SIZE (1024 * 1024) // 1MB heap + +__global uchar* __chipspv_device_heap; + + +void __chip_init_device_heap(uchar* device_heap) { + __chipspv_device_heap = (__global uchar*)device_heap; +} + +// Structure for the header of each block in the heap +typedef struct { + int size; // Size of the block + int used; // Flag indicating if the block is used (1) or free (0) +} block_header_t; + +void lock(__global volatile atomic_int* mutex) { + int attempts = 0; + int backoff = 1; + do { + if (atomic_exchange_explicit(mutex, 1, memory_order_acquire, memory_scope_device) == 0) { + return; // Lock acquired + } + for (int i = 0; i < backoff; i++) { + barrier(CLK_LOCAL_MEM_FENCE); + } + attempts++; + backoff = min(backoff * 2, 1024); + } while (attempts < 100); +} + +void unlock(__global volatile atomic_int* mutex) { + atomic_store_explicit(mutex, 0, memory_order_release, memory_scope_device); +} + +// Add these debug macros +// #define DEBUG_PRINT(fmt, ...) printf("[DEBUG] " fmt "\n", ##__VA_ARGS__) +// #define ERROR_PRINT(fmt, ...) printf("[ERROR] " fmt "\n", ##__VA_ARGS__) +#define DEBUG_PRINT(fmt, ...) +#define ERROR_PRINT(fmt, ...) + + + +void* __chip_malloc(unsigned int size) { + __global void* result = NULL; + + // Ensure only the first thread in the 3D workgroup performs malloc + if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0) { + // Ensure the size is aligned + size = ALIGN_SIZE(size); + + // Pointers to the mutex and initialization flag + __global volatile atomic_int* mutex = (__global volatile atomic_int*)&__chipspv_device_heap[0]; + __global int* initialized = (__global int*)&__chipspv_device_heap[sizeof(atomic_int)]; + + // Pointer to the start of the heap + __global uchar* heap = (__global uchar*)__chipspv_device_heap + sizeof(atomic_int) + sizeof(int); + int real_heap_size = DEVICE_HEAP_SIZE - sizeof(atomic_int) - sizeof(int); + + lock(mutex); + + // Initialize the heap if not already done + if (*initialized == 0) { + __global block_header_t* first_header = (__global block_header_t*)heap; + first_header->size = real_heap_size - sizeof(block_header_t); + first_header->used = 0; + *initialized = 1; + DEBUG_PRINT("Heap initialized with size %d", first_header->size); + } + + // Start of malloc algorithm + __global uchar* heap_end = heap + real_heap_size; + __global uchar* ptr = heap; + + DEBUG_PRINT("Attempting to allocate %zu bytes", size); + + while (ptr + sizeof(block_header_t) <= heap_end) { + __global block_header_t* header = (__global block_header_t*)ptr; + + // Add error checking + if (header == NULL) { + ERROR_PRINT("Invalid header pointer"); + break; + } + + DEBUG_PRINT("Checking block at %p, size: %d, used: %d", (void*)ptr, header->size, header->used); + + if (header->used == 0 && header->size >= size) { + // Found a suitable block + int remaining_size = header->size - size - sizeof(block_header_t); + if (remaining_size > ALIGNMENT) { + // Split the block + __global uchar* next_block_ptr = ptr + sizeof(block_header_t) + size; + __global block_header_t* next_header = (__global block_header_t*)next_block_ptr; + next_header->size = remaining_size; + next_header->used = 0; + + header->size = size; + DEBUG_PRINT("Split block. New block at %p with size %d", next_block_ptr, remaining_size); + } + header->used = 1; + result = ptr + sizeof(block_header_t); + DEBUG_PRINT("Allocated block at %p with size %d", result, size); + break; + } + // Move to the next block + ptr = ptr + sizeof(block_header_t) + header->size; + } + + if (result == NULL) { + // No suitable block found + ERROR_PRINT("device_malloc: Out of memory"); + } + + unlock(mutex); + } + + // Broadcast the result to all threads in the workgroup + result = (__global void*)work_group_broadcast((uintptr_t)result, 0); + return result; +} + +void __chip_free(void* ptr) { + if (ptr == NULL) return; + + // Ensure only the first thread in the 3D workgroup performs free + if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0) { + uchar* device_heap = (__global uchar*)__chipspv_device_heap; + __global volatile atomic_int* mutex = (__global volatile atomic_int*)&device_heap[0]; + lock(mutex); + + __global block_header_t* header = (__global block_header_t*)(((__global uchar*)ptr) - sizeof(block_header_t)); + + if (header->used) { + header->used = 0; + DEBUG_PRINT("Freed block at %p with size %d", ptr, header->size); + + // Attempt to coalesce with next block if it's free + __global block_header_t* next_header = (__global block_header_t*)((__global uchar*)ptr + header->size); + if (((__global uchar*)next_header < device_heap + DEVICE_HEAP_SIZE) && !next_header->used) { + header->size += sizeof(block_header_t) + next_header->size; + DEBUG_PRINT("Coalesced with next block, new size: %d", header->size); + } + + // Attempt to coalesce with previous block if it's free + __global block_header_t* prev_header = (__global block_header_t*)device_heap; + while (((__global uchar*)prev_header + sizeof(block_header_t) + prev_header->size) < (__global uchar*)header) { + if (!prev_header->used && ((__global uchar*)prev_header + sizeof(block_header_t) + prev_header->size == (__global uchar*)header)) { + prev_header->size += sizeof(block_header_t) + header->size; + DEBUG_PRINT("Coalesced with previous block, new size: %d", prev_header->size); + break; + } + prev_header = (__global block_header_t*)((__global uchar*)prev_header + sizeof(block_header_t) + prev_header->size); + } + } else { + ERROR_PRINT("Attempted to free an already free block at %p", ptr); + } + + unlock(mutex); + } +} diff --git a/include/hip/spirv_hip.hh b/include/hip/spirv_hip.hh index e71c0e32c..f96a76660 100644 --- a/include/hip/spirv_hip.hh +++ b/include/hip/spirv_hip.hh @@ -46,6 +46,9 @@ extern "C" { // A global flag included in all HIP device modules for signaling // abort request. __attribute__((weak)) __device__ int32_t __chipspv_abort_called; + +// Global pointer for the device heap for device-side malloc/free +__attribute__((weak)) __device__ void* __chipspv_device_heap; __device__ void __chipspv_abort(int32_t *abort_flag); diff --git a/include/hip/spirv_hip_devicelib.hh b/include/hip/spirv_hip_devicelib.hh index 5116db729..f1bb40d76 100644 --- a/include/hip/spirv_hip_devicelib.hh +++ b/include/hip/spirv_hip_devicelib.hh @@ -73,10 +73,16 @@ THE SOFTWARE. #pragma push_macro("__HIP_OVERLOAD") #pragma push_macro("__HIP_OVERLOAD2") -__device__ void *device_malloc(unsigned int size); -__device__ void device_free(void *ptr); -EXPORT void *malloc(size_t size) { return device_malloc(size); } -EXPORT void free(void *ptr) { device_free(ptr); }; +extern "C" __device__ void * __chip_malloc(unsigned int size); +extern "C" __device__ void __chip_free(void *ptr); +extern "C" __device__ void __chip_init_device_heap(void* device_heap); + +EXPORT void * malloc(unsigned int size) { + return __chip_malloc(size); +} +EXPORT void free(void *ptr) { + __chip_free(ptr); +} // __hip_enable_if::type is a type function which returns __T if __B is true. template struct __hip_enable_if {}; diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 2fc7aafca..b5c6bdc40 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -131,6 +131,7 @@ set(SAMPLES ccompat hipComplex hipHostMallocSample + hipDeviceMalloc ) if (NOT "${DETECTED_ARCHITECTURE}" STREQUAL "riscv64") diff --git a/samples/hipDeviceMalloc/CMakeLists.txt b/samples/hipDeviceMalloc/CMakeLists.txt new file mode 100644 index 000000000..586f6f245 --- /dev/null +++ b/samples/hipDeviceMalloc/CMakeLists.txt @@ -0,0 +1,8 @@ +add_chip_binary(hipDeviceMalloc hipDeviceMalloc.hip) + +add_test(NAME hipDeviceMalloc + COMMAND ${SKIP_DOUBLE_TESTS} "${CMAKE_CURRENT_BINARY_DIR}/hipDeviceMalloc") + +set_tests_properties(hipDeviceMalloc PROPERTIES + PASS_REGULAR_EXPRESSION PASSED + SKIP_REGULAR_EXPRESSION "HIP_SKIP_THIS_TEST") diff --git a/samples/hipDeviceMalloc/hipDeviceMalloc.hip b/samples/hipDeviceMalloc/hipDeviceMalloc.hip new file mode 100644 index 000000000..1d77a90e9 --- /dev/null +++ b/samples/hipDeviceMalloc/hipDeviceMalloc.hip @@ -0,0 +1,183 @@ +#include +#include +#include + +#define DEVICE_HEAP_SIZE 1024 * 1024 // 1 MB + +#define HIP_CHECK(call) \ + do { \ + hipError_t err = call; \ + if (err != hipSuccess) { \ + printf("HIP error %s:%d: '%s'\n", __FILE__, __LINE__, \ + hipGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// __global__ void init_device_heap_kernel(void* device_heap) { +// __chip_init_device_heap(device_heap); +// } + +__global__ void dynamicAllocationKernel(int* result, int size, bool* alloc_success) +{ + // Allocate memory dynamically on the device + int* data = (int*)(malloc(size * sizeof(int))); // Cast size to 64-bit + + if (data == NULL) { + *alloc_success = false; + return; + } + + *alloc_success = true; + + // Initialize the allocated memory + for (int i = 0; i < size; i++) { + data[i] = i; + } + + // Compute a checksum of the array + int checksum = 0; + for (int i = 0; i < size; i++) { + checksum ^= data[i]; // XOR operation to compute checksum + } + + // Store the result + atomicXor(result, checksum); // Use atomicXor for checksum + + // Free the dynamically allocated memory + free(data); +} + +__global__ void stressTestKernel(int* result, int size, int iterations, bool* alloc_success, int* debug_info) +{ + for (int i = 0; i < iterations; i++) { + int* data = (int*)malloc(size * sizeof(int)); + if (data == NULL) { + printf("Device-side malloc failed in stress test, iteration %d\n", i); + *alloc_success = false; + atomicAdd(&debug_info[0], 1); // Count failed allocations + atomicAdd(&debug_info[1], i); // Store the last failed iteration + return; + } + + for (int j = 0; j < size; j++) { + data[j] = j; + } + + int checksum = 0; + for (int j = 0; j < size; j++) { + checksum ^= data[j]; // XOR operation to compute checksum + } + + atomicXor(result, checksum); // Use atomicXor for checksum + + free(data); + } + + *alloc_success = true; + atomicAdd(&debug_info[2], iterations); // Count successful iterations +} + +int main() +{ + const int size = 100; // Reduced problem size + const int stress_iterations = 10000; + int* d_result; + int h_result; + bool* d_alloc_success; + bool h_alloc_success; + int* d_debug_info; + int h_debug_info[3] = {0}; // [0]: failed allocations, [1]: last failed iteration, [2]: successful iterations + + void* device_heap; + HIP_CHECK(hipMalloc(&device_heap, DEVICE_HEAP_SIZE)); + + // init_device_heap_kernel<<<1, 1>>>(device_heap); + // HIP_CHECK(hipDeviceSynchronize()); + + // Allocate memory for the result and allocation success flag on the device + HIP_CHECK(hipMalloc(&d_result, sizeof(int))); + HIP_CHECK(hipMalloc(&d_alloc_success, sizeof(bool))); + HIP_CHECK(hipMalloc(&d_debug_info, 3 * sizeof(int))); + + // Initialize the result to 0 + HIP_CHECK(hipMemset(d_result, 0, sizeof(int))); + HIP_CHECK(hipMemset(d_debug_info, 0, 3 * sizeof(int))); + + // Launch the kernel + dim3 global(4, 4, 4); + dim3 local(2, 2, 2); + dynamicAllocationKernel<<>>(d_result, size, d_alloc_success); + HIP_CHECK(hipDeviceSynchronize()); + + // Copy the allocation success flag back to the host + HIP_CHECK(hipMemcpy(&h_alloc_success, d_alloc_success, sizeof(bool), hipMemcpyDeviceToHost)); + + if (!h_alloc_success) { + printf("Device-side malloc failed in the main kernel\n"); + exit(EXIT_FAILURE); + } + + // Copy the result back to the host + HIP_CHECK(hipMemcpy(&h_result, d_result, sizeof(int), hipMemcpyDeviceToHost)); + + // Check correctness using checksum + int num_blocks = (global.x * global.y * global.z) / (local.x * local.y * local.z); + int expected_checksum = 0; + for (int i = 0; i < size; i++) { + expected_checksum ^= i; // Compute expected checksum + } + expected_checksum *= num_blocks; + + if (h_result != expected_checksum) { + printf("Error: Incorrect checksum. Expected %d, got %d\n", expected_checksum, h_result); + exit(EXIT_FAILURE); + } + + printf("Main kernel checksum: %d (correct)\n", h_result); + + // Reset the result and allocation success flag + HIP_CHECK(hipMemset(d_result, 0, sizeof(int))); + HIP_CHECK(hipMemset(d_alloc_success, 0, sizeof(bool))); + + // Launch the stress test kernel + stressTestKernel<<<1, 1>>>(d_result, size, stress_iterations, d_alloc_success, d_debug_info); + HIP_CHECK(hipDeviceSynchronize()); + + // Copy the allocation success flag back to the host + HIP_CHECK(hipMemcpy(&h_alloc_success, d_alloc_success, sizeof(bool), hipMemcpyDeviceToHost)); + + if (!h_alloc_success) { + printf("Device-side malloc failed in the stress test kernel\n"); + exit(EXIT_FAILURE); + } + + // Copy the result back to the host + HIP_CHECK(hipMemcpy(&h_result, d_result, sizeof(int), hipMemcpyDeviceToHost)); + + // Check correctness for stress test using checksum + expected_checksum *= stress_iterations; + if (h_result != expected_checksum) { + printf("FAILED: Incorrect checksum in stress test. Expected %d, got %d\n", expected_checksum, h_result); + exit(EXIT_FAILURE); + } + + printf("Stress test checksum: %d (correct)\n", h_result); + printf("Stress test completed successfully\n"); + + // Copy debug info back to host + HIP_CHECK(hipMemcpy(h_debug_info, d_debug_info, 3 * sizeof(int), hipMemcpyDeviceToHost)); + + printf("Debug Info:\n"); + printf(" Failed allocations: %d\n", h_debug_info[0]); + printf(" Last failed iteration: %d\n", h_debug_info[1]); + printf(" Successful iterations: %d\n", h_debug_info[2]); + printf("PASSED\n"); + + // Free device memory + HIP_CHECK(hipFree(d_result)); + HIP_CHECK(hipFree(d_alloc_success)); + HIP_CHECK(hipFree(d_debug_info)); + + return 0; +} diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index daf147ef1..b7b16a70d 100644 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -84,6 +84,23 @@ static void queueVariableInitShadowKernel(chipstar::Queue *Q, queueKernel(Q, K); } +static void initDeviceHeap(chipstar::Queue *Q, chipstar::Module *M) { + logTrace("initDeviceHeap()"); + chipstar::DeviceVar *Var = M->getGlobalVar(ChipDeviceHeapName); + if (!Var) + return; + void *device_heap = Var->getDevAddr(); + auto *Ctx = Q->getContext(); + void *init_device_heap_ptr = + Ctx->allocate(sizeof(void *), 8, hipMemoryTypeDevice); + Var->setDevAddr(init_device_heap_ptr); + logInfo("initDeviceHeap() device_heap: {}", (void *)device_heap); + logInfo("initDeviceHeap() init_device_heap_ptr: {}", + (void *)init_device_heap_ptr); + queueVariableBindShadowKernel(Q, M, Var); + Q->finish(); +} + chipstar::CallbackData::CallbackData(hipStreamCallback_t TheCallbackF, void *TheCallbackArgs, chipstar::Queue *TheChipQueue) @@ -372,6 +389,9 @@ void chipstar::Module::prepareDeviceVariablesNoLock(chipstar::Device *Device, return; } + // Initialize device heap + initDeviceHeap(Queue, this); + auto Err = allocateDeviceVariablesNoLock(Device, Queue); (void)Err; diff --git a/src/SPVRegister.cc b/src/SPVRegister.cc index 52c4c0b12..fd49d13e2 100644 --- a/src/SPVRegister.cc +++ b/src/SPVRegister.cc @@ -101,9 +101,10 @@ void SPVRegister::bindVariable(SPVRegister::Handle Handle, HostPtr Ptr, // Host pointer should be associated with one source module and variable // at most. (!HostPtrLookup_.count(Ptr)) || - // A variable made for abort() implementation is an exception to this due - // to the way it's modeled. - (Name == ChipDeviceAbortFlagName && HostPtrLookup_[Ptr]->Name == Name) && + // A variable made for abort() and device-side malloc implementation is an + // exception to this due to the way it's modeled. + ((Name == ChipDeviceAbortFlagName || Name == ChipDeviceHeapName) && + HostPtrLookup_[Ptr]->Name == Name) && "Host-pointer is already mapped."); if (Name == ChipDeviceAbortFlagName) { diff --git a/src/common.hh b/src/common.hh index 8c2b33349..3bd9bd337 100644 --- a/src/common.hh +++ b/src/common.hh @@ -90,4 +90,7 @@ constexpr char ChipSpilledArgsVarPrefix[] = "__chip_spilled_args_"; /// the abort() function was called by a kernel. constexpr char ChipDeviceAbortFlagName[] = "__chipspv_abort_called"; +/// The name of a global variable which is the device heap. +constexpr char ChipDeviceHeapName[] = "__chipspv_device_heap"; + #endif