diff --git a/Makefile.am b/Makefile.am index 89157bf8..582a7a45 100755 --- a/Makefile.am +++ b/Makefile.am @@ -57,6 +57,10 @@ if HABANALABS libperftest_a_SOURCES += src/hl_memory.c endif +if OPENCL +libperftest_a_SOURCES += src/opencl_memory.c +endif + bin_PROGRAMS = ib_send_bw ib_send_lat ib_write_lat ib_write_bw ib_read_lat ib_read_bw ib_atomic_lat ib_atomic_bw bin_SCRIPTS = run_perftest_loopback run_perftest_multi_devices @@ -107,6 +111,17 @@ else LIBMLX4= endif +if HAVE_CUDA +libperftest_a_SOURCES += src/cuda_utils.cu +libperftest_a_LIBADD = src/cuda_utils.cu.o src/cuda_utils.cu.lo +SUFFIXES= .cu +%.cu.lo: %.cu.o + $(NVCC) -dlink -o $@ $< -L$(CUDA_LIB_DIR) -lcudadevrt -lcudart +%.cu.o: %.cu + $(NVCC) -DHAVE_CONFIG_H $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) -o $@ -c $< $(NVCCFLAGS) +endif + + ib_send_bw_SOURCES = src/send_bw.c src/multicast_resources.c src/multicast_resources.h ib_send_bw_LDADD = libperftest.a $(LIBUMAD) $(LIBMATH) $(LIBMLX4) $(LIBMLX5) $(LIBEFA) $(LIBHNS) diff --git a/configure.ac b/configure.ac index aa91abe0..bb14e740 100755 --- a/configure.ac +++ b/configure.ac @@ -39,6 +39,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) AC_SUBST(MAJOR_VERSION) AC_SUBST(MINOR_VERSION) +AC_ARG_VAR(HAVE_OPENCL, opencl support) AC_ARG_VAR(CUDA_H_PATH, help-string) AC_ARG_VAR(RANLIB, ranlib tool) @@ -273,7 +274,7 @@ fi if [test "$CUDA_H_PATH" ]; then AC_DEFINE([HAVE_CUDA], [1], [Enable CUDA feature]) AC_DEFINE_UNQUOTED([CUDA_PATH], "$CUDA_H_PATH" , [Enable CUDA feature]) - LIBS=$LIBS" -lcuda" + LIBS=$LIBS" -lcuda -lcudart" AC_CHECK_LIB([cuda], [cuMemGetHandleForAddressRange], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=yes], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=no]) AC_TRY_LINK([ #include <$CUDA_H_PATH>], @@ -282,10 +283,34 @@ if [test "$CUDA_H_PATH" ]; then if [test "x$HAVE_REG_DMABUF_MR" = "xyes"] && [test "x$HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE" = "xyes"] && [test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"]; then AC_DEFINE([HAVE_CUDA_DMABUF], [1], [Enable CUDA DMABUF feature]) fi + + if [test "x${gpu_arch}" != "x"]; then + NVCCFLAGS="${NVCCFLAGS} -arch compute_${gpu_arch} -code compute_${gpu_arch},sm_${gpu_arch}" + AC_MSG_NOTICE([Setting GPU_ARCH = ${gpu_arch}]) + fi + NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive" + AC_CHECK_DECLS([CUDA_VERSION], [HAVE_CUDA_VERSION=yes], [HAVE_CUDA_VERSION=no], [[#include "$CUDA_H_PATH"]]) + if [test "x$HAVE_CUDA_VERSION" = "xyes"]; then + cuda_toolkit_version=`grep "define CUDA_VERSION" $CUDA_H_PATH | cut -d' ' -f3` + AS_VERSION_COMPARE([$cuda_toolkit_version], [11070], [HAVE_CUDA_DIAGSUPPRESS=no], [HAVE_CUDA_DIAGSUPPRESS=yes], [HAVE_CUDA_DIAGSUPPRESS=yes]) + if [test "x$HAVE_CUDA_DIAGSUPPRESS" = "xyes"]; then + NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive -diag-suppress 2464 -diag-suppress 815" + else + NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive" + fi + fi + + CUDA_TOOLKIT_PATH=$(dirname $(dirname $(dirname $(dirname $CUDA_H_PATH)))) + AC_DEFINE_UNQUOTED([CUDA_TOOLKIT_PATH], "$CUDA_TOOLKIT_PATH", [Path to the CUDA Toolkit]) + AC_SUBST([NVCCFLAGS], ["${NVCCFLAGS}"]) + AC_SUBST([NVCC], ["$CUDA_TOOLKIT_PATH/bin/nvcc"]) + + AC_CHECK_LIB([cudart], [cudaMalloc], [], [AC_MSG_ERROR([libcudart not found])]) fi AM_CONDITIONAL([CUDA_DMA_BUF_PARAMETERS_SUPPORT],[test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"]) AM_CONDITIONAL([CUDA], [test "$CUDA_H_PATH"]) +AM_CONDITIONAL([HAVE_CUDA], [test "$CUDA_H_PATH"]) AC_ARG_ENABLE([neuron], [AS_HELP_STRING([--enable-neuron], @@ -446,6 +471,12 @@ fi CFLAGS="-g -Wall -D_GNU_SOURCE -O3 $CFLAGS" LDFLAGS="$LDFLAGS" LIBS=$LIBS" -lpthread" +AM_CONDITIONAL([OPENCL], [test "$HAVE_OPENCL"]) +if [test $HAVE_OPENCL = yes]; then + AC_DEFINE([HAVE_OPENCL], [1], [Enable OPENCL feature]) + AC_CHECK_LIB([OpenCL], [clGetDeviceIDs], [], [AC_MSG_ERROR([libOpenCL not found])]) +fi + AC_SUBST([LIBUMAD]) AC_SUBST([LIBMATH]) AC_CONFIG_FILES([Makefile]) diff --git a/man/perftest.1 b/man/perftest.1 index a170a0fe..9bcc77ad 100644 --- a/man/perftest.1 +++ b/man/perftest.1 @@ -353,6 +353,11 @@ many different options and modes. Not relevant for raw_ethernet_fs_rate. System support required. .TP +.B --cuda_mem_type= + Set CUDA memory type =0(device,default),1(managed),4(malloc) + Not relevant for raw_ethernet_fs_rate. + System support required. +.TP .B --use_cuda_dmabuf Use CUDA DMA-BUF for GPUDirect RDMA testing. Not relevant for raw_ethernet_fs_rate. @@ -375,6 +380,19 @@ many different options and modes. Not relevant for raw_ethernet_fs_rate. System support required. .TP +.B --use_opencl= + Use OpenCl specific device for GPUDirect RDMA testing + Not relevant for raw_ethernet_fs_rate. + System support required. +.TP +.B --opencl_platform_id= + Use OpenCl specific platform ID + System support required. +.TP +.B --gpu_touch= + Set GPU touch mode to test memory accesses during the testing process. + Relevant only for CUDA and OpenCL memory types +.TP .B --use_hugepages Use Hugepages instead of contig, memalign allocations. Not relevant for raw_ethernet_fs_rate. diff --git a/src/cuda_memory.c b/src/cuda_memory.c index bf71f53a..821f71eb 100644 --- a/src/cuda_memory.c +++ b/src/cuda_memory.c @@ -18,11 +18,24 @@ #define ACCEL_PAGE_SIZE (64 * 1024) +static const char *cuda_mem_type_str[] = { + "CUDA_MEM_DEVICE", + "CUDA_MEM_MANAGED", + "CUDA_MEM_HOSTALLOC", + "CUDA_MEM_HOSTREGISTER", + "CUDA_MEM_MALLOC", + "CUDA_MEM_TYPES" +}; + +int touch_gpu_pages(uint8_t *addr, int buf_size, int is_infinitely, volatile int **stop_flag); struct cuda_memory_ctx { struct memory_ctx base; + int mem_type; + int gpu_touch; int device_id; char *device_bus_id; + volatile int *stop_touch_gpu_kernel_flag; // used for stopping cuda gpu_touch kernel CUdevice cuDevice; CUcontext cuContext; bool use_dmabuf; @@ -152,20 +165,17 @@ int cuda_memory_destroy(struct memory_ctx *ctx) { return SUCCESS; } -int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t size, int *dmabuf_fd, - uint64_t *dmabuf_offset, void **addr, bool *can_init) { +static int cuda_allocate_device_memory_buffer(struct cuda_memory_ctx *cuda_ctx, uint64_t size, int *dmabuf_fd, + uint64_t *dmabuf_offset, void **addr, bool *can_init) { int error; size_t buf_size = (size + ACCEL_PAGE_SIZE - 1) & ~(ACCEL_PAGE_SIZE - 1); // Check if discrete or integrated GPU (tegra), for allocating memory where adequate - struct cuda_memory_ctx *cuda_ctx = container_of(ctx, struct cuda_memory_ctx, base); int cuda_device_integrated; cuDeviceGetAttribute(&cuda_device_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_ctx->cuDevice); printf("CUDA device integrated: %X\n", (unsigned int)cuda_device_integrated); if (cuda_device_integrated == 1) { - printf("cuMemAllocHost() of a %lu bytes GPU buffer\n", size); - error = cuMemAllocHost(addr, buf_size); if (error != CUDA_SUCCESS) { printf("cuMemAllocHost error=%d\n", error); @@ -176,15 +186,12 @@ int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t *can_init = false; } else { CUdeviceptr d_A; - printf("cuMemAlloc() of a %lu bytes GPU buffer\n", size); - error = cuMemAlloc(&d_A, buf_size); if (error != CUDA_SUCCESS) { printf("cuMemAlloc error=%d\n", error); return FAILURE; } - printf("allocated GPU buffer address at %016llx pointer=%p\n", d_A, (void *)d_A); *addr = (void *)d_A; *can_init = false; @@ -215,6 +222,64 @@ int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t #endif } + return CUDA_SUCCESS; +} + +int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t size, int *dmabuf_fd, + uint64_t *dmabuf_offset, void **addr, bool *can_init) { + int error; + CUdeviceptr d_ptr; + + struct cuda_memory_ctx *cuda_ctx = container_of(ctx, struct cuda_memory_ctx, base); + + switch (cuda_ctx->mem_type) { + case CUDA_MEM_DEVICE: + error = cuda_allocate_device_memory_buffer(cuda_ctx, size, dmabuf_fd, + dmabuf_offset, addr, can_init); + if (error != CUDA_SUCCESS) + return FAILURE; + break; + case CUDA_MEM_MANAGED: + error = cuMemAllocManaged(&d_ptr, size, CU_MEM_ATTACH_GLOBAL); + if (error != CUDA_SUCCESS) { + printf("cuMemAllocManaged error=%d\n", error); + return FAILURE; + } + + *addr = (void *)d_ptr; + *can_init = false; + break; + + case CUDA_MEM_MALLOC: + *can_init = false; + // Fall through + + printf("Host allocation selected, calling memalign allocator for %zd bytes with %d page size\n", size, alignment); + *addr = memalign(alignment, size); + if (!*addr) { + printf("memalign error=%d\n", errno); + return FAILURE; + } + + break; + /* + * TODO: Add Implementation for HOSTALLOC and HOSTREGISTER + * buffer allocations + */ + case CUDA_MEM_HOSTALLOC: + case CUDA_MEM_HOSTREGISTER: + default: + printf("invalid CUDA memory type\n"); + return FAILURE; + } + + printf("allocated GPU buffer of a %lu address at %p for type %s\n", size, addr, cuda_mem_type_str[cuda_ctx->mem_type]); + + if (cuda_ctx->gpu_touch != GPU_NO_TOUCH) { + printf("Starting GPU touching process\n"); + return touch_gpu_pages((uint8_t *)*addr, size, cuda_ctx->gpu_touch == GPU_TOUCH_INFINITE, &cuda_ctx->stop_touch_gpu_kernel_flag); + } + return SUCCESS; } @@ -223,13 +288,30 @@ int cuda_memory_free_buffer(struct memory_ctx *ctx, int dmabuf_fd, void *addr, u int cuda_device_integrated; cuDeviceGetAttribute(&cuda_device_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_ctx->cuDevice); - if (cuda_device_integrated == 1) { - printf("deallocating GPU buffer %p\n", addr); - cuMemFreeHost(addr); - } else { - CUdeviceptr d_A = (CUdeviceptr)addr; - printf("deallocating GPU buffer %016llx\n", d_A); - cuMemFree(d_A); + if (cuda_ctx->stop_touch_gpu_kernel_flag) { + *cuda_ctx->stop_touch_gpu_kernel_flag = 1; + printf("stopping CUDA gpu touch running kernel\n"); + cuCtxSynchronize(); + cuMemFree((CUdeviceptr)cuda_ctx->stop_touch_gpu_kernel_flag); + } + + switch (cuda_ctx->mem_type) { + case CUDA_MEM_DEVICE: + if (cuda_device_integrated == 1) { + printf("deallocating GPU buffer %p\n", addr); + cuMemFreeHost(addr); + } else { + CUdeviceptr d_A = (CUdeviceptr)addr; + printf("deallocating GPU buffer %016llx\n", d_A); + cuMemFree(d_A); + } + break; + case CUDA_MEM_MANAGED: + CUCHECK(cuMemFree((CUdeviceptr)addr)); + break; + case CUDA_MEM_MALLOC: + free((void *) addr); + break; } return SUCCESS; @@ -271,6 +353,9 @@ struct memory_ctx *cuda_memory_create(struct perftest_parameters *params) { ctx->device_id = params->cuda_device_id; ctx->device_bus_id = params->cuda_device_bus_id; ctx->use_dmabuf = params->use_cuda_dmabuf; + ctx->gpu_touch = params->gpu_touch; + ctx->stop_touch_gpu_kernel_flag = NULL; + ctx->mem_type = params->cuda_mem_type; return &ctx->base; } diff --git a/src/cuda_utils.cu b/src/cuda_utils.cu new file mode 100644 index 00000000..53946a3b --- /dev/null +++ b/src/cuda_utils.cu @@ -0,0 +1,30 @@ +#include +#include +#include "cuda.h" + +#include +#define GPU_TOUCH_STEP 4096 + +__global__ void cuda_touch_pages(volatile uint8_t *c, int size, + volatile int *stop_flag, int is_infinite) { + do { + for (int iter = 0; iter < size; iter += GPU_TOUCH_STEP) + c[iter] = 0; + } + while (is_infinite && !*stop_flag); +} + +extern "C" int touch_gpu_pages(uint8_t *addr, int buf_size, + int is_infinite, volatile int **stop_flag) +{ + cudaError_t ret = cudaMallocManaged((void **)stop_flag, sizeof(int)); + if (ret) { + printf("failed to allocate stop flag\n"); + return -1; + } + + *stop_flag = 0; + cuda_touch_pages<<<1, 1>>>(addr, buf_size, *stop_flag, is_infinite); + + return 0; +} diff --git a/src/opencl_memory.c b/src/opencl_memory.c new file mode 100644 index 00000000..866a05d5 --- /dev/null +++ b/src/opencl_memory.c @@ -0,0 +1,248 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ + +#include +#include +#include +#include +#include +#include "perftest_parameters.h" +#include "perftest_resources.h" +#define CL_TARGET_OPENCL_VERSION 220 +#include + +__attribute__ ((unused)) static const char *opencl_mem_types_str[] = { + "OPENCL_MEM_SVM", +}; + +// Allocating this context on heap so it can be passed to other thread +struct buffer_ctx { + pthread_t thread; + const void * addr; + bool gpu_touch_infinite; + bool gpu_touch_stop; + size_t size; + cl_command_queue command_queue; +}; + +struct opencl_memory_ctx { + struct buffer_ctx *buffer_ctx; + struct memory_ctx base; + int platform_ix; + int device_ix; + cl_context cl_context; + cl_command_queue command_queue; + int gpu_touch; +}; + +static int init_gpu(struct opencl_memory_ctx *ctx) +{ + cl_uint num_devices; + cl_uint num_platforms; + cl_device_svm_capabilities caps; + cl_int error; + cl_platform_id *platform_ids; + cl_device_id *device_ids; + int ret = SUCCESS; + + platform_ids = malloc(sizeof(cl_platform_id) * (ctx->platform_ix + 1)); + if (!platform_ids) { + printf("Allocation of platform_ids failed\n"); + return FAILURE; + } + + device_ids = malloc(sizeof(cl_device_id) * (ctx->device_ix + 1)); + if (!device_ids) { + printf("Allocation of device_ids failed\n"); + ret = FAILURE; + goto free_platform_ids; + } + + error = clGetPlatformIDs((ctx->platform_ix + 1), platform_ids, &num_platforms); + if (error) { + printf("clGetPlatformIDs returned %d\n", error); + ret = FAILURE; + goto free_device_ids; + } + + if (num_platforms < ctx->platform_ix) + { + printf("platform_id (%d) is not in the range of num_platforms (%d)\n", + ctx->platform_ix, num_platforms); + ret = FAILURE; + goto free_device_ids; + } + + error = clGetDeviceIDs(platform_ids[ctx->platform_ix], CL_DEVICE_TYPE_ALL, (ctx->device_ix + 1), device_ids, &num_devices); + if (error) { + printf("clGetDeviceIDs returned %d\n", error); + ret = FAILURE; + goto free_device_ids; + } + + if (num_devices < ctx->device_ix) + { + printf("device_id (%d) is not in the range of num_devices (%d)\n", + ctx->device_ix, num_devices); + ret = FAILURE; + goto free_device_ids; + } + + error = clGetDeviceInfo(device_ids[ctx->device_ix], CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &caps, 0); + if (error) { + printf("clGetDeviceInfo returned %d\n", error); + ret = FAILURE; + goto free_device_ids; + } + + if (!(caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) + { + printf("SVM needed caps are not supported\n"); + ret = FAILURE; + goto free_device_ids; + } + + ctx->cl_context = clCreateContext(NULL, 1, &device_ids[ctx->device_ix], NULL, NULL, &error); + if (error) { + printf("clCreateContext returned %d\n", error); + ret = FAILURE; + goto free_device_ids; + } + + ctx->command_queue = clCreateCommandQueueWithProperties(ctx->cl_context, device_ids[ctx->device_ix], NULL, &error); + if (error) { + printf("clCreateCommandQueueWithProperties failed with ret=%d\n", error); + clReleaseContext(ctx->cl_context); + ret = FAILURE; + } + +free_device_ids: + free(device_ids); +free_platform_ids: + free(platform_ids); + + return ret; +} + +static void free_gpu(struct opencl_memory_ctx *ctx) +{ + printf("destroying current OpenCL ctx\n"); + clReleaseCommandQueue(ctx->command_queue); + clReleaseContext(ctx->cl_context); +} + +int opencl_memory_init(struct memory_ctx *ctx) +{ + struct opencl_memory_ctx *opencl_ctx = container_of(ctx, struct opencl_memory_ctx, base); + int return_value = 0; + + return_value = init_gpu(opencl_ctx); + if (return_value) { + fprintf(stderr, "Couldn't init GPU context: %d\n", return_value); + return FAILURE; + } + + return SUCCESS; +} + +int opencl_memory_destroy(struct memory_ctx *ctx) { + struct opencl_memory_ctx *opencl_ctx = container_of(ctx, struct opencl_memory_ctx, base); + + free_gpu(opencl_ctx); + free(opencl_ctx); + return SUCCESS; +} + +void *touch_gpu_pages(void *ctx_param) { + struct buffer_ctx *ctx = (struct buffer_ctx *)ctx_param; + int ret; + do { + ret = clEnqueueSVMMigrateMem(ctx->command_queue, 1, &ctx->addr, &ctx->size, 0, 0, NULL, NULL); + if (ret) { + printf("clEnqueueSVMMigrateMem failed with ret=%d\n", ret); + break; + } + + ret = clFlush(ctx->command_queue); + if (ret) { + printf("clFlush with ret=%d\n", ret); + break; + } + + ret = clFinish(ctx->command_queue); + if (ret) { + printf("clFinish with ret=%d\n", ret); + break; + } + } while (ctx->gpu_touch_infinite && !ctx->gpu_touch_stop); + return NULL; +} + +int opencl_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t size, int *dmabuf_fd, uint64_t *dmabuf_offset, void **addr, bool *can_init) { + struct opencl_memory_ctx *opencl_ctx = container_of(ctx, struct opencl_memory_ctx, base); + + *addr = clSVMAlloc(opencl_ctx->cl_context, CL_MEM_READ_WRITE, size, MAX(alignment, sysconf(_SC_PAGESIZE))); + if (!*addr) + { + printf("clSVMAlloc failed\n"); + return -1; + } + + opencl_ctx->buffer_ctx = NULL; + + if (opencl_ctx->gpu_touch != GPU_NO_TOUCH) { + opencl_ctx->buffer_ctx = malloc(sizeof(struct buffer_ctx)); + if (!opencl_ctx->buffer_ctx) { + clSVMFree(opencl_ctx->cl_context, addr); + printf("Failed to allocate context for gpu_touch\n"); + return -ENOMEM; + } + + opencl_ctx->buffer_ctx->addr = *addr; + opencl_ctx->buffer_ctx->gpu_touch_infinite = opencl_ctx->gpu_touch == GPU_TOUCH_INFINITE; + opencl_ctx->buffer_ctx->gpu_touch_stop = 0; + opencl_ctx->buffer_ctx->size = size; + opencl_ctx->buffer_ctx->command_queue = opencl_ctx->command_queue; + *can_init = false; + + return pthread_create(&opencl_ctx->buffer_ctx->thread, NULL, touch_gpu_pages, opencl_ctx->buffer_ctx); + } + + return 0; +} + +int opencl_memory_free_buffer(struct memory_ctx *ctx, int dmabuf_fd, void *addr, uint64_t size) { + struct opencl_memory_ctx *opencl_ctx = container_of(ctx, struct opencl_memory_ctx, base); + if (opencl_ctx->buffer_ctx) { + opencl_ctx->buffer_ctx->gpu_touch_stop = 1; + if (pthread_join(opencl_ctx->buffer_ctx->thread, NULL)) { + free(opencl_ctx->buffer_ctx); + printf("Error stopping gpu_touch thread\n"); + return -1; + } + free(opencl_ctx->buffer_ctx); + } + clSVMFree(opencl_ctx->cl_context, addr); + return 0; +} + +bool opencl_memory_supported() { + return true; +} + +struct memory_ctx *opencl_memory_create(struct perftest_parameters *params) { + struct opencl_memory_ctx *ctx; + + ALLOCATE(ctx, struct opencl_memory_ctx, 1); + ctx->base.init = opencl_memory_init; + ctx->base.destroy = opencl_memory_destroy; + ctx->base.allocate_buffer = opencl_memory_allocate_buffer; + ctx->base.free_buffer = opencl_memory_free_buffer; + ctx->base.copy_host_to_buffer = memcpy; + ctx->base.copy_buffer_to_host = memcpy; + ctx->base.copy_buffer_to_buffer = memcpy; + ctx->device_ix = params->opencl_device_id; + ctx->platform_ix = params->opencl_platform_id; + ctx->gpu_touch = params->gpu_touch; + + return &ctx->base; +} diff --git a/src/opencl_memory.h b/src/opencl_memory.h new file mode 100644 index 00000000..c5d7969a --- /dev/null +++ b/src/opencl_memory.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ + +#ifndef OPENCL_MEMORY_H +#define OPENCL_MEMORY_H + +#include "memory.h" +#include "config.h" + + +struct perftest_parameters; + +bool opencl_memory_supported(); + +struct memory_ctx *opencl_memory_create(struct perftest_parameters *params); + + +#ifndef HAVE_OPENCL + +inline bool opencl_memory_supported() { + return false; +} + +inline struct memory_ctx *opencl_memory_create(struct perftest_parameters *params) { + return NULL; +} + +#endif + +#endif /* OPENCL_MEMORY_H */ diff --git a/src/perftest_parameters.c b/src/perftest_parameters.c index 659250db..7839b883 100755 --- a/src/perftest_parameters.c +++ b/src/perftest_parameters.c @@ -19,6 +19,7 @@ #include "neuron_memory.h" #include "hl_memory.h" #include "mlu_memory.h" +#include "opencl_memory.h" #include #ifdef HAVE_RO #include @@ -576,6 +577,8 @@ static void usage(const char *argv0, VerbType verb, TestType tst, int connection if (cuda_memory_supported()) { printf(" --use_cuda="); printf(" Use CUDA specific device for GPUDirect RDMA testing\n"); + printf(" --cuda_mem_type="); + printf(" Set CUDA memory type =0(device,default),1(managed),4(malloc)\n"); printf(" --use_cuda_bus_id="); printf(" Use CUDA specific device, based on its full PCIe address, for GPUDirect RDMA testing\n"); @@ -611,6 +614,19 @@ static void usage(const char *argv0, VerbType verb, TestType tst, int connection printf(" Use selected MLU device for MLUDirect RDMA testing\n"); } + if (opencl_memory_supported()) { + printf(" --use_opencl="); + printf(" Use OpenCl specific device for GPUDirect RDMA testing\n"); + printf(" --opencl_platform_id="); + printf(" Use OpenCl specific platform ID\n"); + } + + if (cuda_memory_supported() || + opencl_memory_supported()) { + printf(" --gpu_touch= "); + printf(" Set GPU touch mode to test memory accesses during the testing process.\n"); + } + printf(" --use_hugepages "); printf(" Use Hugepages instead of contig, memalign allocations.\n"); } @@ -834,9 +850,13 @@ static void init_perftest_params(struct perftest_parameters *user_param) user_param->cuda_device_id = 0; user_param->cuda_device_bus_id = NULL; user_param->use_cuda_dmabuf = 0; + user_param->cuda_mem_type = CUDA_MEM_DEVICE; user_param->rocm_device_id = 0; user_param->neuron_core_id = 0; user_param->mlu_device_id = 0; + user_param->opencl_platform_id = 0; + user_param->opencl_device_id = 0; + user_param->gpu_touch = GPU_NO_TOUCH; user_param->mmap_file = NULL; user_param->mmap_offset = 0; user_param->iters_per_port[0] = 0; @@ -2326,11 +2346,15 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) static int use_cuda_flag = 0; static int use_cuda_bus_id_flag = 0; static int use_cuda_dmabuf_flag = 0; + static int cuda_mem_type_flag = 0; static int use_rocm_flag = 0; static int use_neuron_flag = 0; static int use_neuron_dmabuf_flag = 0; static int use_hl_flag = 0; static int use_mlu_flag = 0; + static int use_opencl_flag = 0; + static int opencl_platform_id_flag = 0; + static int gpu_touch_flag = 0; static int disable_pcir_flag = 0; static int mmap_file_flag = 0; static int mmap_offset_flag = 0; @@ -2498,11 +2522,15 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) { .name = "use_cuda", .has_arg = 1, .flag = &use_cuda_flag, .val = 1}, { .name = "use_cuda_bus_id", .has_arg = 1, .flag = &use_cuda_bus_id_flag, .val = 1}, { .name = "use_cuda_dmabuf", .has_arg = 0, .flag = &use_cuda_dmabuf_flag, .val = 1}, + { .name = "cuda_mem_type", .has_arg = 1, .flag = &cuda_mem_type_flag, .val = 1}, { .name = "use_rocm", .has_arg = 1, .flag = &use_rocm_flag, .val = 1}, { .name = "use_neuron", .has_arg = 1, .flag = &use_neuron_flag, .val = 1}, { .name = "use_neuron_dmabuf", .has_arg = 0, .flag = &use_neuron_dmabuf_flag, .val = 1}, { .name = "use_hl", .has_arg = 1, .flag = &use_hl_flag, .val = 1}, { .name = "use_mlu", .has_arg = 1, .flag = &use_mlu_flag, .val = 1}, + { .name = "use_opencl", .has_arg = 1, .flag = &use_opencl_flag, .val = 1}, + { .name = "opencl_platform_id", .has_arg = 1, .flag = &opencl_platform_id_flag, .val = 1}, + { .name = "gpu_touch", .has_arg = 1, .flag = &gpu_touch_flag, .val = 1}, { .name = "mmap", .has_arg = 1, .flag = &mmap_file_flag, .val = 1}, { .name = "mmap-offset", .has_arg = 1, .flag = &mmap_offset_flag, .val = 1}, { .name = "ipv6", .has_arg = 0, .flag = &ipv6_flag, .val = 1}, @@ -2934,6 +2962,9 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) CHECK_VALUE_NON_NEGATIVE(user_param->latency_gap,int,"Latency gap time",not_int_ptr); latency_gap_flag = 0; } + if (odp_flag) { + user_param->use_odp = 1; + } /* We statically define memory type options so check if requested option is actually supported. */ if (((use_cuda_flag || use_cuda_bus_id_flag) && !cuda_memory_supported()) || (use_cuda_dmabuf_flag && !cuda_memory_dmabuf_supported()) || @@ -2941,7 +2972,8 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) (use_neuron_flag && !neuron_memory_supported()) || (use_neuron_dmabuf_flag && !neuron_memory_dmabuf_supported()) || (use_hl_flag && !hl_memory_supported()) || - (use_mlu_flag && !mlu_memory_supported())) { + (use_mlu_flag && !mlu_memory_supported()) || + (use_opencl_flag && !opencl_memory_supported())) { printf(" Unsupported memory type\n"); return FAILURE; } @@ -2974,6 +3006,27 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) } use_cuda_dmabuf_flag = 0; } + if (cuda_mem_type_flag) { + user_param->cuda_mem_type = strtol(optarg,NULL,0); + if (user_param->memory_type != MEMORY_CUDA) { + fprintf(stderr, "CUDA MEM TYPE cannot be used without CUDA\n"); + free(duplicates_checker); + return FAILURE; + } + if (user_param->cuda_mem_type < CUDA_MEM_DEVICE || user_param->cuda_mem_type >= CUDA_MEM_TYPES) { + fprintf(stderr, "invalid CUDA memory type %d\n", user_param->cuda_mem_type); + free(duplicates_checker); + return FAILURE; + } + if ((user_param->cuda_mem_type == CUDA_MEM_MALLOC || + user_param->cuda_mem_type == CUDA_MEM_MANAGED) && + (!user_param->use_odp || user_param->use_cuda_dmabuf)) { + fprintf(stderr, "CUDA Memory type is not supported with no odp MR or with dmabuf\n"); + free(duplicates_checker); + return FAILURE; + } + cuda_mem_type_flag = 0; + } if (use_rocm_flag) { CHECK_VALUE_NON_NEGATIVE(user_param->rocm_device_id,int,"ROCm device",not_int_ptr); user_param->memory_type = MEMORY_ROCM; @@ -3005,12 +3058,59 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) user_param->memory_create = hl_memory_create; use_hl_flag = 0; } + if (use_mlu_flag) { CHECK_VALUE_NON_NEGATIVE(user_param->mlu_device_id,int,"MLU device",not_int_ptr); user_param->memory_type = MEMORY_MLU; user_param->memory_create = mlu_memory_create; use_mlu_flag = 0; } + + if (use_opencl_flag) { + CHECK_VALUE_NON_NEGATIVE(user_param->opencl_device_id,int,"OPENCL device",not_int_ptr); + if (!user_param->use_odp) { + fprintf(stderr, "OPENCL flag is only supported for ODP MR\n"); + free(duplicates_checker); + return FAILURE; + } + user_param->memory_type = MEMORY_OPENCL; + user_param->memory_create = opencl_memory_create; + use_opencl_flag = 0; + } + if (opencl_platform_id_flag) { + CHECK_VALUE_NON_NEGATIVE(user_param->opencl_platform_id,int,"OPENCL Platform ID",not_int_ptr); + if (user_param->memory_type != MEMORY_OPENCL) { + fprintf(stderr, "OpenCL platform ID cannot be used without OpenCL device\n"); + free(duplicates_checker); + return FAILURE; + } + opencl_platform_id_flag = 0; + } + if (gpu_touch_flag) { + if (user_param->memory_type != MEMORY_CUDA && + user_param->memory_type != MEMORY_OPENCL) { + fprintf(stderr, "GPU touch is not supported for this MEMORY_TYPE\n"); + free(duplicates_checker); + return FAILURE; + } + + if (!user_param->use_odp) { + fprintf(stderr, "GPU touch is only supported for ODP MR\n"); + free(duplicates_checker); + return FAILURE; + } + + if (strcmp("ONCE", optarg) == 0 || strcmp("once", optarg) == 0) + user_param->gpu_touch = GPU_TOUCH_ONCE; + else if (strcmp("INFINITE", optarg) == 0 || strcmp("infinite", optarg) == 0) + user_param->gpu_touch = GPU_TOUCH_INFINITE; + else { + fprintf(stderr," Unsupported value for gpu_touch\n"); + free(duplicates_checker); + return FAILURE; + } + gpu_touch_flag = 0; + } if (flow_label_flag) { CHECK_VALUE(user_param->flow_label,int,"flow label",not_int_ptr); if (user_param->connection_type == RawEth && user_param->flow_label < 0) { @@ -3367,9 +3467,6 @@ int parser(struct perftest_parameters *user_param,char *argv[], int argc) } } - if(odp_flag) { - user_param->use_odp = 1; - } if(hugepages_flag) { user_param->use_hugepages = 1; diff --git a/src/perftest_parameters.h b/src/perftest_parameters.h index 3f3ce235..cb22c757 100755 --- a/src/perftest_parameters.h +++ b/src/perftest_parameters.h @@ -443,7 +443,24 @@ enum memory_type { MEMORY_ROCM, MEMORY_NEURON, MEMORY_HL, - MEMORY_MLU + MEMORY_MLU, + MEMORY_OPENCL +}; + +enum cuda_mem_type { + CUDA_MEM_DEVICE = 0, + CUDA_MEM_MANAGED, + CUDA_MEM_HOSTALLOC, + CUDA_MEM_HOSTREGISTER, + CUDA_MEM_MALLOC, + CUDA_MEM_TYPES +}; + +enum gpu_touch_type { + GPU_NO_TOUCH, + GPU_TOUCH_ONCE, + GPU_TOUCH_INFINITE, + GPU_TOUCH_TYPES }; struct perftest_parameters { @@ -572,12 +589,16 @@ struct perftest_parameters { struct memory_ctx *(*memory_create)(struct perftest_parameters *params); int cuda_device_id; char *cuda_device_bus_id; + int cuda_mem_type; int use_cuda_dmabuf; int rocm_device_id; int neuron_core_id; int use_neuron_dmabuf; char *hl_device_bus_id; int mlu_device_id; + int opencl_platform_id; + int opencl_device_id; + int gpu_touch; char *mmap_file; unsigned long mmap_offset; /* New test params format pilot. will be used in all flags soon,. */