Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perftest: Add GPU Touch Flag for Testing GPU Memory Interference with CUDA and OpenCL Support #302

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ if HABANALABS
libperftest_a_SOURCES += src/hl_memory.c
endif

if OPENCL
libperftest_a_SOURCES += src/opencl_memory.c
endif

bin_PROGRAMS = ib_send_bw ib_send_lat ib_write_lat ib_write_bw ib_read_lat ib_read_bw ib_atomic_lat ib_atomic_bw
bin_SCRIPTS = run_perftest_loopback run_perftest_multi_devices

Expand Down Expand Up @@ -107,6 +111,17 @@ else
LIBMLX4=
endif

if HAVE_CUDA
libperftest_a_SOURCES += src/cuda_utils.cu
libperftest_a_LIBADD = src/cuda_utils.cu.o src/cuda_utils.cu.lo
SUFFIXES= .cu
%.cu.lo: %.cu.o
$(NVCC) -dlink -o $@ $< -L$(CUDA_LIB_DIR) -lcudadevrt -lcudart
%.cu.o: %.cu
$(NVCC) -DHAVE_CONFIG_H $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) -o $@ -c $< $(NVCCFLAGS)
endif


ib_send_bw_SOURCES = src/send_bw.c src/multicast_resources.c src/multicast_resources.h
ib_send_bw_LDADD = libperftest.a $(LIBUMAD) $(LIBMATH) $(LIBMLX4) $(LIBMLX5) $(LIBEFA) $(LIBHNS)

Expand Down
33 changes: 32 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])])
AC_SUBST(MAJOR_VERSION)
AC_SUBST(MINOR_VERSION)

AC_ARG_VAR(HAVE_OPENCL, opencl support)
AC_ARG_VAR(CUDA_H_PATH, help-string)
AC_ARG_VAR(RANLIB, ranlib tool)

Expand Down Expand Up @@ -273,7 +274,7 @@ fi
if [test "$CUDA_H_PATH" ]; then
AC_DEFINE([HAVE_CUDA], [1], [Enable CUDA feature])
AC_DEFINE_UNQUOTED([CUDA_PATH], "$CUDA_H_PATH" , [Enable CUDA feature])
LIBS=$LIBS" -lcuda"
LIBS=$LIBS" -lcuda -lcudart"
AC_CHECK_LIB([cuda], [cuMemGetHandleForAddressRange], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=yes], [HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE=no])
AC_TRY_LINK([
#include <$CUDA_H_PATH>],
Expand All @@ -282,10 +283,34 @@ if [test "$CUDA_H_PATH" ]; then
if [test "x$HAVE_REG_DMABUF_MR" = "xyes"] && [test "x$HAVE_CUDA_CUMEMGETHANDLEFORADDRESSRANGE" = "xyes"] && [test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"]; then
AC_DEFINE([HAVE_CUDA_DMABUF], [1], [Enable CUDA DMABUF feature])
fi

if [test "x${gpu_arch}" != "x"]; then
NVCCFLAGS="${NVCCFLAGS} -arch compute_${gpu_arch} -code compute_${gpu_arch},sm_${gpu_arch}"
AC_MSG_NOTICE([Setting GPU_ARCH = ${gpu_arch}])
fi
NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive"
AC_CHECK_DECLS([CUDA_VERSION], [HAVE_CUDA_VERSION=yes], [HAVE_CUDA_VERSION=no], [[#include "$CUDA_H_PATH"]])
if [test "x$HAVE_CUDA_VERSION" = "xyes"]; then
cuda_toolkit_version=`grep "define CUDA_VERSION" $CUDA_H_PATH | cut -d' ' -f3`
AS_VERSION_COMPARE([$cuda_toolkit_version], [11070], [HAVE_CUDA_DIAGSUPPRESS=no], [HAVE_CUDA_DIAGSUPPRESS=yes], [HAVE_CUDA_DIAGSUPPRESS=yes])
if [test "x$HAVE_CUDA_DIAGSUPPRESS" = "xyes"]; then
NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive -diag-suppress 2464 -diag-suppress 815"
else
NVCCFLAGS="${NVCCFLAGS} -Xcompiler -fpermissive"
fi
fi

CUDA_TOOLKIT_PATH=$(dirname $(dirname $(dirname $(dirname $CUDA_H_PATH))))
AC_DEFINE_UNQUOTED([CUDA_TOOLKIT_PATH], "$CUDA_TOOLKIT_PATH", [Path to the CUDA Toolkit])
AC_SUBST([NVCCFLAGS], ["${NVCCFLAGS}"])
AC_SUBST([NVCC], ["$CUDA_TOOLKIT_PATH/bin/nvcc"])

AC_CHECK_LIB([cudart], [cudaMalloc], [], [AC_MSG_ERROR([libcudart not found])])
fi
AM_CONDITIONAL([CUDA_DMA_BUF_PARAMETERS_SUPPORT],[test "x$CUDA_DMA_BUF_PARAMETERS_SUPPORT" = "xyes"])

AM_CONDITIONAL([CUDA], [test "$CUDA_H_PATH"])
AM_CONDITIONAL([HAVE_CUDA], [test "$CUDA_H_PATH"])

AC_ARG_ENABLE([neuron],
[AS_HELP_STRING([--enable-neuron],
Expand Down Expand Up @@ -446,6 +471,12 @@ fi
CFLAGS="-g -Wall -D_GNU_SOURCE -O3 $CFLAGS"
LDFLAGS="$LDFLAGS"
LIBS=$LIBS" -lpthread"
AM_CONDITIONAL([OPENCL], [test "$HAVE_OPENCL"])
if [test $HAVE_OPENCL = yes]; then
AC_DEFINE([HAVE_OPENCL], [1], [Enable OPENCL feature])
AC_CHECK_LIB([OpenCL], [clGetDeviceIDs], [], [AC_MSG_ERROR([libOpenCL not found])])
fi

AC_SUBST([LIBUMAD])
AC_SUBST([LIBMATH])
AC_CONFIG_FILES([Makefile])
Expand Down
18 changes: 18 additions & 0 deletions man/perftest.1
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,11 @@ many different options and modes.
Not relevant for raw_ethernet_fs_rate.
System support required.
.TP
.B --cuda_mem_type=<value>
Set CUDA memory type <value>=0(device,default),1(managed),4(malloc)
Not relevant for raw_ethernet_fs_rate.
System support required.
.TP
.B --use_cuda_dmabuf
Use CUDA DMA-BUF for GPUDirect RDMA testing.
Not relevant for raw_ethernet_fs_rate.
Expand All @@ -375,6 +380,19 @@ many different options and modes.
Not relevant for raw_ethernet_fs_rate.
System support required.
.TP
.B --use_opencl=<opencl device id>
Use a specific OpenCL device for GPUDirect RDMA testing.
Not relevant for raw_ethernet_fs_rate.
System support required.
.TP
.B --opencl_platform_id=<opencl platform id>
Use a specific OpenCL platform ID.
System support required.
.TP
.B --gpu_touch=<once\\infinite>
Set GPU touch mode to test memory accesses during the testing process.
Relevant only for CUDA and OpenCL memory types.
.TP
.B --use_hugepages
Use Hugepages instead of contig, memalign allocations.
Not relevant for raw_ethernet_fs_rate.
Expand Down
115 changes: 100 additions & 15 deletions src/cuda_memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,24 @@

#define ACCEL_PAGE_SIZE (64 * 1024)

/* Human-readable names for the CUDA memory-type enum, used only in log
 * messages (indexed directly by cuda_ctx->mem_type, so the order here must
 * mirror the enum declaration; the final "CUDA_MEM_TYPES" entry is the
 * count sentinel). */
static const char *cuda_mem_type_str[] = {
"CUDA_MEM_DEVICE",
"CUDA_MEM_MANAGED",
"CUDA_MEM_HOSTALLOC",
"CUDA_MEM_HOSTREGISTER",
"CUDA_MEM_MALLOC",
"CUDA_MEM_TYPES"
};

int touch_gpu_pages(uint8_t *addr, int buf_size, int is_infinitely, volatile int **stop_flag);

struct cuda_memory_ctx {
struct memory_ctx base;
int mem_type;
int gpu_touch;
int device_id;
char *device_bus_id;
volatile int *stop_touch_gpu_kernel_flag; // used for stopping cuda gpu_touch kernel
CUdevice cuDevice;
CUcontext cuContext;
bool use_dmabuf;
Expand Down Expand Up @@ -152,20 +165,17 @@ int cuda_memory_destroy(struct memory_ctx *ctx) {
return SUCCESS;
}

int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t size, int *dmabuf_fd,
uint64_t *dmabuf_offset, void **addr, bool *can_init) {
static int cuda_allocate_device_memory_buffer(struct cuda_memory_ctx *cuda_ctx, uint64_t size, int *dmabuf_fd,
uint64_t *dmabuf_offset, void **addr, bool *can_init) {
int error;
size_t buf_size = (size + ACCEL_PAGE_SIZE - 1) & ~(ACCEL_PAGE_SIZE - 1);

// Check if discrete or integrated GPU (tegra), for allocating memory where adequate
struct cuda_memory_ctx *cuda_ctx = container_of(ctx, struct cuda_memory_ctx, base);
int cuda_device_integrated;
cuDeviceGetAttribute(&cuda_device_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_ctx->cuDevice);
printf("CUDA device integrated: %X\n", (unsigned int)cuda_device_integrated);

if (cuda_device_integrated == 1) {
printf("cuMemAllocHost() of a %lu bytes GPU buffer\n", size);

error = cuMemAllocHost(addr, buf_size);
if (error != CUDA_SUCCESS) {
printf("cuMemAllocHost error=%d\n", error);
Expand All @@ -176,15 +186,12 @@ int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t
*can_init = false;
} else {
CUdeviceptr d_A;
printf("cuMemAlloc() of a %lu bytes GPU buffer\n", size);

error = cuMemAlloc(&d_A, buf_size);
if (error != CUDA_SUCCESS) {
printf("cuMemAlloc error=%d\n", error);
return FAILURE;
}

printf("allocated GPU buffer address at %016llx pointer=%p\n", d_A, (void *)d_A);
*addr = (void *)d_A;
*can_init = false;

Expand Down Expand Up @@ -215,6 +222,64 @@ int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t
#endif
}

return CUDA_SUCCESS;
}

/*
 * Allocate the test buffer according to the configured CUDA memory type.
 *
 * ctx           - embedded base context; recovered to struct cuda_memory_ctx.
 * alignment     - page-size alignment used only for the host (MALLOC) path.
 * size          - requested buffer size in bytes.
 * dmabuf_fd/dmabuf_offset - outputs for the DMA-BUF path (device memory only).
 * addr          - out: the allocated buffer address.
 * can_init      - out: whether the caller may memset-initialize the buffer.
 *
 * Returns SUCCESS or FAILURE.  When gpu_touch is enabled, also launches the
 * GPU page-touching kernel on the new buffer before returning.
 */
int cuda_memory_allocate_buffer(struct memory_ctx *ctx, int alignment, uint64_t size, int *dmabuf_fd,
				uint64_t *dmabuf_offset, void **addr, bool *can_init) {
	int error;
	CUdeviceptr d_ptr;

	struct cuda_memory_ctx *cuda_ctx = container_of(ctx, struct cuda_memory_ctx, base);

	switch (cuda_ctx->mem_type) {
	case CUDA_MEM_DEVICE:
		error = cuda_allocate_device_memory_buffer(cuda_ctx, size, dmabuf_fd,
							   dmabuf_offset, addr, can_init);
		if (error != CUDA_SUCCESS)
			return FAILURE;
		break;
	case CUDA_MEM_MANAGED:
		error = cuMemAllocManaged(&d_ptr, size, CU_MEM_ATTACH_GLOBAL);
		if (error != CUDA_SUCCESS) {
			printf("cuMemAllocManaged error=%d\n", error);
			return FAILURE;
		}

		*addr = (void *)d_ptr;
		*can_init = false;
		break;

	case CUDA_MEM_MALLOC:
		*can_init = false;

		/* Plain host memory: %lu matches the uint64_t size (the old
		 * %zd specifier was for ssize_t). */
		printf("Host allocation selected, calling memalign allocator for %lu bytes with %d page size\n", size, alignment);
		*addr = memalign(alignment, size);
		if (!*addr) {
			printf("memalign error=%d\n", errno);
			return FAILURE;
		}

		break;
	/*
	 * TODO: Add Implementation for HOSTALLOC and HOSTREGISTER
	 * buffer allocations
	 */
	case CUDA_MEM_HOSTALLOC:
	case CUDA_MEM_HOSTREGISTER:
	default:
		printf("invalid CUDA memory type\n");
		return FAILURE;
	}

	/* BUGFIX: print the buffer address (*addr), not the address of the
	 * output parameter itself. */
	printf("allocated GPU buffer of a %lu address at %p for type %s\n", size, *addr, cuda_mem_type_str[cuda_ctx->mem_type]);

	if (cuda_ctx->gpu_touch != GPU_NO_TOUCH) {
		printf("Starting GPU touching process\n");
		/* NOTE(review): touch_gpu_pages() takes the size as int, so a
		 * buffer >= 2GB is silently truncated here — confirm whether
		 * the prototype should take uint64_t instead. */
		return touch_gpu_pages((uint8_t *)*addr, (int)size, cuda_ctx->gpu_touch == GPU_TOUCH_INFINITE, &cuda_ctx->stop_touch_gpu_kernel_flag);
	}

	return SUCCESS;
}

Expand All @@ -223,13 +288,30 @@ int cuda_memory_free_buffer(struct memory_ctx *ctx, int dmabuf_fd, void *addr, u
int cuda_device_integrated;
cuDeviceGetAttribute(&cuda_device_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, cuda_ctx->cuDevice);

if (cuda_device_integrated == 1) {
printf("deallocating GPU buffer %p\n", addr);
cuMemFreeHost(addr);
} else {
CUdeviceptr d_A = (CUdeviceptr)addr;
printf("deallocating GPU buffer %016llx\n", d_A);
cuMemFree(d_A);
if (cuda_ctx->stop_touch_gpu_kernel_flag) {
*cuda_ctx->stop_touch_gpu_kernel_flag = 1;
printf("stopping CUDA gpu touch running kernel\n");
cuCtxSynchronize();
cuMemFree((CUdeviceptr)cuda_ctx->stop_touch_gpu_kernel_flag);
}

switch (cuda_ctx->mem_type) {
case CUDA_MEM_DEVICE:
if (cuda_device_integrated == 1) {
printf("deallocating GPU buffer %p\n", addr);
cuMemFreeHost(addr);
} else {
CUdeviceptr d_A = (CUdeviceptr)addr;
printf("deallocating GPU buffer %016llx\n", d_A);
cuMemFree(d_A);
}
break;
case CUDA_MEM_MANAGED:
CUCHECK(cuMemFree((CUdeviceptr)addr));
break;
case CUDA_MEM_MALLOC:
free((void *) addr);
break;
}

return SUCCESS;
Expand Down Expand Up @@ -271,6 +353,9 @@ struct memory_ctx *cuda_memory_create(struct perftest_parameters *params) {
ctx->device_id = params->cuda_device_id;
ctx->device_bus_id = params->cuda_device_bus_id;
ctx->use_dmabuf = params->use_cuda_dmabuf;
ctx->gpu_touch = params->gpu_touch;
ctx->stop_touch_gpu_kernel_flag = NULL;
ctx->mem_type = params->cuda_mem_type;

return &ctx->base;
}
30 changes: 30 additions & 0 deletions src/cuda_utils.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <stdint.h>
#include <stdio.h>
#include "cuda.h"

#include <cuda_runtime.h>
#define GPU_TOUCH_STEP 4096

/*
 * Single-thread kernel that writes one byte per GPU_TOUCH_STEP-sized page of
 * the buffer, generating GPU-side memory traffic during the test.  Always
 * performs at least one full pass; when is_infinite is non-zero it keeps
 * looping until the host sets *stop_flag (stop_flag is only dereferenced in
 * the infinite mode).
 */
__global__ void cuda_touch_pages(volatile uint8_t *c, int size,
				 volatile int *stop_flag, int is_infinite) {
	for (;;) {
		int offset = 0;

		while (offset < size) {
			c[offset] = 0;
			offset += GPU_TOUCH_STEP;
		}

		if (!is_infinite || *stop_flag)
			break;
	}
}

/*
 * Host-side entry point (called from cuda_memory.c): allocates a managed-
 * memory stop flag visible to both host and device, then launches the
 * page-touching kernel asynchronously on the default stream.
 *
 * addr        - device-accessible buffer to touch.
 * buf_size    - buffer size in bytes.
 * is_infinite - non-zero to keep touching until the flag is raised.
 * stop_flag   - out: receives the managed flag pointer; the caller raises
 *               (*stop_flag) to 1 to stop an infinite kernel, and frees it.
 *
 * Returns 0 on success, -1 on failure.
 */
extern "C" int touch_gpu_pages(uint8_t *addr, int buf_size,
			       int is_infinite, volatile int **stop_flag)
{
	cudaError_t ret = cudaMallocManaged((void **)stop_flag, sizeof(int));
	if (ret) {
		printf("failed to allocate stop flag\n");
		return -1;
	}

	/* BUGFIX: clear the flag VALUE, not the pointer.  The previous
	 * `*stop_flag = 0;` overwrote the pointer cudaMallocManaged just
	 * stored, leaking the allocation and handing the kernel a NULL
	 * stop flag (so infinite mode could never be stopped). */
	**stop_flag = 0;

	cuda_touch_pages<<<1, 1>>>(addr, buf_size, *stop_flag, is_infinite);

	/* Kernel launches do not return errors directly; a bad launch
	 * configuration only surfaces through cudaGetLastError(). */
	ret = cudaGetLastError();
	if (ret != cudaSuccess) {
		printf("failed to launch gpu touch kernel: %s\n", cudaGetErrorString(ret));
		cudaFree((void *)*stop_flag);
		*stop_flag = NULL;
		return -1;
	}

	return 0;
}
Loading