diff --git a/scripts/nvidia_hpc_sdk b/scripts/nvidia_hpc_sdk new file mode 100755 index 00000000..c69d2632 --- /dev/null +++ b/scripts/nvidia_hpc_sdk @@ -0,0 +1,113 @@ +#!/usr/bin/env sh +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# For full license terms please see the LICENSE file distributed with this +# source code +# +# Compiles & runs models supported by NVIDIA HPC SDK for NVIDIA GPUs. +# +# Usage ./scripts/nvidia_hpc_sdk +set -ex +# Pick the HPC SDK version used with this variable +IMG=nvcr.io/nvidia/nvhpc:24.1-devel-cuda12.3-ubuntu22.04 + +MODEL=cuda +if [ ! -z $1 ]; then + MODEL=$1 +fi +#SIZE=268435456 #This is a power of 2 +SIZE=268435457 +if [ ! -z $2 ]; then + SIZE=$2 +fi + +rm -rf build || true +mkdir -p build + +docker run \ + -it \ + --gpus=all \ + --privileged \ + -u $(id -u):$(id -g) \ + -v $(pwd):/src \ + -w /src \ + $IMG \ + bash -c "set -ex && \ + cmake -Bbuild -H. \ + -DMODEL=${MODEL} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_COMPILER=nvcc \ + -DCUDA_ARCH=native \ + -DMEM=DEFAULT \ + -DAUTOTUNE=ON \ + -DVECTORIZATION=ON \ + -DCMAKE_CXX_COMPILER=g++ \ + && \ + cmake --build build -v && \ + ./build/${MODEL}-stream --arraysize ${SIZE}" + +docker run \ + -it \ + --gpus=all \ + --privileged \ + -u $(id -u):$(id -g) \ + -v $(pwd):/src \ + -w /src \ + $IMG \ + bash -c "set -ex && \ + cmake -Bbuild -H. \ + -DMODEL=${MODEL} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_COMPILER=nvcc \ + -DCUDA_ARCH=native \ + -DMEM=DEFAULT \ + -DAUTOTUNE=ON \ + -DVECTORIZATION=OFF \ + -DCMAKE_CXX_COMPILER=g++ \ + && \ + cmake --build build -v && \ + ./build/${MODEL}-stream --arraysize ${SIZE}" + +docker run \ + -it \ + --gpus=all \ + --privileged \ + -u $(id -u):$(id -g) \ + -v $(pwd):/src \ + -w /src \ + $IMG \ + bash -c "set -ex && \ + cmake -Bbuild -H. 
\ + -DMODEL=${MODEL} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_COMPILER=nvcc \ + -DCUDA_ARCH=native \ + -DMEM=DEFAULT \ + -DAUTOTUNE=OFF \ + -DVECTORIZATION=ON \ + -DCMAKE_CXX_COMPILER=g++ \ + && \ + cmake --build build -v && \ + ./build/${MODEL}-stream --arraysize ${SIZE}" + +docker run \ + -it \ + --gpus=all \ + --privileged \ + -u $(id -u):$(id -g) \ + -v $(pwd):/src \ + -w /src \ + $IMG \ + bash -c "set -ex && \ + cmake -Bbuild -H. \ + -DMODEL=${MODEL} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CUDA_COMPILER=nvcc \ + -DCUDA_ARCH=native \ + -DMEM=DEFAULT \ + -DAUTOTUNE=OFF \ + -DVECTORIZATION=OFF \ + -DCMAKE_CXX_COMPILER=g++ \ + && \ + cmake --build build -v && \ + ./build/${MODEL}-stream --arraysize ${SIZE}" diff --git a/src/cuda-simple/CUDANaiveStream.cu b/src/cuda-simple/CUDANaiveStream.cu new file mode 100644 index 00000000..75a8f3c0 --- /dev/null +++ b/src/cuda-simple/CUDANaiveStream.cu @@ -0,0 +1,345 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + + +#include "CUDAStream.h" + +void check_error(void) +{ + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Error: " << cudaGetErrorString(err) << std::endl; + exit(err); + } +} + +template +CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) +{ + + // The array size must be divisible by TBSIZE for kernel launches + if (ARRAY_SIZE % TBSIZE != 0) + { + std::stringstream ss; + ss << "Array size must be a multiple of " << TBSIZE; + throw std::runtime_error(ss.str()); + } + + // Set device + int count; + cudaGetDeviceCount(&count); + check_error(); + if (device_index >= count) + throw std::runtime_error("Invalid device index"); + cudaSetDevice(device_index); + check_error(); + + // Print out device information + std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << 
getDeviceDriver(device_index) << std::endl; +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; +#else + std::cout << "Memory: DEFAULT" << std::endl; +#endif + array_size = ARRAY_SIZE; + + + // Query device for sensible dot kernel block count + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device_index); + check_error(); + dot_num_blocks = props.multiProcessorCount * 4; + + // Allocate the host array for partial sums for dot kernels + sums = (T*)malloc(sizeof(T) * dot_num_blocks); + + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 4; + std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; + + // Check buffers fit on the device + if (props.totalGlobalMem < total_bytes) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create device buffers +#if defined(MANAGED) + cudaMallocManaged(&d_a, array_bytes); + check_error(); + cudaMallocManaged(&d_b, array_bytes); + check_error(); + cudaMallocManaged(&d_c, array_bytes); + check_error(); + cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)); + check_error(); +#elif defined(PAGEFAULT) + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); + d_sum = (T*)malloc(sizeof(T)*dot_num_blocks); +#else + cudaMalloc(&d_a, array_bytes); + check_error(); + cudaMalloc(&d_b, array_bytes); + check_error(); + cudaMalloc(&d_c, array_bytes); + check_error(); + cudaMalloc(&d_sum, dot_num_blocks*sizeof(T)); + check_error(); +#endif +} + + +template +CUDAStream::~CUDAStream() +{ + free(sums); + +#if defined(PAGEFAULT) + free(d_a); + free(d_b); + free(d_c); + free(d_sum); +#else + cudaFree(d_a); + check_error(); + cudaFree(d_b); + check_error(); + cudaFree(d_c); + check_error(); + cudaFree(d_sum); + check_error(); +#endif +} + + +template +__global__ void 
init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = initA; + b[i] = initB; + c[i] = initC; +} + +template +void CUDAStream::init_arrays(T initA, T initB, T initC) +{ + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +{ + // Copy device memory to host +#if defined(PAGEFAULT) || defined(MANAGED) + cudaDeviceSynchronize(); + for (int i = 0; i < array_size; i++) + { + a[i] = d_a[i]; + b[i] = d_b[i]; + c[i] = d_c[i]; + } +#else + cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); + cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); +#endif +} + + +template +__global__ void copy_kernel(const T * a, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i]; +} + +template +void CUDAStream::copy() +{ + copy_kernel<<>>(d_a, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void mul_kernel(T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + b[i] = scalar * c[i]; +} + +template +void CUDAStream::mul() +{ + mul_kernel<<>>(d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void add_kernel(const T * a, const T * b, T * c) +{ + const int i = blockDim.x * blockIdx.x + threadIdx.x; + c[i] = a[i] + b[i]; +} + +template +void CUDAStream::add() +{ + add_kernel<<>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void triad_kernel(T * a, const T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] = b[i] + 
scalar * c[i]; +} + +template +void CUDAStream::triad() +{ + triad_kernel<<>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void nstream_kernel(T * a, const T * b, const T * c) +{ + const T scalar = startScalar; + const int i = blockDim.x * blockIdx.x + threadIdx.x; + a[i] += b[i] + scalar * c[i]; +} + +template +void CUDAStream::nstream() +{ + nstream_kernel<<>>(d_a, d_b, d_c); + check_error(); + cudaDeviceSynchronize(); + check_error(); +} + +template +__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) +{ + __shared__ T tb_sum[TBSIZE]; + + int i = blockDim.x * blockIdx.x + threadIdx.x; + const size_t local_i = threadIdx.x; + + tb_sum[local_i] = {}; + for (; i < array_size; i += blockDim.x*gridDim.x) + tb_sum[local_i] += a[i] * b[i]; + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) + { + __syncthreads(); + if (local_i < offset) + { + tb_sum[local_i] += tb_sum[local_i+offset]; + } + } + + if (local_i == 0) + sum[blockIdx.x] = tb_sum[local_i]; +} + +template +T CUDAStream::dot() +{ + dot_kernel<<>>(d_a, d_b, d_sum, array_size); + check_error(); + +#if defined(MANAGED) || defined(PAGEFAULT) + cudaDeviceSynchronize(); + check_error(); +#else + cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost); + check_error(); +#endif + + T sum = 0.0; + for (int i = 0; i < dot_num_blocks; i++) + { +#if defined(MANAGED) || defined(PAGEFAULT) + sum += d_sum[i]; +#else + sum += sums[i]; +#endif + } + + return sum; +} + +void listDevices(void) +{ + // Get number of devices + int count; + cudaGetDeviceCount(&count); + check_error(); + + // Print device names + if (count == 0) + { + std::cerr << "No devices found." 
<< std::endl; + } + else + { + std::cout << std::endl; + std::cout << "Devices:" << std::endl; + for (int i = 0; i < count; i++) + { + std::cout << i << ": " << getDeviceName(i) << std::endl; + } + std::cout << std::endl; + } +} + + +std::string getDeviceName(const int device) +{ + cudaDeviceProp props; + cudaGetDeviceProperties(&props, device); + check_error(); + return std::string(props.name); +} + + +std::string getDeviceDriver(const int device) +{ + cudaSetDevice(device); + check_error(); + int driver; + cudaDriverGetVersion(&driver); + check_error(); + return std::to_string(driver); +} + +template class CUDAStream; +template class CUDAStream; diff --git a/src/cuda-simple/CUDANaiveStream.h b/src/cuda-simple/CUDANaiveStream.h new file mode 100644 index 00000000..d16511fe --- /dev/null +++ b/src/cuda-simple/CUDANaiveStream.h @@ -0,0 +1,54 @@ + +// Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, +// University of Bristol HPC +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#pragma once + +#include +#include +#include + +#include "Stream.h" + +#define IMPLEMENTATION_STRING "CUDA" + +#define TBSIZE 1024 + +template +class CUDAStream : public Stream +{ + protected: + // Size of arrays + int array_size; + + // Host array for partial sums for dot kernel + T *sums; + + // Device side pointers to arrays + T *d_a; + T *d_b; + T *d_c; + T *d_sum; + + // Number of blocks for dot kernel + int dot_num_blocks; + + public: + + CUDAStream(const int, const int); + ~CUDAStream(); + + virtual void copy() override; + virtual void add() override; + virtual void mul() override; + virtual void triad() override; + virtual void nstream() override; + virtual T dot() override; + + virtual void init_arrays(T initA, T initB, T initC) override; + virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + +}; diff --git a/src/cuda-simple/model.cmake b/src/cuda-simple/model.cmake new file mode 100644 index 
00000000..7c1b0d6e --- /dev/null +++ b/src/cuda-simple/model.cmake @@ -0,0 +1,44 @@ + +register_flag_optional(CMAKE_CXX_COMPILER + "Any CXX compiler that is supported by CMake detection, this is used for host compilation" + "c++") + +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + MANAGED - use CUDA Managed Memory. + PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") + +register_flag_required(CMAKE_CUDA_COMPILER + "Path to the CUDA nvcc compiler") + +# XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly +register_flag_required(CUDA_ARCH + "Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc") + +register_flag_optional(CUDA_EXTRA_FLAGS + "Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" + "") + + +macro(setup) + + # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes + if(POLICY CMP0104) + cmake_policy(SET CMP0104 OLD) + endif() + + enable_language(CUDA) + register_definitions(${MEM}) + + # add -forward-unknown-to-host-compiler for compatibility reasons + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") + + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG + # appended later + wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) + + message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") +endmacro() + diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index 75a8f3c0..5fb84d94 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -1,343 +1,447 @@ - // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// // For full license terms please see the LICENSE file distributed with this // source code - #include "CUDAStream.h" +#include +#include +#include +#include -void check_error(void) -{ - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - std::cerr << "Error: " << cudaGetErrorString(err) << std::endl; - exit(err); - } -} - -template -CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) -{ - - // The array size must be divisible by TBSIZE for kernel launches - if (ARRAY_SIZE % TBSIZE != 0) - { - std::stringstream ss; - ss << "Array size must be a multiple of " << TBSIZE; - throw std::runtime_error(ss.str()); - } +static constexpr int max_sums = 512; - // Set device - int count; - cudaGetDeviceCount(&count); - check_error(); - if (device_index >= count) - throw std::runtime_error("Invalid device index"); - cudaSetDevice(device_index); - check_error(); - - // Print out device information - std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; - std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; -#if defined(MANAGED) - std::cout << "Memory: MANAGED" << std::endl; -#elif defined(PAGEFAULT) - std::cout << "Memory: PAGEFAULT" << std::endl; -#else - std::cout << "Memory: DEFAULT" << std::endl; -#endif - array_size = ARRAY_SIZE; +[[noreturn]] inline void error(char const* file, int line, cudaError_t e) { + std::fprintf(stderr, "Error at %s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); + exit(e); +} +#define CU(EXPR) if (auto __e = (EXPR); __e != cudaSuccess) error(__FILE__, __LINE__, __e); - // Query device for sensible dot kernel block count - cudaDeviceProp props; - cudaGetDeviceProperties(&props, device_index); - check_error(); - dot_num_blocks = props.multiProcessorCount * 4; +__host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; } - // Allocate the host array for partial sums for dot kernels - sums = (T*)malloc(sizeof(T) * dot_num_blocks); 
+cudaStream_t* stream(long long& s) { return (cudaStream_t*)&s; } - size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 4; - std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE << std::endl; +template +__device__ int for_each(/*grid_group _,*/ int n, UnaryFunction&& f) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + for (; i < n; i += gridDim.x * blockDim.x) + f(i); + return i; +} - // Check buffers fit on the device - if (props.totalGlobalMem < total_bytes) - throw std::runtime_error("Device does not have enough memory for all 3 buffers"); +template +__device__ int for_each16(/*grid_group _,*/ int n, UnaryFunction&& f) { + constexpr int w = 16 / sizeof(T); + int i = blockDim.x * blockIdx.x + threadIdx.x; + for (; i < ceil_div(n, w) - 1; i += gridDim.x * blockDim.x) + for (int j = i * w; j < (i * w + w); ++j) f(j); + i *= w; + for (; i < n; ++i) f(i); + return i; +} - // Create device buffers -#if defined(MANAGED) - cudaMallocManaged(&d_a, array_bytes); - check_error(); - cudaMallocManaged(&d_b, array_bytes); - check_error(); - cudaMallocManaged(&d_c, array_bytes); - check_error(); - cudaMallocManaged(&d_sum, dot_num_blocks*sizeof(T)); - check_error(); -#elif defined(PAGEFAULT) - d_a = (T*)malloc(array_bytes); - d_b = (T*)malloc(array_bytes); - d_c = (T*)malloc(array_bytes); - d_sum = (T*)malloc(sizeof(T)*dot_num_blocks); +template +struct V { + static constexpr int w = 16/sizeof(T); + alignas(16) T v[w]; + __host__ __device__ constexpr T& operator[](int i) { return v[i]; } + __host__ __device__ constexpr T operator[](int i) const { return v[i]; } +}; + +template using outs = cuda::std::array; +template using ins = cuda::std::array; +template using vals = cuda::std::array; + +template +__device__ void for_each_vec(/*grid_group _,*/ int n, outs d, ins s, UnaryFunction&& f) { +#if defined(VECTORIZATION) + using V = V; + constexpr int w = sizeof(V) / sizeof(T); + + 
cuda::std::array dv; + for (int i = 0; i < N; ++i) dv[i] = (V*)d[i]; + cuda::std::array sv; + for (int i = 0; i < M; ++i) sv[i] = (V*)s[i]; + + const auto vl = ceil_div(n, w); + int i = for_each(vl - 1, [&](int i) { + cuda::std::array svv; + for (int j = 0; j < M; ++j) svv[j] = sv[j][i]; + cuda::std::array dvv; + for (int k = 0; k < w; ++k) { + cuda::std::array ins; + for (int j = 0; j < M; ++j) ins[j] = svv[j][k]; + cuda::std::array outs{cuda::std::apply(f, ins)}; + for (int j = 0; j < N; ++j) dvv[j][k] = outs[k]; + } + for (int j = 0; j < N; ++j) dv[j][i] = dvv[j]; + }); + if (i == (vl - 1)) { + for (int k = w * i; k < n; ++k) { + cuda::std::array ins; + for (int j = 0; j < M; ++j) ins[j] = s[j][k]; + cuda::std::array outs{cuda::std::apply(f, ins)}; + for (int j = 0; j < N; ++j) d[j][k] = outs[k]; + } + } +#elif defined(SIMPLE_VECTORIZATION) + for_each16(n, [&](int i) { + cuda::std::array ins; + for (int j = 0; j < M; ++j) ins[j] = s[j][i]; + cuda::std::array outs{cuda::std::apply(f, ins)}; + for (int j = 0; j < N; ++j) d[j][i] = outs[j]; + }); #else - cudaMalloc(&d_a, array_bytes); - check_error(); - cudaMalloc(&d_b, array_bytes); - check_error(); - cudaMalloc(&d_c, array_bytes); - check_error(); - cudaMalloc(&d_sum, dot_num_blocks*sizeof(T)); - check_error(); -#endif + for_each(n, [&](int i) { + cuda::std::array ins; + for (int j = 0; j < M; ++j) ins[j] = s[j][i]; + cuda::std::array outs{cuda::std::apply(f, ins)}; + for (int j = 0; j < N; ++j) d[j][i] = outs[j]; + }); +#endif } - -template -CUDAStream::~CUDAStream() -{ - free(sums); - -#if defined(PAGEFAULT) - free(d_a); - free(d_b); - free(d_c); - free(d_sum); +template +struct sum_t { + alignas(512) T data; +}; + +void blocks_and_threads(int& minGridSize, int& blockSize, size_t array_size, void* func, int esize, + int maxBlockSize = 256, int maxWaveSize = 64) { + auto dyn_smem = [] __host__ __device__ (int){ return 0; }; + CU(cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, func, 
dyn_smem, 0)); + auto nthreads = minGridSize * blockSize; + // Clamp at 256 threads: + blockSize = std::min(blockSize, maxBlockSize); + minGridSize = nthreads / blockSize; +#if defined(VECTORIZATION) + int vw = 16 / esize; #else - cudaFree(d_a); - check_error(); - cudaFree(d_b); - check_error(); - cudaFree(d_c); - check_error(); - cudaFree(d_sum); - check_error(); -#endif + int vw = 1; +#endif + int actualGridSize = ceil_div(array_size / vw, blockSize); + if (maxWaveSize > -1) { + // Clamp at n thread block waves: + minGridSize = std::min(actualGridSize, maxWaveSize * minGridSize); + } else { + minGridSize = actualGridSize; + } } +template +void autotune(char const* name, int& minGridSize, int& blockSize, int& num_dot_sums, + size_t array_size, void* func, int esize, F&& kernel) { + bool with_sums = num_dot_sums != -1; + int minWaves = 0; +#if defined(AUTOTUNE) + constexpr int niter = 20; + double dt = std::numeric_limits::max(); + int minGridLocal = 0, minBlockLocal = 0, minSums = max_sums; + std::vector num_sums{-1}; + std::vector block_sizes{128, 256, 512, 1024}; + + if (with_sums) { + block_sizes = std::vector{512, 1024}; + num_sums = std::vector{1, 2, 8, 32, 64, 128, 256, max_sums}; + } + for (auto bs : block_sizes) { + for (auto ws : {-1, 8, 16, 32, 64, 128, 256}) { + for (auto ns : num_sums) { + if (ns > max_sums) abort(); + if (with_sums) num_dot_sums = ns; + + using clk_t = std::chrono::high_resolution_clock; + using dur_t = std::chrono::duration; + + blocks_and_threads(minGridSize, blockSize, array_size, func, esize, bs, ws); + + kernel(); + auto s = clk_t::now(); + for (int it = 0; it < niter; ++it) kernel(); + auto t = (clk_t::now() - s).count(); + if (t < dt) { + minGridLocal = minGridSize; + minBlockLocal = blockSize; + minWaves = ws; + if (ns != -1) minSums = ns; + dt = t; + } + } + } + } + minGridSize = minGridLocal; + blockSize = minBlockLocal; + if (with_sums) num_dot_sums = minSums; +#else + if (with_sums) { + minWaves = 64; + 
blocks_and_threads(minGridSize, blockSize, array_size, func, esize, 1024, minWaves); + } else { + minWaves = -1; + blocks_and_threads(minGridSize, blockSize, array_size, func, esize, 256, minWaves); + } +#endif + + std::cout << name << " kernel config: " << minGridSize << " groups of (fixed) size " << blockSize + << " in " << minWaves << " waves "; + if (with_sums) + std::cout << " with " << num_dot_sums << " sums"; + std::cout << std::endl; +} template -__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] = initA; - b[i] = initB; - c[i] = initC; +__global__ void init_kernel(T * a, T * b, T * c, T initA, T initB, T initC, size_t array_size) { + for_each(array_size, [=](int i) { + a[i] = initA; + b[i] = initB; + c[i] = initC; + }); } template -void CUDAStream::init_arrays(T initA, T initB, T initC) -{ - init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC); - check_error(); - cudaDeviceSynchronize(); - check_error(); +void CUDAStream::init_arrays(T initA, T initB, T initC) { + constexpr int threads_per_block = 256; + size_t blocks = ceil_div(array_size, threads_per_block); + init_kernel<<>>(d_a, d_b, d_c, initA, initB, initC, array_size); + CU(cudaStreamSynchronize(*stream(s))); } template -void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) -{ +void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) { // Copy device memory to host #if defined(PAGEFAULT) || defined(MANAGED) - cudaDeviceSynchronize(); - for (int i = 0; i < array_size; i++) - { + CU(cudaStreamSynchronize(*stream(s))); + for (size_t i = 0; i < array_size; i++) { a[i] = d_a[i]; b[i] = d_b[i]; c[i] = d_c[i]; } #else - cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost); - check_error(); - cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost); - check_error(); - cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost); - 
check_error(); + CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost)); #endif } - template -__global__ void copy_kernel(const T * a, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i]; +__global__ void copy_kernel(const T * __restrict a, T * __restrict c, size_t array_size) { + a = (const T*)__builtin_assume_aligned(a, 16); + c = (T*)__builtin_assume_aligned(c, 16); + for_each_vec(array_size, outs{c}, ins{a}, [=](T a) { + return a; + }); } template -void CUDAStream::copy() -{ - copy_kernel<<>>(d_a, d_c); - check_error(); - cudaDeviceSynchronize(); - check_error(); +void CUDAStream::copy() { + copy_kernel<<>>(d_a, d_c, array_size); + CU(cudaStreamSynchronize(*stream(s))); } template -__global__ void mul_kernel(T * b, const T * c) -{ +__global__ void mul_kernel(T * b, const T * c, size_t array_size) { const T scalar = startScalar; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - b[i] = scalar * c[i]; + for_each_vec(array_size, outs{b}, ins{c}, [](T c) { + return c * scalar; + }); } template -void CUDAStream::mul() -{ - mul_kernel<<>>(d_b, d_c); - check_error(); - cudaDeviceSynchronize(); - check_error(); +void CUDAStream::mul() { + mul_kernel<<>>(d_b, d_c, array_size); + CU(cudaStreamSynchronize(*stream(s))); } template -__global__ void add_kernel(const T * a, const T * b, T * c) -{ - const int i = blockDim.x * blockIdx.x + threadIdx.x; - c[i] = a[i] + b[i]; +__global__ void add_kernel(const T * a, const T * b, T * c, size_t array_size) { + for_each_vec(array_size, outs{c}, ins{a, b}, [](T a, T b) { + return a + b; + }); } template -void CUDAStream::add() -{ - add_kernel<<>>(d_a, d_b, d_c); - check_error(); - cudaDeviceSynchronize(); - check_error(); +void CUDAStream::add() { + add_kernel<<>>(d_a, d_b, d_c, array_size); + CU(cudaStreamSynchronize(*stream(s))); 
} template -__global__ void triad_kernel(T * a, const T * b, const T * c) -{ +__global__ void triad_kernel(T * a, const T * b, const T * c, size_t array_size) { const T scalar = startScalar; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] = b[i] + scalar * c[i]; + for_each_vec(array_size, outs{a}, ins{b, c}, [](T b, T c) { + return b + c * scalar; + }); } template -void CUDAStream::triad() -{ - triad_kernel<<>>(d_a, d_b, d_c); - check_error(); - cudaDeviceSynchronize(); - check_error(); +void CUDAStream::triad() { + triad_kernel<<>>(d_a, d_b, d_c, array_size); + CU(cudaStreamSynchronize(*stream(s))); } template -__global__ void nstream_kernel(T * a, const T * b, const T * c) -{ +__global__ void nstream_kernel(T * a, const T * b, const T * c, size_t array_size) { const T scalar = startScalar; - const int i = blockDim.x * blockIdx.x + threadIdx.x; - a[i] += b[i] + scalar * c[i]; + for_each_vec(array_size, outs{a}, ins{a, b, c}, [=](T a, T b, T c) { + return a + b + scalar * c; + }); } template -void CUDAStream::nstream() -{ - nstream_kernel<<>>(d_a, d_b, d_c); - check_error(); - cudaDeviceSynchronize(); - check_error(); +void CUDAStream::nstream() { + nstream_kernel<<>>(d_a, d_b, d_c, array_size); + CU(cudaStreamSynchronize(*stream(s))); } template -__global__ void dot_kernel(const T * a, const T * b, T * sum, int array_size) -{ - __shared__ T tb_sum[TBSIZE]; - - int i = blockDim.x * blockIdx.x + threadIdx.x; - const size_t local_i = threadIdx.x; - - tb_sum[local_i] = {}; - for (; i < array_size; i += blockDim.x*gridDim.x) - tb_sum[local_i] += a[i] * b[i]; - - for (int offset = blockDim.x / 2; offset > 0; offset /= 2) - { - __syncthreads(); - if (local_i < offset) - { - tb_sum[local_i] += tb_sum[local_i+offset]; - } +__global__ void dot_kernel(const T * a, const T * b, sum_t* sums, int num_sums, int array_size) { + namespace cg = cooperative_groups; + using V = V; + __shared__ T data[32]; + + T init = T{0}; + for_each_vec(array_size, outs{}, ins{a, b}, 
[&init](T a, T b) { + init += a * b; + return vals{}; + }); + + auto tile = cg::tiled_partition<32>(cg::this_thread_block()); + auto r = cg::reduce(tile, init, cg::plus{}); + cg::invoke_one(tile, [&] { + data[tile.meta_group_rank()] = r; + }); + __syncthreads(); + if (threadIdx.x < tile.meta_group_size()) { + auto g = cg::coalesced_threads(); + auto r = cg::reduce(g, data[threadIdx.x], cg::plus{}); + cg::invoke_one(g, [&] { + atomicAdd(&sums[blockIdx.x % num_sums].data, r); + }); } +} - if (local_i == 0) - sum[blockIdx.x] = tb_sum[local_i]; +template +T CUDAStream::dot() { + sum_t* p = (sum_t*)sums; + for (int i = 0; i < num_dot_sums; ++i) p[i].data = 0; + dot_kernel<<>>(d_a, d_b, p, num_dot_sums, array_size); + CU(cudaStreamSynchronize(*stream(s))); + T sum = 0; + for (int i = 0; i < num_dot_sums; ++i) sum += p[i].data; + return sum; } template -T CUDAStream::dot() -{ - dot_kernel<<>>(d_a, d_b, d_sum, array_size); - check_error(); - -#if defined(MANAGED) || defined(PAGEFAULT) - cudaDeviceSynchronize(); - check_error(); +CUDAStream::CUDAStream(const int ARRAY_SIZE, const int device_index) { + // Set device + int count; + CU(cudaGetDeviceCount(&count)); + if (device_index >= count) + throw std::runtime_error("Invalid device index"); + CU(cudaSetDevice(device_index)); + + // Print out device information + std::cout << "Using CUDA device " << getDeviceName(device_index) << std::endl; + std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; +#if defined(MANAGED) + std::cout << "Memory: MANAGED" << std::endl; +#elif defined(PAGEFAULT) + std::cout << "Memory: PAGEFAULT" << std::endl; #else - cudaMemcpy(sums, d_sum, dot_num_blocks*sizeof(T), cudaMemcpyDeviceToHost); - check_error(); + std::cout << "Memory: DEFAULT" << std::endl; #endif + array_size = ARRAY_SIZE; + + CU(cudaStreamCreate(stream(s))); - T sum = 0.0; - for (int i = 0; i < dot_num_blocks; i++) - { -#if defined(MANAGED) || defined(PAGEFAULT) - sum += d_sum[i]; + // Check buffers fit on the 
device + size_t array_bytes = sizeof(T); + array_bytes *= ARRAY_SIZE; + size_t total_bytes = array_bytes * 4; + + cudaDeviceProp props; + CU(cudaGetDeviceProperties(&props, device_index)); + if (props.totalGlobalMem < total_bytes) + throw std::runtime_error("Device does not have enough memory for all 3 buffers"); + + // Create device buffers +#if defined(MANAGED) + CU(cudaMallocManaged(&d_a, array_bytes)); + CU(cudaMallocManaged(&d_b, array_bytes)); + CU(cudaMallocManaged(&d_c, array_bytes)); +#elif defined(PAGEFAULT) + d_a = (T*)malloc(array_bytes); + d_b = (T*)malloc(array_bytes); + d_c = (T*)malloc(array_bytes); #else - sum += sums[i]; + CU(cudaMalloc(&d_a, array_bytes)); + CU(cudaMalloc(&d_b, array_bytes)); + CU(cudaMalloc(&d_c, array_bytes)); #endif - } + sums = (long long*)malloc(sizeof(sum_t) * max_sums); + CU(cudaHostRegister(sums, sizeof(sum_t) * max_sums, cudaHostRegisterDefault)); + num_dot_sums = -1; + + // Query sensible device properties for the different kernels + autotune("Copy", num_blocks_copy, num_threads_copy, num_dot_sums, array_size, (void*)copy_kernel, sizeof(T), [&] { copy(); }); + autotune("Mul", num_blocks_mul, num_threads_mul, num_dot_sums, array_size, (void*)mul_kernel, sizeof(T), [&] { mul(); }); + autotune("Add", num_blocks_add, num_threads_add, num_dot_sums, array_size, (void*)add_kernel, sizeof(T), [&] { add(); }); + autotune("Triad", num_blocks_triad, num_threads_triad, num_dot_sums, array_size, (void*)triad_kernel, sizeof(T), [&] { triad(); }); + autotune("Nstream", num_blocks_nstream, num_threads_nstream, num_dot_sums, array_size, (void*)nstream_kernel, sizeof(T), [&] { nstream(); }); + num_dot_sums = max_sums; + autotune("Dot", num_blocks_dot, num_threads_dot, num_dot_sums, array_size, (void*)dot_kernel, sizeof(T), [&] { dot(); }); +} - return sum; +template +CUDAStream::~CUDAStream() { +#if defined(PAGEFAULT) + free(d_a); + free(d_b); + free(d_c); +#else + CU(cudaFree(d_a)); + CU(cudaFree(d_b)); + CU(cudaFree(d_c)); + 
CU(cudaStreamDestroy(*stream(s))); +#endif + CU(cudaHostUnregister(sums)); + free(sums); } - -void listDevices(void) -{ + +void listDevices(void) { // Get number of devices int count; - cudaGetDeviceCount(&count); - check_error(); + CU(cudaGetDeviceCount(&count)); // Print device names - if (count == 0) - { + if (count == 0) { std::cerr << "No devices found." << std::endl; - } - else - { + } else { std::cout << std::endl; std::cout << "Devices:" << std::endl; - for (int i = 0; i < count; i++) - { + for (int i = 0; i < count; i++) { std::cout << i << ": " << getDeviceName(i) << std::endl; } std::cout << std::endl; } } - -std::string getDeviceName(const int device) -{ +std::string getDeviceName(const int device) { cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); - check_error(); + CU(cudaGetDeviceProperties(&props, device)); return std::string(props.name); } -std::string getDeviceDriver(const int device) -{ - cudaSetDevice(device); - check_error(); +std::string getDeviceDriver(const int device) { + CU(cudaSetDevice(device)); int driver; - cudaDriverGetVersion(&driver); - check_error(); + CU(cudaDriverGetVersion(&driver)); return std::to_string(driver); } diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index d16511fe..74d8a37e 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -1,7 +1,8 @@ - // Copyright (c) 2015-16 Tom Deakin, Simon McIntosh-Smith, // University of Bristol HPC // +// Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+// // For full license terms please see the LICENSE file distributed with this // source code @@ -15,26 +16,29 @@ #define IMPLEMENTATION_STRING "CUDA" -#define TBSIZE 1024 - template class CUDAStream : public Stream { protected: // Size of arrays - int array_size; + size_t array_size; // Host array for partial sums for dot kernel - T *sums; + long long* sums; + int num_dot_sums; // Device side pointers to arrays - T *d_a; - T *d_b; - T *d_c; - T *d_sum; + T *d_a, *d_b, *d_c; + + // Number of blocks per grid: + int num_blocks_copy, num_blocks_mul, num_blocks_add, + num_blocks_triad, num_blocks_dot, num_blocks_nstream; + + // Number of threads per block: + int num_threads_copy, num_threads_mul, num_threads_add, + num_threads_triad, num_threads_dot, num_threads_nstream; - // Number of blocks for dot kernel - int dot_num_blocks; + long long s; public: diff --git a/src/cuda/model.cmake b/src/cuda/model.cmake index 7c1b0d6e..8f3744f7 100644 --- a/src/cuda/model.cmake +++ b/src/cuda/model.cmake @@ -9,6 +9,9 @@ register_flag_optional(MEM "Device memory mode: PAGEFAULT - shared memory, only host pointers allocated." 
"DEFAULT") +register_flag_optional(AUTOTUNE "Auto-tune launch config" "AUTOTUNE") +register_flag_optional(VECTORIZATION "Vectorization" "VECTORIZATION") + register_flag_required(CMAKE_CUDA_COMPILER "Path to the CUDA nvcc compiler") @@ -20,8 +23,14 @@ register_flag_optional(CUDA_EXTRA_FLAGS "Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" "") - macro(setup) + set(CMAKE_CXX_STANDARD 20) + set(CMAKE_CUDA_STANDARD 20) + + if(NOT DEFINED CMAKE_CUDA20_STANDARD_COMPILE_OPTION) + set(CMAKE_CUDA20_STANDARD_COMPILE_OPTION "") + set(CMAKE_CUDA20_EXTENSION_COMPILE_OPTION "") + endif() # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes if(POLICY CMP0104) @@ -32,9 +41,16 @@ macro(setup) register_definitions(${MEM}) # add -forward-unknown-to-host-compiler for compatibility reasons - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" "-arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS}) + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler" + "-arch=${CUDA_ARCH}" "--extended-lambda" ${CUDA_EXTRA_FLAGS}) string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}") - + if (AUTOTUNE) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DAUTOTUNE") + endif() + if (VECTORIZATION) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DVECTORIZATION") + endif() + # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG # appended later wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE})