From e7251f6540a6ca398248fc95ed664717522ea287 Mon Sep 17 00:00:00 2001
From: Ben Ashbaugh
Date: Tue, 17 Sep 2024 21:54:18 -0700
Subject: [PATCH] initial version

---
 include/CL/opencl.hpp                  |   6 +
 samples/16_floatatomics/CMakeLists.txt |  10 ++
 samples/16_floatatomics/README.md      |  21 +++
 samples/16_floatatomics/main.cpp       | 226 +++++++++++++++++++++++++
 samples/CMakeLists.txt                 |   1 +
 5 files changed, 264 insertions(+)
 create mode 100644 samples/16_floatatomics/CMakeLists.txt
 create mode 100644 samples/16_floatatomics/README.md
 create mode 100644 samples/16_floatatomics/main.cpp

diff --git a/include/CL/opencl.hpp b/include/CL/opencl.hpp
index 88351de..a134a2b 100644
--- a/include/CL/opencl.hpp
+++ b/include/CL/opencl.hpp
@@ -1799,6 +1799,12 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LO
 CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR, cl_device_kernel_clock_capabilities_khr)
 #endif /* cl_khr_kernel_clock */
 
+#if defined(cl_ext_float_atomics)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
+#endif /* cl_ext_float_atomics */
+
 #if defined(cl_intel_command_queue_families)
 CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)
 #endif // cl_intel_command_queue_families
diff --git a/samples/16_floatatomics/CMakeLists.txt b/samples/16_floatatomics/CMakeLists.txt
new file mode 100644
index 0000000..c60536d
--- /dev/null
+++ b/samples/16_floatatomics/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright (c) 2019-2024 Ben Ashbaugh
+#
+# SPDX-License-Identifier: MIT
+
+add_opencl_sample(
+    TEST
+    NUMBER 16
+    TARGET floatatomics
+    VERSION 120
+    SOURCES main.cpp)
diff --git a/samples/16_floatatomics/README.md b/samples/16_floatatomics/README.md
new file mode 100644
index 0000000..9c57aad
--- /dev/null
+++ b/samples/16_floatatomics/README.md
@@ -0,0 +1,21 @@
+# Floating-point Atomic Adds
+
+## Sample Purpose
+
+TODO
+
+Inspired by: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html
+
+## Key APIs and Concepts
+
+TODO
+
+## Command Line Options
+
+| Option | Default Value | Description |
+|:--|:-:|:--|
+| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute the sample on.
+| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
+| `-i <number>` | 16 | Specify the number of iterations to execute.
+| `--gwx <number>` | 1024 | Specify the global work size to execute, which is also the number of floating-point atomics to perform.
+| `-e` | N/A | Unconditionally use the emulated floating-point atomic add.
diff --git a/samples/16_floatatomics/main.cpp b/samples/16_floatatomics/main.cpp
new file mode 100644
index 0000000..bd01177
--- /dev/null
+++ b/samples/16_floatatomics/main.cpp
@@ -0,0 +1,226 @@
+/*
+// Copyright (c) 2019-2024 Ben Ashbaugh
+//
+// SPDX-License-Identifier: MIT
+*/
+
+#include <popl/popl.hpp>
+
+#include <CL/opencl.hpp>
+
+#include <chrono>
+#include <cinttypes>
+
+#include "util.hpp"
+
+static const char kernelString[] = R"CLC(
+// Atomically adds val to the float at addr.  The implementation is chosen
+// when the kernel is compiled: the built-in atomic_fetch_add when the
+// compiler defines __opencl_c_ext_fp32_global_atomic_add, inline PTX when
+// cl_nv_pragma_unroll is defined (presumably an NVIDIA compiler), and an
+// atomic_xchg-based emulation otherwise.  Defining EMULATE forces emulation.
+inline float atomic_add_f(volatile global float* addr, float val)
+{
+    #if defined(__opencl_c_ext_fp32_global_atomic_add) && !defined(EMULATE)
+        return atomic_fetch_add((volatile global atomic_float*)addr, val);
+    #elif defined(cl_nv_pragma_unroll) && !defined(EMULATE)
+        float ret; asm volatile("atom.global.add.f32 %0,[%1],%2;":"=f"(ret):"l"(addr),"f"(val):"memory");
+        return ret;
+    #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
+        // Take the current value out of addr (leaving zero), add val to it, and
+        // try to put the sum back.  A nonzero value coming back from the exchange
+        // means another work-item stored its sum in the meantime, so take the
+        // stored value out again, fold it in, and retry.
+        float ret = atomic_xchg(addr, 0.0f);
+        float old = ret + val;
+        while((old = atomic_xchg(addr, old)) != 0.0f) {
+            old = atomic_xchg(addr, 0.0f) + old;
+        }
+        return ret;
+    #endif
+}
+
+kernel void FloatAtomicTest(global float* dst)
+{
+    atomic_add_f(dst, 1.0f);
+}
+)CLC";
+
+// Prints, one per line, the name of every known floating-point atomic
+// capability bit that is set in caps, followed by the hexadecimal value of
+// any remaining bits that this sample does not recognize.
+static void PrintFloatAtomicCapabilities(
+    cl_device_fp_atomic_capabilities_ext caps )
+{
+    if (caps & CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT ) printf("\t\tCL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT\n");
+    if (caps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT        ) printf("\t\tCL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT\n");
+    if (caps & CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT    ) printf("\t\tCL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT\n");
+    if (caps & CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT  ) printf("\t\tCL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT\n");
+    if (caps & CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT         ) printf("\t\tCL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT\n");
+    if (caps & CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT     ) printf("\t\tCL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT\n");
+
+    // Whatever is left after masking off the bits handled above is unknown.
+    cl_device_fp_atomic_capabilities_ext extra = caps & ~(
+        CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT |
+        CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT |
+        CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT |
+        CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT |
+        CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT |
+        CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT );
+    if (extra) {
+        printf("\t\t(Unknown capability: %016" PRIx64 ")\n", extra);
+    }
+}
+
+// Parses the command line, reports the floating-point atomic capabilities of
+// the selected device, then times `iterations` launches of the test kernel
+// and checks that the final launch accumulated exactly gwx into the buffer.
+int main(
+    int argc,
+    char** argv )
+{
+    int platformIndex = 0;
+    int deviceIndex = 0;
+
+    size_t iterations = 16;
+    size_t gwx = 1024;
+
+    bool emulate = false;
+
+    {
+        popl::OptionParser op("Supported Options");
+        op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
+        op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
+        op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
+        op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
+        op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
+
+        bool printUsage = false;
+        try {
+            op.parse(argc, argv);
+        } catch (std::exception& e) {
+            fprintf(stderr, "Error: %s\n\n", e.what());
+            printUsage = true;
+        }
+
+        if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
+            fprintf(stderr,
+                "Usage: floatatomics [options]\n"
+                "%s", op.help().c_str());
+            return -1;
+        }
+    }
+
+    std::vector<cl::Platform> platforms;
+    cl::Platform::get(&platforms);
+
+    // Reject an out-of-range index instead of indexing past the end of the vector.
+    if (platformIndex < 0 || static_cast<size_t>(platformIndex) >= platforms.size()) {
+        fprintf(stderr, "Error: platform index %d is out of range.\n", platformIndex);
+        return -1;
+    }
+
+    printf("Running on platform: %s\n",
+        platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );
+
+    std::vector<cl::Device> devices;
+    platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
+
+    if (deviceIndex < 0 || static_cast<size_t>(deviceIndex) >= devices.size()) {
+        fprintf(stderr, "Error: device index %d is out of range.\n", deviceIndex);
+        return -1;
+    }
+
+    printf("Running on device: %s\n",
+        devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
+
+    if (checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
+        printf("Device supports " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
+
+        cl_device_fp_atomic_capabilities_ext spcaps =
+            devices[deviceIndex].getInfo<CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT>();
+        printf("CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
+        PrintFloatAtomicCapabilities(spcaps);
+
+        cl_device_fp_atomic_capabilities_ext dpcaps =
+            devices[deviceIndex].getInfo<CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT>();
+        printf("CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
+        PrintFloatAtomicCapabilities(dpcaps);
+
+        cl_device_fp_atomic_capabilities_ext hpcaps =
+            devices[deviceIndex].getInfo<CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT>();
+        printf("CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT:\n");
+        PrintFloatAtomicCapabilities(hpcaps);
+
+        if ((spcaps & CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT) == 0) {
+            printf("Device does not support fp32 atomic add.\n");
+        }
+    } else {
+        printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
+    }
+
+    if (emulate) {
+        printf("Forcing emulation.\n");
+    }
+
+    cl::Context context{devices[deviceIndex]};
+    cl::CommandQueue commandQueue{context, devices[deviceIndex]};
+
+    cl::Program program{ context, kernelString };
+    program.build(emulate ? "-DEMULATE" : nullptr);
+    cl::Kernel kernel = cl::Kernel{ program, "FloatAtomicTest" };
+
+    cl::Buffer deviceMemDst = cl::Buffer{
+        context,
+        CL_MEM_READ_WRITE,
+        sizeof(cl_float) };
+
+    // execution
+    {
+        kernel.setArg(0, deviceMemDst);
+
+        // Ensure the queue is empty and no processing is happening
+        // on the device before starting the timer.
+        commandQueue.finish();
+
+        auto start = std::chrono::system_clock::now();
+        for( size_t i = 0; i < iterations; i++ )
+        {
+            cl_float zero = 0.0f;
+            commandQueue.enqueueFillBuffer(
+                deviceMemDst,
+                zero,
+                0,
+                sizeof(zero));
+            commandQueue.enqueueNDRangeKernel(
+                kernel,
+                cl::NullRange,
+                cl::NDRange{gwx});
+        }
+
+        // Ensure all processing is complete before stopping the timer.
+        commandQueue.finish();
+
+        auto end = std::chrono::system_clock::now();
+        std::chrono::duration<float> elapsed_seconds = end - start;
+        printf("Finished in %f seconds\n", elapsed_seconds.count());
+    }
+
+    // validation
+    {
+        cl_float result = 0.0f;
+        commandQueue.enqueueReadBuffer(
+            deviceMemDst,
+            CL_TRUE,
+            0,
+            sizeof(result),
+            &result);
+        if (result != (float)gwx) {
+            printf("Error: expected %f, got %f!\n", (float)gwx, result);
+        } else {
+            printf("Success.\n");
+        }
+    }
+
+    return 0;
+}
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index f0439d7..a55e2f5 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -75,6 +75,7 @@ add_subdirectory( 05_spirvkernelfromfile )
 add_subdirectory( 06_ndrangekernelfromfile )
 
 add_subdirectory( 10_queueexperiments )
+add_subdirectory( 16_floatatomics )
 
 set(BUILD_EXTENSION_SAMPLES TRUE)
 if(NOT TARGET OpenCLExt)