From 5f6e714bdd8d34c305876f392df9cc569df33e5c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:17:17 +0100 Subject: [PATCH 01/64] Add options for std::vector or raw pointers for TBB/STD --- src/std-data/STDDataStream.cpp | 55 ++++++++++++++++++--------- src/std-data/STDDataStream.h | 8 ++-- src/std-data/model.cmake | 8 +++- src/std-indices/STDIndicesStream.cpp | 53 +++++++++++++++++--------- src/std-indices/STDIndicesStream.h | 13 +++++-- src/std-indices/model.cmake | 8 +++- src/std-ranges/STDRangesStream.cpp | 56 +++++++++++++++++++--------- src/std-ranges/STDRangesStream.hpp | 8 ++-- src/std-ranges/model.cmake | 7 ++++ src/tbb/TBBStream.cpp | 32 ++++++++++++++-- src/tbb/TBBStream.hpp | 13 +++++-- src/tbb/model.cmake | 11 +++++- 12 files changed, 200 insertions(+), 72 deletions(-) diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 343e2470..2dead3b4 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -10,60 +10,79 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + // There are three execution policies: // auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; +constexpr auto exe_policy = std::execution::par_unseq; template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, a(array_size), b(array_size), c(array_size) -{ -} + noexcept : array_size{ARRAY_SIZE}, +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else +array_size(ARRAY_SIZE), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDDataStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, BEGIN(a), END(a), initA); + std::fill(exe_policy, BEGIN(b), END(b), initB); + std::fill(exe_policy, BEGIN(c), END(c), initC); } template void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDDataStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); } template void STDDataStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, c.begin(), c.end(), b.begin(), [scalar = startScalar](T ci){ return scalar*ci; }); + std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); } template void STDDataStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), c.begin(), std::plus()); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b.begin(), b.end(), c.begin(), a.begin(), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); } template @@ -73,8 +92,8 @@ void STDDataStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a.begin(), a.end(), b.begin(), a.begin(), [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a.begin(), a.end(), c.begin(), a.begin(), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); } @@ -82,7 +101,7 @@ template T STDDataStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -102,3 +121,5 @@ std::string getDeviceDriver(const int) template class STDDataStream; template class STDDataStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 741fd6ce..84b4dcfb 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -21,9 +21,11 @@ class STDDataStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index ef69f304..6f87bc94 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -28,6 +32,8 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 2221f903..8c0958c3 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -10,46 +10,63 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + // There are three execution policies: // auto exe_policy = std::execution::seq; // auto exe_policy = std::execution::par; -auto exe_policy = std::execution::par_unseq; - +constexpr auto exe_policy = std::execution::par_unseq; template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), a(array_size), b(array_size), c(array_size) -{ -} + noexcept : array_size{ARRAY_SIZE}, range(0, array_size), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDIndicesStream::init_arrays(T initA, T initB, T initC) { - std::fill(exe_policy, a.begin(), a.end(), initA); - std::fill(exe_policy, b.begin(), b.end(), initB); - std::fill(exe_policy, c.begin(), c.end(), initC); + std::fill(exe_policy, BEGIN(a), END(a), initA); + std::fill(exe_policy, BEGIN(b), END(b), initB); + std::fill(exe_policy, BEGIN(c), END(c), initC); } template void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDIndicesStream::copy() { // c[i] = a[i] - std::copy(exe_policy, a.begin(), a.end(), c.begin()); + std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -58,7 +75,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c.begin(), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) { return a[i] + b[i]; }); } @@ -67,7 +84,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -79,7 +96,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a.begin(), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } @@ -89,7 +106,7 @@ template T STDIndicesStream::dot() { // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a.begin(), a.end(), b.begin(), 0.0); + return std::transform_reduce(exe_policy, BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -109,3 +126,5 @@ std::string getDeviceDriver(const int) template class STDIndicesStream; template class STDIndicesStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 26c7cb0d..6810888a 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -10,6 +10,11 @@ #include #include "Stream.h" +#ifdef USE_SPAN +#include +#endif + + #define IMPLEMENTATION_STRING "STD (index-oriented)" @@ -60,9 +65,11 @@ class STDIndicesStream : public Stream ranged range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index ef69f304..6f87bc94 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + register_flag_optional(NVHPC_OFFLOAD "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) @@ -28,6 +32,8 @@ macro(setup) register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index de615289..fc71fee4 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -10,20 +10,40 @@ #include #include +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif + +// There are three execution policies: +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +constexpr auto exe_policy = std::execution::par_unseq; + template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} -{ - a = std::vector(array_size); - b = std::vector(array_size); - c = std::vector(array_size); -} + : array_size{ARRAY_SIZE}, +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif +{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } template void STDRangesStream::init_arrays(T initA, T initB, T initC) { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, // loop range [&] (int i) { a[i] = initA; @@ -37,16 +57,16 @@ template void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template void STDRangesStream::copy() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i]; @@ -60,7 +80,7 @@ void STDRangesStream::mul() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { b[i] = scalar * c[i]; @@ -72,7 +92,7 @@ template void STDRangesStream::add() { std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { c[i] = a[i] + b[i]; @@ -86,7 +106,7 @@ void STDRangesStream::triad() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] = b[i] + scalar * c[i]; @@ -100,7 +120,7 @@ void STDRangesStream::nstream() const T scalar = startScalar; std::for_each_n( - std::execution::par_unseq, + exe_policy, std::views::iota(0).begin(), array_size, [&] (int i) { a[i] += b[i] + scalar * c[i]; @@ -114,8 +134,8 @@ T STDRangesStream::dot() // sum += a[i] * b[i]; return std::transform_reduce( - std::execution::par_unseq, - a.begin(), a.end(), b.begin(), 0.0); + exe_policy, + BEGIN(a), END(a), BEGIN(b), 0.0); } void listDevices(void) @@ -136,3 +156,5 @@ std::string getDeviceDriver(const int) template class STDRangesStream; template class STDRangesStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 890e893f..33bc77bd 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -21,9 +21,11 @@ class STDRangesStream : public Stream int array_size; // Device side pointers - std::vector a; - std::vector b; - std::vector c; +#ifdef USE_VECTOR + std::vector a, b, c; +#else + T *a, *b, *c; +#endif public: STDRangesStream(const int, int); diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index fd07387d..ac56962b 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -3,6 +3,10 @@ register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" "c++") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -13,4 +17,7 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 9c34a506..bd94443b 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -5,15 +5,37 @@ // source code #include "TBBStream.hpp" +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_VECTOR +#define BEGIN(x) (x).begin() +#define END(x) (x).end() +#else +#define BEGIN(x) (x) +#define END(x) ((x) + array_size) +#endif template TBBStream::TBBStream(const int ARRAY_SIZE, int device) - : partitioner(), range(0, ARRAY_SIZE), a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + : partitioner(), range(0, ARRAY_SIZE), +#ifdef USE_VECTOR + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) +#else + array_size(ARRAY_SIZE), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) +#endif { if(device != 0){ throw std::runtime_error("Device != 0 is not supported by TBB"); } std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; } @@ -35,9 +57,9 @@ template void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { // Element-wise copy. - h_a = a; - h_b = b; - h_c = c; + std::copy(BEGIN(a), END(a), h_a.begin()); + std::copy(BEGIN(b), END(b), h_b.begin()); + std::copy(BEGIN(c), END(c), h_c.begin()); } template @@ -132,3 +154,5 @@ std::string getDeviceDriver(const int) template class TBBStream; template class TBBStream; +#undef BEGIN +#undef END \ No newline at end of file diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 90763a9c..2744afc2 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -40,10 +40,15 @@ class TBBStream : public Stream tbb_partitioner partitioner; tbb::blocked_range range; // Device side pointers - std::vector a; - std::vector b; - std::vector c; - +#ifdef USE_VECTOR + std::vector a, b, c; +#else + size_t array_size; + T *a, *b, *c; +#endif + + + public: TBBStream(const int, int); ~TBBStream() = default; diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index e4d6bac3..c1ff9aac 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -1,7 +1,7 @@ register_flag_optional(ONE_TBB_DIR "Absolute path to oneTBB (with header `onetbb/tbb.h`) distribution, the directory should contain at least `include/` and `lib/. - If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." + If unspecified, the system TBB (with header `tbb/tbb.h`) will be used via CMake's find_package(TBB)." "") @@ -15,15 +15,22 @@ register_flag_optional(PARTITIONER See https://spec.oneapi.com/versions/latest/elements/oneTBB/source/algorithms.html#partitioners for more details." "AUTO") +register_flag_optional(USE_VECTOR + "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." + "OFF") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct endif() - + # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages find_package(TBB REQUIRED) register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) + if(USE_VECTOR) + register_definitions(USE_VECTOR) + endif() endmacro() From a299d613bb0f848b7b931b27bfbc128140349a50 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:27:34 +0100 Subject: [PATCH 02/64] Add CI tests with and without vectors Remove duplicate CI tests from bad merge Fix extra array_size init for std-data --- src/ci-test-compile.sh | 21 +++++++++++++-------- src/std-data/STDDataStream.cpp | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index a7c5bab5..e443ccab 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -140,8 +140,15 @@ build_gcc() { run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" + # std again but with vectors + run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + + run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + run_build $name "${GCC_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors if [ "${GCC_OMP_OFFLOAD_AMD:-false}" != "false" ]; then run_build "amd_$name" "${GCC_CXX:?}" acc "$cxx -DCXX_EXTRA_FLAGS=-foffload=amdgcn-amdhsa" @@ -207,14 +214,6 @@ build_clang() { run_build "nvidia_$name" "${GCC_CXX:?}" omp "$cxx -DOFFLOAD=NVIDIA:$NV_ARCH" fi - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" - run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" - run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" - run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH" run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=MANAGED" @@ -225,8 +224,14 @@ build_clang() { run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported + # std again but with vectors + run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" + # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" # not yet supported + run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB + run_build $name "${CLANG_CXX:?}" tbb "$cxx -DUSE_VECTOR=ON" # build with vectors run_build $name "${CLANG_CXX:?}" raja "$cxx -DRAJA_IN_TREE=${RAJA_SRC:?}" # no clang /w RAJA+cuda because it needs nvcc which needs gcc diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 2dead3b4..9eb12914 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -34,7 +34,6 @@ STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) #ifdef USE_VECTOR a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else -array_size(ARRAY_SIZE), a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) From 37dcdc224ce595d89661aae3e225a439559a2e8f Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 21:43:09 +0100 Subject: [PATCH 03/64] nvc++: "last line of file ends without a newline" Add CI vector tests for NVHPC --- src/ci-test-compile.sh | 5 +++++ src/std-data/STDDataStream.cpp | 2 +- src/std-indices/STDIndicesStream.cpp | 2 +- src/std-ranges/STDRangesStream.cpp | 2 +- src/tbb/TBBStream.cpp | 2 +- 5 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index e443ccab..c5ba953d 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -242,6 +242,11 @@ build_nvhpc() { local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + + # std again but with vectors + run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" + run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DUSE_VECTOR=ON" + run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" } diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 9eb12914..34059f5c 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -121,4 +121,4 @@ template class STDDataStream; template class STDDataStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 8c0958c3..d3537774 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -127,4 +127,4 @@ template class STDIndicesStream; template class STDIndicesStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index fc71fee4..356e6dc5 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -157,4 +157,4 @@ template class STDRangesStream; template class STDRangesStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index bd94443b..c5e9d905 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -155,4 +155,4 @@ template class TBBStream; template class TBBStream; #undef BEGIN -#undef END \ No newline at end of file +#undef END From 193eaa7fe2b690035a85b365a6ff263659a43b86 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sun, 24 Jul 2022 23:30:24 +0100 Subject: [PATCH 04/64] Fix index iterator on large problem sizes --- src/std-indices/STDIndicesStream.h | 66 ++++++++++++++++++------------ 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 6810888a..3fd88f35 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -23,35 +23,47 @@ // implementation doesn't target template class ranged { - N from, to; public: - ranged(N from, N to ): from(from), to(to) {} - class iterator { - N num; + class iterator { + friend class ranged; public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = const N&; - using iterator_category = std::random_access_iterator_tag; - explicit iterator(N _num = 0) : num(_num) {} - - iterator& operator++() { num++; return *this; } - iterator operator++(int) { iterator retval = *this; ++(*this); return retval; } - iterator operator+(const value_type v) const { return iterator(num + v); } - iterator operator+=(int x) { iterator retval = *this; this->num+=x; return retval; } - - bool operator==(iterator other) const { return num == other.num; } - bool operator!=(iterator other) const { return *this != other; } - bool operator<(iterator other) const { return num < other.num; } - - reference operator*() const { return num;} - difference_type operator-(const iterator &it) const { return num - it.num; } - value_type operator[](const difference_type &i) const { return num + i; } - - }; - iterator begin() { return iterator(from); } - iterator end() { return iterator(to >= from? to+1 : to-1); } + using difference_type = N; + using value_type = N; + using pointer = const N*; + using reference = const N&; + using iterator_category = std::random_access_iterator_tag; + + reference operator *() const { return i_; } + const iterator &operator ++() { ++i_; return *this; } + iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } + + const iterator &operator --() { --i_; return *this; } + iterator operator --(int) { iterator copy(*this); --i_; return copy; } + + const iterator &operator +=(N by) { i_+=by; return *this; } + + value_type operator[](const difference_type &i) const { return i_ + i; } + + difference_type operator-(const iterator &it) const { return i_ - it.i_; } + iterator operator+(const value_type v) const { return iterator(i_ + v); } + + bool operator ==(const iterator &other) const { return i_ == other.i_; } + bool operator !=(const iterator &other) const { return i_ != other.i_; } + bool operator < (const iterator &other) const { return i_ < other.i_; } + + protected: + explicit iterator(N start) : i_ (start) {} + + private: + N i_; + }; + + [[nodiscard]] iterator begin() const { return begin_; } + [[nodiscard]] iterator end() const { return end_; } + ranged(N begin, N end) : begin_(begin), end_(end) {} +private: + iterator begin_; + iterator end_; }; template From f5513cd69ec6a3c1c0802963135f081df49f3d27 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Tue, 26 Jul 2022 23:51:21 +0100 Subject: [PATCH 05/64] Add in-tree oneTBB build --- CMakeLists.txt | 19 +++++++++++++++++++ src/std-data/model.cmake | 8 +++++++- src/std-indices/model.cmake | 9 ++++++++- src/std-ranges/model.cmake | 7 +++++++ src/tbb/model.cmake | 7 ++++++- 5 files changed, 47 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6769952d..14bd39ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,25 @@ hint_flag(CXX_EXTRA_LINKER_FLAGS " # Honor user's CXX_EXTRA_LINK_FLAGS set(CXX_EXTRA_LINK_FLAGS ${CXX_EXTRA_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) +option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on models that + don't explicitly link against TBB is a no-op, see description of your selected + model on how this is used." OFF) + +if (USE_TBB) + include(FetchContent) + FetchContent_Declare( + TBB + GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git + GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8 + ) + # Not using FetchContent_MakeAvailable because we need EXCLUDE_FROM_ALL + FetchContent_GetProperties(TBB) + if (NOT TBB_POPULATED) + FetchContent_Populate(TBB) + add_subdirectory(${tbb_SOURCE_DIR} ${tbb_BINARY_DIR} EXCLUDE_FROM_ALL) + endif () +endif () + # include our macros include(cmake/register_models.cmake) diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 6f87bc94..3f79f13a 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -23,6 +23,10 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + macro(setup) set(CMAKE_CXX_STANDARD 17) @@ -35,5 +39,7 @@ macro(setup) if(USE_VECTOR) register_definitions(USE_VECTOR) endif() - + if (USE_TBB) + register_link_library(TBB::tbb) + endif () endmacro() diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 6f87bc94..7dc22b9d 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -23,6 +23,11 @@ register_flag_optional(NVHPC_OFFLOAD ccall - Compile for all supported compute capabilities" "") +register_flag_optional(USE_TBB + "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + + macro(setup) set(CMAKE_CXX_STANDARD 17) @@ -35,5 +40,7 @@ macro(setup) if(USE_VECTOR) register_definitions(USE_VECTOR) endif() - + if (USE_TBB) + register_link_library(TBB::tbb) + endif () endmacro() diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index ac56962b..65e54894 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -7,6 +7,10 @@ register_flag_optional(USE_VECTOR "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." "OFF") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -20,4 +24,7 @@ macro(setup) if(USE_VECTOR) register_definitions(USE_VECTOR) endif() + if (USE_TBB) + register_link_library(TBB::tbb) + endif () endmacro() diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index c1ff9aac..eeb16377 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -19,15 +19,20 @@ register_flag_optional(USE_VECTOR "Whether to use std::vector for storage or use aligned_alloc. C++ vectors are *zero* initialised where as aligned_alloc is uninitialised before first use." "OFF") +register_flag_optional(USE_TBB + "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." + "OFF") + macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct + find_package(TBB REQUIRED) endif() + # No need to handle USE_TBB as both ONE_TBB_DIR and USE_TBB will create the TBB::tbb target # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages - find_package(TBB REQUIRED) register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) if(USE_VECTOR) From 5197a4e5618292ea812f10d7cbd22a4ef75b36a0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 27 Jul 2022 00:16:29 +0100 Subject: [PATCH 06/64] Find TBB if USE_TBB is not set --- src/tbb/model.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/tbb/model.cmake b/src/tbb/model.cmake index eeb16377..1cbd7fba 100644 --- a/src/tbb/model.cmake +++ b/src/tbb/model.cmake @@ -27,11 +27,12 @@ macro(setup) if(ONE_TBB_DIR) set(TBB_ROOT "${ONE_TBB_DIR}") # see https://github.com/Kitware/VTK/blob/0a31a9a3c1531ae238ac96a372fec4be42282863/CMake/FindTBB.cmake#L34 # docs on Intel's website refers to TBB_DIR which is not correct + endif() + if (NOT USE_TBB) + # Only find TBB when we're not building in-tree find_package(TBB REQUIRED) endif() - # No need to handle USE_TBB as both ONE_TBB_DIR and USE_TBB will create the TBB::tbb target - # see https://github.com/oneapi-src/oneTBB/blob/master/cmake/README.md#tbbconfig---integration-of-binary-packages register_link_library(TBB::tbb) register_definitions(PARTITIONER_${PARTITIONER}) From dfb4eb06b24245727c37e1356a3f0e73f2b83db0 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 15:03:26 +0100 Subject: [PATCH 07/64] Add oneDPL for std models --- CMakeLists.txt | 26 +++++++++- cmake/register_models.cmake | 4 ++ cmake/shim_onedpl.cmake | 27 ++++++++++ src/dpl_shim.h | 75 ++++++++++++++++++++++++++++ src/std-data/STDDataStream.cpp | 46 ++++++++++------- src/std-data/STDDataStream.h | 5 +- src/std-data/model.cmake | 18 ++++++- src/std-indices/STDIndicesStream.cpp | 39 +++++++++++---- src/std-indices/STDIndicesStream.h | 11 ++-- src/std-indices/model.cmake | 17 +++++-- src/std-ranges/STDRangesStream.cpp | 39 +++++++++++---- src/std-ranges/STDRangesStream.hpp | 10 ++-- src/std-ranges/model.cmake | 17 ++++++- 13 files changed, 269 insertions(+), 65 deletions(-) create mode 100644 cmake/shim_onedpl.cmake create mode 100644 src/dpl_shim.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 14bd39ec..263555a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,13 +76,15 @@ option(USE_TBB "Enable oneTBB library for *supported* models. Enabling this on m model on how this is used." OFF) if (USE_TBB) - include(FetchContent) FetchContent_Declare( TBB GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git GIT_TAG faaf43c4ab22cb4b4267d65d5e218fa58800eea8 ) - # Not using FetchContent_MakeAvailable because we need EXCLUDE_FROM_ALL + # Don't fail builds on waring (TBB has -Wall while not being free of warnings from unused symbols...) + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + set(TBB_STRICT OFF) + # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL FetchContent_GetProperties(TBB) if (NOT TBB_POPULATED) FetchContent_Populate(TBB) @@ -90,6 +92,25 @@ if (USE_TBB) endif () endif () +option(USE_TBB "Enable oneDPL library for *supported* models. Enabling this on models that + don't explicitly link against DPL is a no-op, see description of your selected + model on how this is used." OFF) + +if (USE_ONEDPL) + FetchContent_Declare( + oneDPL + GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git + GIT_TAG oneDPL-2021.7.0-release + ) + # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL + FetchContent_GetProperties(oneDPL) + if (NOT oneDPL_POPULATED) + FetchContent_Populate(oneDPL) + add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) + endif () +endif() + + # include our macros include(cmake/register_models.cmake) @@ -170,6 +191,7 @@ include_directories(src) add_executable(${EXE_NAME} ${IMPL_SOURCES} src/main.cpp) target_link_libraries(${EXE_NAME} PUBLIC ${LINK_LIBRARIES}) target_compile_definitions(${EXE_NAME} PUBLIC ${IMPL_DEFINITIONS}) +target_include_directories(${EXE_NAME} PUBLIC ${IMPL_DIRECTORIES}) if (CXX_EXTRA_LIBRARIES) target_link_libraries(${EXE_NAME} PUBLIC ${CXX_EXTRA_LIBRARIES}) diff --git a/cmake/register_models.cmake b/cmake/register_models.cmake index f180c03b..9432313e 100644 --- a/cmake/register_models.cmake +++ b/cmake/register_models.cmake @@ -71,6 +71,10 @@ macro(register_definitions) list(APPEND IMPL_DEFINITIONS ${ARGN}) endmacro() +macro(register_directories) + list(APPEND IMPL_DIRECTORIES ${ARGN}) +endmacro() + macro(register_flag_required NAME DESCRIPTION) list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "") endmacro() diff --git a/cmake/shim_onedpl.cmake b/cmake/shim_onedpl.cmake new file mode 100644 index 00000000..861d0697 --- /dev/null +++ b/cmake/shim_onedpl.cmake @@ -0,0 +1,27 @@ + + +if (USE_ONEDPL) + # # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html + # # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation + # register_definitions( + # PSTL_USE_PARALLEL_POLICIES=0 + # _GLIBCXX_USE_TBB_PAR_BACKEND=0 + # ) + register_definitions(USE_ONEDPL) + if (USE_ONEDPL STREQUAL "TBB") + register_definitions(ONEDPL_USE_TBB_BACKEND=1) + # TBB will either be linked later (USE_TBB==ON) or via extra libraries, don't do anything here + elseif (USE_ONEDPL STREQUAL "OPENMP") + register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) + # Link OpenMP via CMAKE + find_package(OpenMP REQUIRED) + register_link_library(OpenMP::OpenMP_CXX) + elseif (USE_ONEDPL STREQUAL "SYCL") + register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) + # This needs a SYCL compiler, will fail if CXX doesn't SYCL2020 + register_append_cxx_flags(ANY -fsycl-unnamed-lambda -fsycl) + else () + message(FATAL_ERROR "Unsupported USE_ONEDPL backend: ${USE_ONEDPL}, see USE_ONEDPL flag description for available values.") + endif () + register_directories(ANY ${onedpl_SOURCE_DIR}/include) +endif () \ No newline at end of file diff --git a/src/dpl_shim.h b/src/dpl_shim.h new file mode 100644 index 00000000..89012a35 --- /dev/null +++ b/src/dpl_shim.h @@ -0,0 +1,75 @@ +#pragma once + +#include +#include + +#ifndef ALIGNMENT +#define ALIGNMENT (2*1024*1024) // 2MB +#endif + +#ifdef USE_ONEDPL + +// oneDPL C++17 PSTL + +#include +#include +#include + +#ifdef ONEDPL_USE_DPCPP_BACKEND + +#include + +const static auto exe_policy = oneapi::dpl::execution::device_policy<>{ + oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{}) +}; + +template using Allocator = sycl::usm_allocator; + +template +constexpr Allocator alloc_vec() { return {exe_policy.queue()}; }; + +template +T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue()); } + +template +void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } + +#else + +// auto exe_policy = dpl::execution::seq; +// auto exe_policy = dpl::execution::par; +static constexpr auto exe_policy = dpl::execution::par_unseq; +#define USE_STD_PTR_ALLOC_DEALLOC + +#endif + +#else + +// Normal C++17 PSTL + +#include +#include +#include + +// auto exe_policy = std::execution::seq; +// auto exe_policy = std::execution::par; +static constexpr auto exe_policy = std::execution::par_unseq; +#define USE_STD_PTR_ALLOC_DEALLOC + + +#endif + +#ifdef USE_STD_PTR_ALLOC_DEALLOC + +template using Allocator = std::allocator; + +template +constexpr Allocator alloc_vec() { return {}; }; + +template +T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } + +template +void dealloc_raw(T *ptr) { free(ptr); } + +#endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 34059f5c..2bb6a33a 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -6,14 +6,6 @@ #include "STDDataStream.h" -#include -#include -#include - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - #ifdef USE_VECTOR #define BEGIN(x) (x).begin() #define END(x) (x).end() @@ -22,23 +14,39 @@ #define END(x) ((x) + array_size) #endif -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -constexpr auto exe_policy = std::execution::par_unseq; - - template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) #else - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif -{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB " TBB_VERSION_STRING; +#elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif +} + +template +STDDataStream::~STDDataStream() { +#ifndef USE_VECTOR + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +#endif +} template void STDDataStream::init_arrays(T initA, T initB, T initC) diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index 84b4dcfb..e50c95d8 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -5,6 +5,7 @@ // source code #pragma once +#include "dpl_shim.h" #include #include @@ -22,7 +23,7 @@ class STDDataStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector a, b, c; + std::vector> a, b, c; #else T *a, *b, *c; #endif @@ -30,7 +31,7 @@ class STDDataStream : public Stream public: STDDataStream(const int, int) noexcept; - ~STDDataStream() = default; + ~STDDataStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 3f79f13a..f2fecba8 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -27,18 +27,32 @@ register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + SYCL - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + macro(setup) set(CMAKE_CXX_STANDARD 17) + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) + if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if(USE_VECTOR) + if (USE_VECTOR) register_definitions(USE_VECTOR) - endif() + endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index d3537774..4ec9977d 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -22,22 +22,39 @@ #define END(x) ((x) + array_size) #endif -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -constexpr auto exe_policy = std::execution::par_unseq; - template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, range(0, array_size), +noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) #else - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +#endif +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB " TBB_VERSION_STRING; +#elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; #endif -{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } +} + +template +STDIndicesStream::~STDIndicesStream() { +#ifndef USE_VECTOR + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +#endif +} template void STDIndicesStream::init_arrays(T initA, T initB, T initC) diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 3fd88f35..63254cdf 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -5,19 +5,14 @@ // source code #pragma once +#include "dpl_shim.h" #include #include #include "Stream.h" -#ifdef USE_SPAN -#include -#endif - - #define IMPLEMENTATION_STRING "STD (index-oriented)" - // A lightweight counting iterator which will be used by the STL algorithms // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this // implementation doesn't target @@ -78,7 +73,7 @@ class STDIndicesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector a, b, c; + std::vector> a, b, c; #else T *a, *b, *c; #endif @@ -86,7 +81,7 @@ class STDIndicesStream : public Stream public: STDIndicesStream(const int, int) noexcept; - ~STDIndicesStream() = default; + ~STDIndicesStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 7dc22b9d..36e2ed82 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -27,19 +27,30 @@ register_flag_optional(USE_TBB "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + SYCL - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") macro(setup) set(CMAKE_CXX_STANDARD 17) - + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well register_append_cxx_flags(ANY ${NVHPC_FLAGS}) register_append_link_flags(${NVHPC_FLAGS}) endif () - if(USE_VECTOR) + if (USE_VECTOR) register_definitions(USE_VECTOR) - endif() + endif () if (USE_TBB) register_link_library(TBB::tbb) endif () diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 356e6dc5..29993bc6 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -22,22 +22,39 @@ #define END(x) ((x) + array_size) #endif -// There are three execution policies: -// auto exe_policy = std::execution::seq; -// auto exe_policy = std::execution::par; -constexpr auto exe_policy = std::execution::par_unseq; - template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE}, +noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) #else - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +#endif +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#if USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if defined(ONEDPL_USE_DPCPP_BACKEND) + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif defined(ONEDPL_USE_TBB_BACKEND) + std::cout << "TBB " TBB_VERSION_STRING; +#elif defined(ONEDPL_USE_OPENMP_BACKEND) + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; #endif -{ std::cout <<"Backing storage typeid: " << typeid(a).name() << std::endl; } +} + +template +STDRangesStream::~STDRangesStream() { +#ifndef USE_VECTOR + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +#endif +} template void STDRangesStream::init_arrays(T initA, T initB, T initC) diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 33bc77bd..21902c6c 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -5,10 +5,10 @@ // source code #pragma once +#include "dpl_shim.h" #include -#include - +#include #include "Stream.h" #define IMPLEMENTATION_STRING "STD C++ ranges" @@ -22,14 +22,14 @@ class STDRangesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector a, b, c; + std::vector> a, b, c; #else T *a, *b, *c; #endif public: - STDRangesStream(const int, int); - ~STDRangesStream() = default; + STDRangesStream(const int, int) noexcept; + ~STDRangesStream(); virtual void copy() override; virtual void add() override; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 65e54894..2d90afc4 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -11,6 +11,18 @@ register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." "OFF") +register_flag_optional(USE_ONEDPL + "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. + + Possible values are: + OPENMP - Implements policies using OpenMP. + CMake will handle any flags needed to enable OpenMP if the compiler supports it. + TBB - Implements policies using TBB. + TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. + SYCL - Implements policies through SYCL2020. + This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." + "OFF") + macro(setup) # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here @@ -21,9 +33,10 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) - if(USE_VECTOR) + include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) + if (USE_VECTOR) register_definitions(USE_VECTOR) - endif() + endif () if (USE_TBB) register_link_library(TBB::tbb) endif () From f77e43c6d533aea926aec8b39aea072d319a8611 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:23:07 +0100 Subject: [PATCH 08/64] Don't capture `this` implicitly Relax const constraints on the range iterator --- src/std-indices/STDIndicesStream.cpp | 8 ++++---- src/std-indices/STDIndicesStream.h | 13 +++++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 4ec9977d..7cacde3f 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -83,7 +83,7 @@ template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [this, scalar = startScalar](int i) { return scalar * c[i]; }); } @@ -92,7 +92,7 @@ template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [&](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [this](int i) { return a[i] + b[i]; }); } @@ -101,7 +101,7 @@ template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); } @@ -113,7 +113,7 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [&, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); } diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 63254cdf..a955374f 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -25,17 +25,22 @@ class ranged { using difference_type = N; using value_type = N; using pointer = const N*; - using reference = const N&; + using reference = N; using iterator_category = std::random_access_iterator_tag; + // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. + // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. + // This type is unused for any nother STL impl. + using is_passed_directly = std::true_type; + reference operator *() const { return i_; } - const iterator &operator ++() { ++i_; return *this; } + iterator &operator ++() { ++i_; return *this; } iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - const iterator &operator --() { --i_; return *this; } + iterator &operator --() { --i_; return *this; } iterator operator --(int) { iterator copy(*this); --i_; return copy; } - const iterator &operator +=(N by) { i_+=by; return *this; } + iterator &operator +=(N by) { i_+=by; return *this; } value_type operator[](const difference_type &i) const { return i_ + i; } From 0e8b3b4bcec68e2a6ffce9bac65308d7337d3342 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:36:39 +0100 Subject: [PATCH 09/64] Add CI for dpl --- src/ci-test-compile.sh | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index c5ba953d..cccbd2d7 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -135,16 +135,14 @@ build_gcc() { "./$BUILD_DIR/omp_$name/omp-stream" -s 1048576 -n 10 fi - # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" - - # std again but with vectors - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - + for use_onedpl in OFF OPENMP TBB; do + for use_vector in OFF ON; do + # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here + run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + done + done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${GCC_CXX:?}" tbb "$cxx" # build TBB again with the system TBB @@ -220,14 +218,14 @@ build_clang() { run_build $name "${CLANG_CXX:?}" cuda "$cxx -DCMAKE_CUDA_COMPILER=${NVHPC_NVCC:?} -DCUDA_ARCH=$NV_ARCH -DMEM=PAGEFAULT" run_build $name "${CLANG_CXX:?}" kokkos "$cxx -DKOKKOS_IN_TREE=${KOKKOS_SRC:?} -DKokkos_ENABLE_OPENMP=ON" run_build $name "${CLANG_CXX:?}" ocl "$cxx -DOpenCL_LIBRARY=${OCL_LIB:?}" - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" # not yet supported - # std again but with vectors - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" - # run_build $name "${LANG_CXX:?}" std20 "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_VECTOR=ON" # not yet supported + for use_onedpl in OFF OPENMP TBB; do + for use_vector in OFF ON; do + run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported + done + done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" run_build $name "${CLANG_CXX:?}" tbb "$cxx" # build TBB again with the system TBB From 379bc2032c34edb91345082b2cd1a5554f5880b6 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:49:16 +0100 Subject: [PATCH 10/64] Add CI for dpl (again) --- src/ci-test-compile.sh | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index cccbd2d7..041c9916 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -136,11 +136,15 @@ build_gcc() { fi for use_onedpl in OFF OPENMP TBB; do + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" done done @@ -221,9 +225,13 @@ build_clang() { for use_onedpl in OFF OPENMP TBB; do for use_vector in OFF ON; do - run_build $name "${CLANG_CXX:?}" std-data "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx -DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-} -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported + case "$use_onedpl" in + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + esac + run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " + run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" + # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" # not yet supported done done From 14844ceb5615d33ddff573197f48fb996fab4bb8 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 16:53:57 +0100 Subject: [PATCH 11/64] Fix CMakeLists.txt typo on USE_ONEDPL --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 263555a0..f2af1e31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,9 +92,9 @@ if (USE_TBB) endif () endif () -option(USE_TBB "Enable oneDPL library for *supported* models. Enabling this on models that - don't explicitly link against DPL is a no-op, see description of your selected - model on how this is used." OFF) +option(USE_ONEDPL "Enable oneDPL library for *supported* models. Enabling this on models that + don't explicitly link against DPL is a no-op, see description of your selected + model on how this is used." OFF) if (USE_ONEDPL) FetchContent_Declare( From 5a496a91b293f7cd79059afe5452d1b7856753ea Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 19:29:46 +0100 Subject: [PATCH 12/64] Fixup oneDPL inclusion --- CMakeLists.txt | 14 ++++++++++++++ cmake/register_models.cmake | 4 ---- cmake/shim_onedpl.cmake | 27 --------------------------- src/dpl_shim.h | 2 +- src/std-data/STDDataStream.cpp | 8 ++++---- src/std-data/model.cmake | 7 ++++--- src/std-indices/STDIndicesStream.cpp | 8 ++++---- src/std-indices/model.cmake | 5 ++++- src/std-ranges/STDRangesStream.cpp | 8 ++++---- src/std-ranges/model.cmake | 5 ++++- 10 files changed, 39 insertions(+), 49 deletions(-) delete mode 100644 cmake/shim_onedpl.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index f2af1e31..eb9e57b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,10 +102,24 @@ if (USE_ONEDPL) GIT_REPOSITORY https://github.com/oneapi-src/oneDPL.git GIT_TAG oneDPL-2021.7.0-release ) + string(TOLOWER ${USE_ONEDPL} ONEDPL_BACKEND) + # XXX oneDPL looks for omp instead of openmp, which mismatches(!) with ONEDPL_PAR_BACKEND if using find_package + if (ONEDPL_BACKEND STREQUAL "openmp") + set(ONEDPL_BACKEND omp) + endif () # Not using FetchContent_MakeAvailable (CMake>= 3.14) because we need EXCLUDE_FROM_ALL FetchContent_GetProperties(oneDPL) if (NOT oneDPL_POPULATED) FetchContent_Populate(oneDPL) + if (USE_TBB) + macro(find_package NAME) + if ("${NAME}" STREQUAL "TBB") + message(STATUS "Discarding oneDPL's call to find_package(${NAME} ${ARGN})") + else () + _find_package(${NAME} ${ARGN}) + endif () + endmacro() + endif () add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) endif () endif() diff --git a/cmake/register_models.cmake b/cmake/register_models.cmake index 9432313e..f180c03b 100644 --- a/cmake/register_models.cmake +++ b/cmake/register_models.cmake @@ -71,10 +71,6 @@ macro(register_definitions) list(APPEND IMPL_DEFINITIONS ${ARGN}) endmacro() -macro(register_directories) - list(APPEND IMPL_DIRECTORIES ${ARGN}) -endmacro() - macro(register_flag_required NAME DESCRIPTION) list(APPEND CUSTOM_FLAGS_TRIPLE "${NAME}" "${DESCRIPTION}" ON "") endmacro() diff --git a/cmake/shim_onedpl.cmake b/cmake/shim_onedpl.cmake deleted file mode 100644 index 861d0697..00000000 --- a/cmake/shim_onedpl.cmake +++ /dev/null @@ -1,27 +0,0 @@ - - -if (USE_ONEDPL) - # # XXX see https://www.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-library-guide/top/oneapi-dpc-library-onedpl-overview.html - # # this is to avoid the system TBB headers (if exists) from having precedence which isn't compatible with oneDPL's par implementation - # register_definitions( - # PSTL_USE_PARALLEL_POLICIES=0 - # _GLIBCXX_USE_TBB_PAR_BACKEND=0 - # ) - register_definitions(USE_ONEDPL) - if (USE_ONEDPL STREQUAL "TBB") - register_definitions(ONEDPL_USE_TBB_BACKEND=1) - # TBB will either be linked later (USE_TBB==ON) or via extra libraries, don't do anything here - elseif (USE_ONEDPL STREQUAL "OPENMP") - register_definitions(ONEDPL_USE_OPENMP_BACKEND=1) - # Link OpenMP via CMAKE - find_package(OpenMP REQUIRED) - register_link_library(OpenMP::OpenMP_CXX) - elseif (USE_ONEDPL STREQUAL "SYCL") - register_definitions(ONEDPL_USE_DPCPP_BACKEND=1) - # This needs a SYCL compiler, will fail if CXX doesn't SYCL2020 - register_append_cxx_flags(ANY -fsycl-unnamed-lambda -fsycl) - else () - message(FATAL_ERROR "Unsupported USE_ONEDPL backend: ${USE_ONEDPL}, see USE_ONEDPL flag description for available values.") - endif () - register_directories(ANY ${onedpl_SOURCE_DIR}/include) -endif () \ No newline at end of file diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 89012a35..e47ae99b 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -15,7 +15,7 @@ #include #include -#ifdef ONEDPL_USE_DPCPP_BACKEND +#if ONEDPL_USE_DPCPP_BACKEND #include diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 2bb6a33a..b6641dee 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -24,13 +24,13 @@ STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) #endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#if USE_ONEDPL +#ifdef USE_ONEDPL std::cout << "Using oneDPL backend: "; -#if defined(ONEDPL_USE_DPCPP_BACKEND) +#if ONEDPL_USE_DPCPP_BACKEND std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif defined(ONEDPL_USE_TBB_BACKEND) +#elif ONEDPL_USE_TBB_BACKEND std::cout << "TBB " TBB_VERSION_STRING; -#elif defined(ONEDPL_USE_OPENMP_BACKEND) +#elif ONEDPL_USE_OPENMP_BACKEND std::cout << "OpenMP"; #else std::cout << "Default"; diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index f2fecba8..3d2399d6 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -41,9 +41,6 @@ register_flag_optional(USE_ONEDPL macro(setup) set(CMAKE_CXX_STANDARD 17) - - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) - if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -56,4 +53,8 @@ macro(setup) if (USE_TBB) register_link_library(TBB::tbb) endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 7cacde3f..9d98a1b0 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -32,13 +32,13 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#if USE_ONEDPL +#ifdef USE_ONEDPL std::cout << "Using oneDPL backend: "; -#if defined(ONEDPL_USE_DPCPP_BACKEND) +#if ONEDPL_USE_DPCPP_BACKEND std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif defined(ONEDPL_USE_TBB_BACKEND) +#elif ONEDPL_USE_TBB_BACKEND std::cout << "TBB " TBB_VERSION_STRING; -#elif defined(ONEDPL_USE_OPENMP_BACKEND) +#elif ONEDPL_USE_OPENMP_BACKEND std::cout << "OpenMP"; #else std::cout << "Default"; diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index 36e2ed82..befa9335 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -41,7 +41,6 @@ register_flag_optional(USE_ONEDPL macro(setup) set(CMAKE_CXX_STANDARD 17) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well @@ -54,4 +53,8 @@ macro(setup) if (USE_TBB) register_link_library(TBB::tbb) endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 29993bc6..3ea32e41 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -32,13 +32,13 @@ noexcept : array_size{ARRAY_SIZE}, #endif { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#if USE_ONEDPL +#ifdef USE_ONEDPL std::cout << "Using oneDPL backend: "; -#if defined(ONEDPL_USE_DPCPP_BACKEND) +#if ONEDPL_USE_DPCPP_BACKEND std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif defined(ONEDPL_USE_TBB_BACKEND) +#elif ONEDPL_USE_TBB_BACKEND std::cout << "TBB " TBB_VERSION_STRING; -#elif defined(ONEDPL_USE_OPENMP_BACKEND) +#elif ONEDPL_USE_OPENMP_BACKEND std::cout << "OpenMP"; #else std::cout << "Default"; diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 2d90afc4..268cc149 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -33,11 +33,14 @@ macro(setup) unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default # and append our own: register_append_cxx_flags(ANY -std=c++2a) - include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/shim_onedpl.cmake) if (USE_VECTOR) register_definitions(USE_VECTOR) endif () if (USE_TBB) register_link_library(TBB::tbb) endif () + if (USE_ONEDPL) + register_definitions(USE_ONEDPL) + register_link_library(oneDPL) + endif () endmacro() From ecb0464f6c7b35f94c8ea2abc1bcd50b036ecb97 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 19:48:42 +0100 Subject: [PATCH 13/64] Fixup oneDPL and oneTBB in CI (gcc-10) --- src/ci-test-compile.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 041c9916..2a101910 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -137,8 +137,8 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in - OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here @@ -227,7 +227,7 @@ build_clang() { for use_vector in OFF ON; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; esac run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" From 1f4bc3fffc64b7232d47a2772792f226ddb10aaf Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 20:04:18 +0100 Subject: [PATCH 14/64] Fixup oneDPL and oneTBB in CI (gcc-10) take 2 --- src/ci-test-compile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 2a101910..a2952bc5 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -138,7 +138,7 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here @@ -227,7 +227,7 @@ build_clang() { for use_vector in OFF ON; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; esac run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" From d56dc956e09925c386c1cabb97931791e4688d3c Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 20:30:27 +0100 Subject: [PATCH 15/64] Fixup oneDPL and oneTBB in CI (gcc-10) take 3 --- src/ci-test-compile.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index a2952bc5..d3fc5b71 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -138,7 +138,7 @@ build_gcc() { for use_onedpl in OFF OPENMP TBB; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${GCC_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac for use_vector in OFF ON; do # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here @@ -227,7 +227,7 @@ build_clang() { for use_vector in OFF ON; do case "$use_onedpl" in OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; - *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-DPSTL_USE_PARALLEL_POLICIES=0" ;; + *) dpl_conditional_flags="-DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector " run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DUSE_VECTOR=$use_vector" From aa82e57ba08ae59971b880e645f64d5583e15b7d Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 22:02:48 +0100 Subject: [PATCH 16/64] Fixup oneDPL dpcpp configuration Add conditional sync after each kernel. --- CMakeLists.txt | 9 ++++++++- src/dpl_shim.h | 4 ++++ src/std-data/STDDataStream.cpp | 5 +++++ src/std-data/model.cmake | 2 +- src/std-indices/STDIndicesStream.cpp | 5 +++++ src/std-indices/model.cmake | 2 +- src/std-ranges/STDRangesStream.cpp | 5 +++++ src/std-ranges/model.cmake | 2 +- 8 files changed, 30 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb9e57b2..7c137461 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -121,8 +121,15 @@ if (USE_ONEDPL) endmacro() endif () add_subdirectory(${onedpl_SOURCE_DIR} ${onedpl_BINARY_DIR} EXCLUDE_FROM_ALL) + + # Fixup oneDPL's omission on setting DPCPP definitions. + # We do this after the creation of the oneDPL target. + if (ONEDPL_BACKEND MATCHES "^(dpcpp|dpcpp_only)$") + target_compile_definitions(oneDPL INTERFACE ONEDPL_USE_DPCPP_BACKEND=1) + endif () + endif () -endif() +endif () # include our macros diff --git a/src/dpl_shim.h b/src/dpl_shim.h index e47ae99b..d341a591 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -34,6 +34,8 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } +static void sync_device(){exe_policy.queue().wait_and_throw(); } + #else // auto exe_policy = dpl::execution::seq; @@ -72,4 +74,6 @@ T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * si template void dealloc_raw(T *ptr) { free(ptr); } +static void sync_device(){ /*no-op*/ } + #endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index b6641dee..d4dc17f6 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -69,6 +69,7 @@ void STDDataStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + sync_device(); } template @@ -76,6 +77,7 @@ void STDDataStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); + sync_device(); } template @@ -83,6 +85,7 @@ void STDDataStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); + sync_device(); } template @@ -90,6 +93,7 @@ void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); + sync_device(); } template @@ -101,6 +105,7 @@ void STDDataStream::nstream() // 2: a[i] += scalar * c[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); + sync_device(); } diff --git a/src/std-data/model.cmake b/src/std-data/model.cmake index 3d2399d6..e1697b6d 100644 --- a/src/std-data/model.cmake +++ b/src/std-data/model.cmake @@ -35,7 +35,7 @@ register_flag_optional(USE_ONEDPL CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - SYCL - Implements policies through SYCL2020. + DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." "OFF") diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 9d98a1b0..04b78296 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -77,6 +77,7 @@ void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); + sync_device(); } template @@ -86,6 +87,7 @@ void STDIndicesStream::mul() std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [this, scalar = startScalar](int i) { return scalar * c[i]; }); + sync_device(); } template @@ -95,6 +97,7 @@ void STDIndicesStream::add() std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [this](int i) { return a[i] + b[i]; }); + sync_device(); } template @@ -104,6 +107,7 @@ void STDIndicesStream::triad() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); + sync_device(); } template @@ -116,6 +120,7 @@ void STDIndicesStream::nstream() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); + sync_device(); } diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake index befa9335..c2fef288 100644 --- a/src/std-indices/model.cmake +++ b/src/std-indices/model.cmake @@ -35,7 +35,7 @@ register_flag_optional(USE_ONEDPL CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - SYCL - Implements policies through SYCL2020. + DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." "OFF") diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 3ea32e41..8a77a682 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -89,6 +89,7 @@ void STDRangesStream::copy() c[i] = a[i]; } ); + sync_device(); } template @@ -103,6 +104,7 @@ void STDRangesStream::mul() b[i] = scalar * c[i]; } ); + sync_device(); } template @@ -115,6 +117,7 @@ void STDRangesStream::add() c[i] = a[i] + b[i]; } ); + sync_device(); } template @@ -129,6 +132,7 @@ void STDRangesStream::triad() a[i] = b[i] + scalar * c[i]; } ); + sync_device(); } template @@ -143,6 +147,7 @@ void STDRangesStream::nstream() a[i] += b[i] + scalar * c[i]; } ); + sync_device(); } template diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake index 268cc149..35554c77 100644 --- a/src/std-ranges/model.cmake +++ b/src/std-ranges/model.cmake @@ -19,7 +19,7 @@ register_flag_optional(USE_ONEDPL CMake will handle any flags needed to enable OpenMP if the compiler supports it. TBB - Implements policies using TBB. TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - SYCL - Implements policies through SYCL2020. + DPCPP - Implements policies through SYCL2020. This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." "OFF") From ed6206b54398f785ce3d7f2dfe048a98fd3d7a21 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Thu, 28 Jul 2022 23:45:43 +0100 Subject: [PATCH 17/64] Remove conditional sync after each kernel Don't capture `this`, capture each member instead --- src/dpl_shim.h | 4 ---- src/std-data/STDDataStream.cpp | 15 +++++---------- src/std-indices/STDIndicesStream.cpp | 23 +++++++++-------------- src/std-ranges/STDRangesStream.cpp | 15 +++++---------- 4 files changed, 19 insertions(+), 38 deletions(-) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index d341a591..e47ae99b 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -34,8 +34,6 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } -static void sync_device(){exe_policy.queue().wait_and_throw(); } - #else // auto exe_policy = dpl::execution::seq; @@ -74,6 +72,4 @@ T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * si template void dealloc_raw(T *ptr) { free(ptr); } -static void sync_device(){ /*no-op*/ } - #endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index d4dc17f6..7c71163e 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -69,32 +69,28 @@ void STDDataStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - sync_device(); -} + } template void STDDataStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); - sync_device(); -} + } template void STDDataStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); - sync_device(); -} + } template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); - sync_device(); -} + } template void STDDataStream::nstream() @@ -105,8 +101,7 @@ void STDDataStream::nstream() // 2: a[i] += scalar * c[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); - sync_device(); -} + } template diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 04b78296..f9397fab 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -77,38 +77,34 @@ void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - sync_device(); -} + } template void STDIndicesStream::mul() { // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [this, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) { return scalar * c[i]; }); - sync_device(); -} + } template void STDIndicesStream::add() { // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [this](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); - sync_device(); -} + } template void STDIndicesStream::triad() { // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); - sync_device(); -} + } template void STDIndicesStream::nstream() @@ -117,11 +113,10 @@ void STDIndicesStream::nstream() // Need to do in two stages with C++11 STL. // 1: a[i] += b[i] // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [this, scalar = startScalar](int i) { + std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); - sync_device(); -} + } template diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 8a77a682..9063ff20 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -89,8 +89,7 @@ void STDRangesStream::copy() c[i] = a[i]; } ); - sync_device(); -} + } template void STDRangesStream::mul() @@ -104,8 +103,7 @@ void STDRangesStream::mul() b[i] = scalar * c[i]; } ); - sync_device(); -} + } template void STDRangesStream::add() @@ -117,8 +115,7 @@ void STDRangesStream::add() c[i] = a[i] + b[i]; } ); - sync_device(); -} + } template void STDRangesStream::triad() @@ -132,8 +129,7 @@ void STDRangesStream::triad() a[i] = b[i] + scalar * c[i]; } ); - sync_device(); -} + } template void STDRangesStream::nstream() @@ -147,8 +143,7 @@ void STDRangesStream::nstream() a[i] += b[i] + scalar * c[i]; } ); - sync_device(); -} + } template T STDRangesStream::dot() From 72335f320e1976876d57c381abf731db19cb548e Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Fri, 29 Jul 2022 00:17:36 +0100 Subject: [PATCH 18/64] Revert to normal vector without allocators Prohibit vector type in indices --- src/dpl_shim.h | 10 ---------- src/std-data/STDDataStream.cpp | 12 ++++++------ src/std-data/STDDataStream.h | 2 +- src/std-indices/STDIndicesStream.cpp | 22 ++++++++++++++++------ src/std-indices/STDIndicesStream.h | 2 +- src/std-ranges/STDRangesStream.cpp | 12 ++++++------ src/std-ranges/STDRangesStream.hpp | 2 +- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/dpl_shim.h b/src/dpl_shim.h index e47ae99b..226693bd 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -23,11 +23,6 @@ const static auto exe_policy = oneapi::dpl::execution::device_policy<>{ oneapi::dpl::execution::make_device_policy(cl::sycl::default_selector{}) }; -template using Allocator = sycl::usm_allocator; - -template -constexpr Allocator alloc_vec() { return {exe_policy.queue()}; }; - template T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue()); } @@ -61,11 +56,6 @@ static constexpr auto exe_policy = std::execution::par_unseq; #ifdef USE_STD_PTR_ALLOC_DEALLOC -template using Allocator = std::allocator; - -template -constexpr Allocator alloc_vec() { return {}; }; - template T *alloc_raw(size_t size) { return (T *) aligned_alloc(ALIGNMENT, sizeof(T) * size); } diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 7c71163e..3d7ef18a 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -18,7 +18,7 @@ template STDDataStream::STDDataStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif @@ -69,28 +69,28 @@ void STDDataStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - } +} template void STDDataStream::mul() { // b[i] = scalar * c[i]; std::transform(exe_policy, BEGIN(c), END(c), BEGIN(b), [scalar = startScalar](T ci){ return scalar*ci; }); - } +} template void STDDataStream::add() { // c[i] = a[i] + b[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(c), std::plus()); - } +} template void STDDataStream::triad() { // a[i] = b[i] + scalar * c[i]; std::transform(exe_policy, BEGIN(b), END(b), BEGIN(c), BEGIN(a), [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); - } +} template void STDDataStream::nstream() @@ -101,7 +101,7 @@ void STDDataStream::nstream() // 2: a[i] += scalar * c[i]; std::transform(exe_policy, BEGIN(a), END(a), BEGIN(b), BEGIN(a), [](T ai, T bi){ return ai + bi; }); std::transform(exe_policy, BEGIN(a), END(a), BEGIN(c), BEGIN(a), [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); - } +} template diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index e50c95d8..911a621b 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -23,7 +23,7 @@ class STDDataStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector> a, b, c; + std::vector a, b, c; #else T *a, *b, *c; #endif diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index f9397fab..6ea3362e 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -22,11 +22,21 @@ #define END(x) ((x) + array_size) #endif +#ifdef USE_VECTOR +#if (defined(__NVCOMPILER) || defined(__NVCOMPILER_LLVM__)) +#error "std::vector *is* supported in NVHPC if we capture `this`, however, oneDPL (via SYCL2020) only works correctly with explicit *value* captures." +#endif + +#if defined(USE_ONEDPL) +#error "std::vector is unspported: oneDPL (via SYCL2020) only works correctly with explicit *value* captures" +#endif +#endif + template STDIndicesStream::STDIndicesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #ifdef USE_VECTOR - a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif @@ -77,7 +87,7 @@ void STDIndicesStream::copy() { // c[i] = a[i] std::copy(exe_policy, BEGIN(a), END(a), BEGIN(c)); - } +} template void STDIndicesStream::mul() @@ -86,7 +96,7 @@ void STDIndicesStream::mul() std::transform(exe_policy, range.begin(), range.end(), BEGIN(b), [c = this->c, scalar = startScalar](int i) { return scalar * c[i]; }); - } +} template void STDIndicesStream::add() @@ -95,7 +105,7 @@ void STDIndicesStream::add() std::transform(exe_policy, range.begin(), range.end(), BEGIN(c), [a = this->a, b = this->b](int i) { return a[i] + b[i]; }); - } +} template void STDIndicesStream::triad() @@ -104,7 +114,7 @@ void STDIndicesStream::triad() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [b = this->b, c = this->c, scalar = startScalar](int i) { return b[i] + scalar * c[i]; }); - } +} template void STDIndicesStream::nstream() @@ -116,7 +126,7 @@ void STDIndicesStream::nstream() std::transform(exe_policy, range.begin(), range.end(), BEGIN(a), [a = this->a, b = this->b, c = this->c, scalar = startScalar](int i) { return a[i] + b[i] + scalar * c[i]; }); - } +} template diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index a955374f..0916ef22 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -78,7 +78,7 @@ class STDIndicesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector> a, b, c; + std::vector a, b, c; #else T *a, *b, *c; #endif diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 9063ff20..a8a13490 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -26,7 +26,7 @@ template STDRangesStream::STDRangesStream(const int ARRAY_SIZE, int device) noexcept : array_size{ARRAY_SIZE}, #ifdef USE_VECTOR - a(ARRAY_SIZE, alloc_vec()), b(ARRAY_SIZE, alloc_vec()), c(ARRAY_SIZE, alloc_vec()) + a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) #else a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) #endif @@ -89,7 +89,7 @@ void STDRangesStream::copy() c[i] = a[i]; } ); - } +} template void STDRangesStream::mul() @@ -103,7 +103,7 @@ void STDRangesStream::mul() b[i] = scalar * c[i]; } ); - } +} template void STDRangesStream::add() @@ -115,7 +115,7 @@ void STDRangesStream::add() c[i] = a[i] + b[i]; } ); - } +} template void STDRangesStream::triad() @@ -129,7 +129,7 @@ void STDRangesStream::triad() a[i] = b[i] + scalar * c[i]; } ); - } +} template void STDRangesStream::nstream() @@ -143,7 +143,7 @@ void STDRangesStream::nstream() a[i] += b[i] + scalar * c[i]; } ); - } +} template T STDRangesStream::dot() diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 21902c6c..9d36d46b 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -22,7 +22,7 @@ class STDRangesStream : public Stream // Device side pointers #ifdef USE_VECTOR - std::vector> a, b, c; + std::vector a, b, c; #else T *a, *b, *c; #endif From 80853e66e07faa97779495a49f4f8f1fec5433f4 Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Sat, 30 Jul 2022 08:04:03 +0100 Subject: [PATCH 19/64] Don't include C++17 execution headers directly --- src/std-indices/STDIndicesStream.cpp | 4 ---- src/std-ranges/STDRangesStream.cpp | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 6ea3362e..6e135976 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -6,10 +6,6 @@ #include "STDIndicesStream.h" -#include -#include -#include - #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index a8a13490..e05a7d1c 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -6,10 +6,6 @@ #include "STDRangesStream.hpp" -#include -#include -#include - #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif From 370d378fbc3d8581482f69b221f8febd97da61ff Mon Sep 17 00:00:00 2001 From: Tom Lin Date: Wed, 17 Aug 2022 15:09:00 +0100 Subject: [PATCH 20/64] Don't use Kokkos internal headers Don't initialise kokkos view to zero in ctor Upgrade std to 17 for Kokkos (<17 is warning in 3.6, error is develop) --- src/kokkos/KokkosStream.cpp | 6 +++--- src/kokkos/KokkosStream.hpp | 2 -- src/kokkos/model.cmake | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index 00efe92c..04e0dafd 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -14,9 +14,9 @@ KokkosStream::KokkosStream( { Kokkos::initialize(); - d_a = new Kokkos::View("d_a", ARRAY_SIZE); - d_b = new Kokkos::View("d_b", ARRAY_SIZE); - d_c = new Kokkos::View("d_c", ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 3aa7cf5f..d7333a71 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -10,8 +10,6 @@ #include #include -#include -#include #include "Stream.h" diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 445991d4..46c773d9 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -17,7 +17,7 @@ set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) macro(setup) - set(CMAKE_CXX_STANDARD 14) + set(CMAKE_CXX_STANDARD 17) cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") From 11d2bef837fa8e88e9a04fd8da7f54689b4e9017 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Fri, 16 Jun 2023 19:53:41 +0000 Subject: [PATCH 21/64] OpenMP Instructions for Spack --- README.md | 30 ++++++++++++++++++++++++++++ docs/spack_instructions.md | 41 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 docs/spack_instructions.md diff --git a/README.md b/README.md index e4c2a152..7e38453f 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ This code was previously called GPU-STREAM. - [How is this different to STREAM?](#how-is-this-different-to-stream) - [Building](#building) - [CMake](#cmake) + - [Spack](#spack) - [GNU Make (removed)](#gnu-make) - [Results](#results) - [Contributing](#contributing) @@ -138,6 +139,35 @@ Alternatively, refer to the [CI script](./src/ci-test-compile.sh), which test-co *It is recommended that you delete the `build` directory when you change any of the build flags.* +### Spack + + +The project supports building with Spack >= 0.19.0, which can be installed without root via the [official GitHub repo](https://github.com/spack/spack). +The BabelStream Spack Package source code could be accessed from the link [here](https://github.com/spack/spack/tree/develop/var/spack/repos/builtin/packages/babelstream/package.py) +Each BabelStream implementation (programming model) is built as follows: + +```shell + +# Spack package installation starts with `spack install babelstream` for all programming models +# The programming model wish to be build needs to be specified with `+` option +# The model specific flags needs to be specified after defining model +$ spack install babelstream@% + + + +# The executables will be generated in: +# SPACK_INSTALL_DIRECTORY/opt/spack/system-name/compiler-name/babelstream-version-identifier/bin/ +# this address will be printed at the end of generation which could be easily copied +$ cd SPACK_INSTALL_DIRECTORY/opt/spack/system-name/compiler-name/babelstream-version-identifier/bin/ +$ ./-stream +``` +More detailed examples are provided in [Spack README file]() +The `MODEL` option selects one implementation of BabelStream to build. + +Currently available models are: +``` +omp;ocl;std;std20;hip;cuda;kokkos;sycl;sycl2020;acc;raja;tbb;thrust +``` + ### GNU Make Support for Make has been removed from 4.0 onwards. diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md new file mode 100644 index 00000000..b6187b31 --- /dev/null +++ b/docs/spack_instructions.md @@ -0,0 +1,41 @@ +# Spack Instructions + +## Table of contents +* [OpenMP](#omp) +* [OpenCL](#ocl) +* [STD](#std) +* [STD20](#std20) +* [Hip](#hip) +* [Cuda](#cuda) +* [Kokkos](#kokkos) +* [Sycl](#sycl) +* [Sycl2020](#) +* [ACC](#acc) +* [Raja](#raja) +* [Tbb](#tbb) +* [Thrust](#thrust) + +## OpenMP + +* There are 3 offloading options for OpenMP: NVIDIA, AMD and Intel. +* If a user provides a value for `cuda_arch`, the execution will be automatically offloaded to NVIDIA. +* If a user provides a value for `amdgpu_target`, the operation will be offloaded to AMD. +* In the absence of `cuda_arch` and `amdgpu_target`, the execution will be offloaded to Intel. + +| Flag | Definition | Options | +|-----------| ----------------------------------|-----------| +| cuda_arch | List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| `cuda_arch=70` | +|amdgpu_target| List of supported architectures are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/rocm.py#LL93C1-L125C19) | 'amdgpu_target=gfx701` | + +Example Commandss +```shell +# Example 1: for Intel offload + $ spack install babelstream%oneapi +omp + +# Example 2: for Nvidia GPU for Volta (sm_70) + $ spack install babelstream +omp cuda_arch=70 + +# Example 3: for AMD GPU gfx701 + $ spack install babelstream +omp amdgpu_target=gfx701 +``` + From acf70526c88a20209df323bc471c8c7972d39c9f Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Fri, 16 Jun 2023 21:24:06 +0000 Subject: [PATCH 22/64] OpenMP style update --- docs/spack_instructions.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index b6187b31..e1b0e96c 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -22,10 +22,10 @@ * If a user provides a value for `amdgpu_target`, the operation will be offloaded to AMD. * In the absence of `cuda_arch` and `amdgpu_target`, the execution will be offloaded to Intel. -| Flag | Definition | Options | -|-----------| ----------------------------------|-----------| -| cuda_arch | List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| `cuda_arch=70` | -|amdgpu_target| List of supported architectures are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/rocm.py#LL93C1-L125C19) | 'amdgpu_target=gfx701` | +| Flag | Definition | +|-----------| ----------------------------------| +| cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| +|amdgpu_target| List of supported architectures are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/rocm.py#LL93C1-L125C19) | Example Commandss ```shell From 22e9339a6ed9900278a3ad21e0da422e4be07a30 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Fri, 16 Jun 2023 22:03:55 +0000 Subject: [PATCH 23/64] OpenCL Instructions for Spack --- docs/spack_instructions.md | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index e1b0e96c..759f276a 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -27,7 +27,7 @@ | cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| |amdgpu_target| List of supported architectures are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/rocm.py#LL93C1-L125C19) | -Example Commandss + ```shell # Example 1: for Intel offload $ spack install babelstream%oneapi +omp @@ -39,3 +39,28 @@ Example Commandss $ spack install babelstream +omp amdgpu_target=gfx701 ``` + +## OpenCL + +* There are 4 different backend options for OpenCL : AMD,CUDA,INTEL, POCL +* No need to specify `amdgpu_target` or `cuda_arch` here since we are using AMD and CUDA as backend respectively. + + +| Flag | Definition | +|-----------| ----------------------------------| +| backend | 4 different backend options:
- cuda
- amd
- intel
- pocl | + + +```shell +# Example 1: CUDA backend + $ spack install babelstream%gcc +ocl backend=cuda + +# Example 2: AMD backend + $ spack install babelstream%gcc +ocl backend=amd + +# Example 3: Intel backend + $ spack install babelstream%gcc +ocl backend=intel + +# Example 4: POCL backend + $ spack install babelstream%gcc +ocl backend=pocl +``` \ No newline at end of file From 7715c0843b874d34761a60a63252ced816c76b34 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Fri, 16 Jun 2023 22:20:26 +0000 Subject: [PATCH 24/64] STD Instructions for Spack --- docs/spack_instructions.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index 759f276a..bbcdc4c1 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -42,7 +42,6 @@ ## OpenCL -* There are 4 different backend options for OpenCL : AMD,CUDA,INTEL, POCL * No need to specify `amdgpu_target` or `cuda_arch` here since we are using AMD and CUDA as backend respectively. @@ -63,4 +62,20 @@ # Example 4: POCL backend $ spack install babelstream%gcc +ocl backend=pocl +``` + +## STD +* Minimum GCC version requirement `10.1.0` +* NVHPC Offload will be added in the future release + +```shell +# Example 1: data + $ spack install babelstream +stddata + +# Example 2: ranges + $ spack install babelstream +stdranges + +# Example 3: indices + $ spack install babelstream +stdindices + ``` \ No newline at end of file From 406cc0010edb30b27f1111adf6eda292edcd8bcf Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 09:02:13 +0000 Subject: [PATCH 25/64] HIP Instructions for Spack --- docs/spack_instructions.md | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index bbcdc4c1..fa9f0fac 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -4,8 +4,7 @@ * [OpenMP](#omp) * [OpenCL](#ocl) * [STD](#std) -* [STD20](#std20) -* [Hip](#hip) +* [Hip(ROCM)](#hip) * [Cuda](#cuda) * [Kokkos](#kokkos) * [Sycl](#sycl) @@ -78,4 +77,30 @@ # Example 3: indices $ spack install babelstream +stdindices +``` + +## HIP(ROCM) + +* `amdgpu_target` and `flags` are optional here. + + +| Flag | Definition | +|-----------| ----------------------------------| +|amdgpu_target| List of supported architectures are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/rocm.py#LL93C1-L125C19) | +|flags | Extra flags to pass | + + + +```shell +# Example 1: ROCM default + $ spack install babelstream +rocm + +# Example 2: ROCM with GPU target + $ spack install babelstream +rocm amdgpu_target= + +# Example 3: ROCM with extra flags option + $ spack install babelstream +rocm flags= + +# Example 4: ROCM with GPU target and extra flags + $ spack install babelstream +rocm amdgpu_target= flags= ``` \ No newline at end of file From 9b131722355c02dba91436d386f802b2fbf9e65a Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 09:17:32 +0000 Subject: [PATCH 26/64] CUDA Instructions for Spack --- docs/spack_instructions.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index fa9f0fac..2309211d 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -1,5 +1,6 @@ # Spack Instructions + ## Table of contents * [OpenMP](#omp) * [OpenCL](#ocl) @@ -103,4 +104,29 @@ # Example 4: ROCM with GPU target and extra flags $ spack install babelstream +rocm amdgpu_target= flags= +``` + +## CUDA + +* The `cuda_arch` value is mandatory here. +* If a user provides a value for `mem`, device memory mode will be chosen accordingly +* If a user provides a value for `flags`, additional CUDA flags will be passed to NVCC +* In the absence of `mem` and `flags`, the execution will choose **DEFAULT** for device memory mode and no additional flags will be passed + + +| Flag | Definition | +|-----------| ----------------------------------| +| cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| +|mem| Device memory mode:
- **DEFAULT** allocate host and device memory pointers.
- **MANAGED** use CUDA Managed Memory.
- **PAGEFAULT** shared memory, only host pointers allocated | +|flags | Extra flags to pass | + +```shell +# Example 1: CUDA no mem and flags specified + $ spack install babelstream +cuda cuda_arch=70 + +# Example 2: for Nvidia GPU for Volta (sm_70) + $ spack install babelstream +cuda cuda_arch=70 mem=managed + +# Example 3: CUDA with mem and flags specified + $ spack install babelstream +cuda cuda_arch=70 mem=managed flags=xxx ``` \ No newline at end of file From c3dee4b64d42cc14cbaffa14fa7e599f2d308df8 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 10:08:01 +0000 Subject: [PATCH 27/64] Kokkos Instructions for Spack --- docs/spack_instructions.md | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index 2309211d..93a2aff2 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -122,11 +122,35 @@ ```shell # Example 1: CUDA no mem and flags specified - $ spack install babelstream +cuda cuda_arch=70 + $ spack install babelstream +cuda cuda_arch=<70> # Example 2: for Nvidia GPU for Volta (sm_70) - $ spack install babelstream +cuda cuda_arch=70 mem=managed + $ spack install babelstream +cuda cuda_arch=<70> mem= # Example 3: CUDA with mem and flags specified - $ spack install babelstream +cuda cuda_arch=70 mem=managed flags=xxx + $ spack install babelstream +cuda cuda_arch=<70> mem= flags= +``` + +## Kokkos + +* Kokkos implementation requires kokkos source folder to be provided because it builds it from the scratch + + +| Flag | Definition | +|-----------| ----------------------------------| +| dir | Download the kokkos release from github repository ( https://github.com/kokkos/kokkos ) and extract the zip file to a directory you want and target this directory with `dir` flag | +| backend | 2 different backend options:
- cuda
- omp | +| cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| + + +```shell +# Example 1: No Backend option specified + $ spack install babelstream +kokkos dir= + +# Example 2: CUDA backend + $ spack install babelstream +kokkos backend=cuda cuda_arch=70 dir= + +# Example 3: OMP backend + $ spack install babelstream +kokkos backend=omp dir= + ``` \ No newline at end of file From 2f4e6a587342cea0e7724708cb1144f725e70ae7 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 11:29:18 +0000 Subject: [PATCH 28/64] SYCL/SYCL2020 Instructions for Spack --- docs/spack_instructions.md | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index 93a2aff2..9bd6d7f9 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -153,4 +153,31 @@ # Example 3: OMP backend $ spack install babelstream +kokkos backend=omp dir= -``` \ No newline at end of file +``` + + +## SYCL2020 +* Instructions for installing the intel compilers are provided [here](https://spack.readthedocs.io/en/latest/build_systems/inteloneapipackage.html#building-a-package-with-icx) + +| Flag | Definition | +|-----------| ----------------------------------| +| implementation | 3 different implementation options:
- OneAPI-ICPX
- OneAPI-DPCPP
- Compute-CPP
| + +```shell +# Example 1: No implementation option specified (build for OneAPI-ICPX) + $ spack install babelstream%oneapi +sycl2020 + +# Example 2: OneAPI-DPCPP implementation + $ spack install babelstream +sycl2020 implementation=ONEAPI-DPCPP +``` + +## SYCL + +| Flag | Definition | +|-----------| ----------------------------------| +| implementation | 2 different implementation options:
- OneAPI-DPCPP
- Compute-CPP
| + +```shell +# Example 1: OneAPI-DPCPP implementation + $ spack install babelstream +sycl2020 implementation=ONEAPI-DPCPP +``` From 178763fd528ac35f5352d143248bbfa055e9002e Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 11:34:24 +0000 Subject: [PATCH 29/64] ACC Instructions for Spack --- docs/spack_instructions.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index 9bd6d7f9..1424a25f 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -181,3 +181,19 @@ # Example 1: OneAPI-DPCPP implementation $ spack install babelstream +sycl2020 implementation=ONEAPI-DPCPP ``` +## ACC +* Target device selection process is automatic with 2 options: + * **gpu** : Globally set the target device to an NVIDIA GPU automatically if `cuda_arch` is specified + * **multicore** : Globally set the target device to the host CPU automatically if `cpu_arch` is specified + +| Flag | Definition | +|-----------| ----------------------------------| +| cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| +| CPU_ARCH | This sets the `-tp` (target processor) flag, possible values are:
`px` - Generic x86 Processor
`bulldozer` - AMD Bulldozer processor
`piledriver` - AMD Piledriver processor
`zen` - AMD Zen architecture (Epyc, Ryzen)
`zen2` - AMD Zen 2 architecture (Ryzen 2)
`sandybridge` - Intel SandyBridge processor
`haswell` - Intel Haswell processor
`knl` - Intel Knights Landing processor
`skylake` - Intel Skylake Xeon processor
`host` - Link native version of HPC SDK cpu math library
`native` - Alias for -tp host | `cpu_arch=skylake` | +```shell +# Example 1: For GPU Run + $ spack install babelstream +acc cuda_arch=<70> + +# Example 2: For Multicore CPU Run + $ spack install babelstream +acc cpu_arch= +``` \ No newline at end of file From 61f23698e5d56c1e77d6695c9c86e6a0d978a10f Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 11:37:52 +0000 Subject: [PATCH 30/64] RAJA Instructions for Spack --- docs/spack_instructions.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index 1424a25f..a9e10b74 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -190,10 +190,25 @@ |-----------| ----------------------------------| | cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| | CPU_ARCH | This sets the `-tp` (target processor) flag, possible values are:
`px` - Generic x86 Processor
`bulldozer` - AMD Bulldozer processor
`piledriver` - AMD Piledriver processor
`zen` - AMD Zen architecture (Epyc, Ryzen)
`zen2` - AMD Zen 2 architecture (Ryzen 2)
`sandybridge` - Intel SandyBridge processor
`haswell` - Intel Haswell processor
`knl` - Intel Knights Landing processor
`skylake` - Intel Skylake Xeon processor
`host` - Link native version of HPC SDK cpu math library
`native` - Alias for -tp host | `cpu_arch=skylake` | + ```shell # Example 1: For GPU Run $ spack install babelstream +acc cuda_arch=<70> # Example 2: For Multicore CPU Run $ spack install babelstream +acc cpu_arch= +``` + +## RAJA +* RAJA implementation requires RAJA source folder to be provided because it builds it from the scratch + + +| Flag | Definition | +|-----------| ----------------------------------| +| dir | Download the Raja release from github repository and extract the zip file to a directory you want and target this directory with `dir` flag | +| backend | 2 different backend options:
- cuda
- omp | +|offload| Choose offloading platform `offload= [cpu]/[nvidia]` | +```shell +# Example 1: For CPU offload with backend OMP + $ spack install babelstream +raja offload=cpu backend=omp dir=/home/dir/raja ``` \ No newline at end of file From 0040130b67ba611ae259d51d3776fd072b77bfb8 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 11:38:50 +0000 Subject: [PATCH 31/64] TBB Instructions for Spack --- docs/spack_instructions.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index a9e10b74..2f995c99 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -208,7 +208,14 @@ | dir | Download the Raja release from github repository and extract the zip file to a directory you want and target this directory with `dir` flag | | backend | 2 different backend options:
- cuda
- omp | |offload| Choose offloading platform `offload= [cpu]/[nvidia]` | + ```shell # Example 1: For CPU offload with backend OMP $ spack install babelstream +raja offload=cpu backend=omp dir=/home/dir/raja +``` + +## TBB +```shell +# Example: + $ spack install babelstream +tbb ``` \ No newline at end of file From 217249ff38e114e252749a75396c3e10fc0aa2e9 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Sat, 17 Jun 2023 12:05:16 +0000 Subject: [PATCH 32/64] THRUST Instructions for Spack --- docs/spack_instructions.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/docs/spack_instructions.md b/docs/spack_instructions.md index 2f995c99..708deedd 100644 --- a/docs/spack_instructions.md +++ b/docs/spack_instructions.md @@ -218,4 +218,22 @@ ```shell # Example: $ spack install babelstream +tbb -``` \ No newline at end of file +``` + +## THRUST + +| Flag | Definition | +|-----------| ----------------------------------| +|implementation| Choose one of the implementation for Thrust. Options are `cuda` and `rocm` | `implementation = [cuda]/[rocm]` | +|backend| CUDA's Thrust implementation supports the following backends:- CUDA- OMP - TBB | +| cuda_arch |- List of supported compute capabilities are provided [here](https://github.com/spack/spack/blob/0f271883831bec6da3fc64c92eb1805c39a9f09a/lib/spack/spack/build_systems/cuda.py#LL19C1-L47C6)
- Useful [link](https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/) for matching CUDA gencodes with NVIDIA architectures| +|flags | Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH` | + +```shell +# Example1: CUDA implementation +$ spack install babelstream +thrust implementation=cuda backend=cuda cuda_arch=<70> flags=