diff --git a/CMakeLists.txt b/CMakeLists.txt index c03f5c78..88f9b732 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,9 +154,7 @@ include(cmake/register_models.cmake) register_model(serial SERIAL SerialStream.cpp) register_model(omp OMP OMPStream.cpp) register_model(ocl OCL OCLStream.cpp) -register_model(std-data STD_DATA STDDataStream.cpp) -register_model(std-indices STD_INDICES STDIndicesStream.cpp) -register_model(std-ranges STD_RANGES STDRangesStream.cpp) +register_model(std STD STDStream.cpp) register_model(hip HIP HIPStream.cpp) register_model(cuda CUDA CUDAStream.cu) register_model(kokkos KOKKOS KokkosStream.cpp) diff --git a/src/Stream.h b/src/Stream.h index c8c6af1c..f233f54a 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,11 +7,14 @@ #pragma once +#include #include #include #include #include "benchmark.h" +using std::intptr_t; + template class Stream { diff --git a/src/StreamModels.h b/src/StreamModels.h index b13d5b34..820c08a4 100644 --- a/src/StreamModels.h +++ b/src/StreamModels.h @@ -3,12 +3,8 @@ #if defined(CUDA) #include "CUDAStream.h" -#elif defined(STD_DATA) -#include "STDDataStream.h" -#elif defined(STD_INDICES) -#include "STDIndicesStream.h" -#elif defined(STD_RANGES) -#include "STDRangesStream.hpp" +#elif defined(STD) +#include "STDStream.h" #elif defined(TBB) #include "TBBStream.hpp" #elif defined(THRUST) @@ -63,17 +59,9 @@ std::unique_ptr> make_stream(Args... args) { // Use the Kokkos implementation return std::make_unique>(args...); -#elif defined(STD_DATA) +#elif defined(STD) // Use the C++ STD data-oriented implementation - return std::make_unique>(args...); - -#elif defined(STD_INDICES) - // Use the C++ STD index-oriented implementation - return std::make_unique>(args...); - -#elif defined(STD_RANGES) - // Use the C++ STD ranges implementation - return std::make_unique>(args...); + return std::make_unique>(args...); #elif defined(TBB) // Use the C++20 implementation diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 57b89afb..f3ba3000 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -152,9 +152,9 @@ build_gcc() { *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac # some distributions like Ubuntu bionic implements std par with TBB, so conditionally link it here - run_build $name "${GCC_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${GCC_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + run_build $name "${GCC_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${GCC_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -251,9 +251,10 @@ build_clang() { OFF) dpl_conditional_flags="-DCXX_EXTRA_LIBRARIES=${CLANG_STD_PAR_LIB:-}" ;; *) dpl_conditional_flags="-DFETCH_ONEDPL=ON -DFETCH_TBB=ON -DUSE_TBB=ON -DCXX_EXTRA_FLAGS=-D_GLIBCXX_USE_TBB_PAR_BACKEND=0" ;; esac - run_build $name "${CLANG_CXX:?}" std-data "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - run_build $name "${CLANG_CXX:?}" std-indices "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" - # run_build $name "${CLANG_CXX:?}" std-ranges "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl" # not yet supported + run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA17" + # Requires GCC 14 + # run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=DATA23" + run_build $name "${CLANG_CXX:?}" std "$cxx $dpl_conditional_flags -DUSE_ONEDPL=$use_onedpl -DSTDIMPL=INDICES" done run_build $name "${CLANG_CXX:?}" tbb "$cxx -DONE_TBB_DIR=$TBB_LIB" @@ -270,8 +271,10 @@ build_clang() { build_nvhpc() { local name="nvhpc_build" local cxx="-DCMAKE_CXX_COMPILER=${NVHPC_NVCXX:?}" - run_build $name "${NVHPC_NVCXX:?}" std-data "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" - run_build $name "${NVHPC_NVCXX:?}" std-indices "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA17" + # Requires GCC 14 + # run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=DATA23" + run_build $name "${NVHPC_NVCXX:?}" std "$cxx -DNVHPC_OFFLOAD=$NV_ARCH_CCXY -DSTDIMPL=INDICES" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=gpu -DTARGET_PROCESSOR=px -DCUDA_ARCH=$NV_ARCH_CCXY" run_build $name "${NVHPC_NVCXX:?}" acc "$cxx -DTARGET_DEVICE=multicore -DTARGET_PROCESSOR=zen" diff --git a/src/dpl_shim.h b/src/dpl_shim.h index 226693bd..9b8a7acc 100644 --- a/src/dpl_shim.h +++ b/src/dpl_shim.h @@ -29,12 +29,15 @@ T *alloc_raw(size_t size) { return sycl::malloc_shared(size, exe_policy.queue template void dealloc_raw(T *ptr) { sycl::free(ptr, exe_policy.queue()); } +#define WORKAROUND + #else // auto exe_policy = dpl::execution::seq; // auto exe_policy = dpl::execution::par; static constexpr auto exe_policy = dpl::execution::par_unseq; #define USE_STD_PTR_ALLOC_DEALLOC +#define WORKAROUND #endif diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp deleted file mode 100644 index 8c280f8a..00000000 --- a/src/std-data/STDDataStream.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -// Updated 2021 by University of Bristol -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDDataStream.h" - -template -STDDataStream::STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) - noexcept : array_size{array_size}, - a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif - init_arrays(initA, initB, initC); -} - -template -STDDataStream::~STDDataStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDDataStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDDataStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) -{ - h_a = a; - h_b = b; - h_c = c; -} - -template -void STDDataStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDDataStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, c, c + array_size, b, [scalar = startScalar](T ci){ return scalar*ci; }); -} - -template -void STDDataStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, a, a + array_size, b, c, std::plus()); -} - -template -void STDDataStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); -} - -template -void STDDataStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. - // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); - std::transform(exe_policy, a, a + array_size, c, a, [scalar = startScalar](T ai, T ci){ return ai + scalar*ci; }); -} - - -template -T STDDataStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDDataStream; -template class STDDataStream; diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp deleted file mode 100644 index 4f8efe20..00000000 --- a/src/std-indices/STDIndicesStream.cpp +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDIndicesStream.h" - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDIndicesStream::STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) -noexcept : array_size{array_size}, range(0, array_size), - a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif - init_arrays(initA, initB, initC); -} - -template -STDIndicesStream::~STDIndicesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDIndicesStream::init_arrays(T initA, T initB, T initC) -{ - std::fill(exe_policy, a, a + array_size, initA); - std::fill(exe_policy, b, b + array_size, initB); - std::fill(exe_policy, c, c + array_size, initC); -} - -template -void STDIndicesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) -{ - h_a = a; - h_b = b; - h_c = c; -} - -template -void STDIndicesStream::copy() -{ - // c[i] = a[i] - std::copy(exe_policy, a, a + array_size, c); -} - -template -void STDIndicesStream::mul() -{ - // b[i] = scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), b, [c = this->c, scalar = startScalar](intptr_t i) { - return scalar * c[i]; - }); -} - -template -void STDIndicesStream::add() -{ - // c[i] = a[i] + b[i]; - std::transform(exe_policy, range.begin(), range.end(), c, [a = this->a, b = this->b](intptr_t i) { - return a[i] + b[i]; - }); -} - -template -void STDIndicesStream::triad() -{ - // a[i] = b[i] + scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return b[i] + scalar * c[i]; - }); -} - -template -void STDIndicesStream::nstream() -{ - // a[i] += b[i] + scalar * c[i]; - // Need to do in two stages with C++11 STL. - // 1: a[i] += b[i] - // 2: a[i] += scalar * c[i]; - std::transform(exe_policy, range.begin(), range.end(), a, [a = this->a, b = this->b, c = this->c, scalar = startScalar](intptr_t i) { - return a[i] + b[i] + scalar * c[i]; - }); -} - - -template -T STDIndicesStream::dot() -{ - // sum = 0; sum += a[i]*b[i]; return sum; - return std::transform_reduce(exe_policy, a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} -template class STDIndicesStream; -template class STDIndicesStream; diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h deleted file mode 100644 index 7a43b1ec..00000000 --- a/src/std-indices/STDIndicesStream.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2021 Tom Deakin and Tom Lin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD (index-oriented)" - -// A lightweight counting iterator which will be used by the STL algorithms -// NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this -// implementation doesn't target -template -class ranged { -public: - class iterator { - friend class ranged; - public: - using difference_type = N; - using value_type = N; - using pointer = const N*; - using reference = N; - using iterator_category = std::random_access_iterator_tag; - - // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. - // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. - // This type is unused for any nother STL impl. - using is_passed_directly = std::true_type; - - reference operator *() const { return i_; } - iterator &operator ++() { ++i_; return *this; } - iterator operator ++(int) { iterator copy(*this); ++i_; return copy; } - - iterator &operator --() { --i_; return *this; } - iterator operator --(int) { iterator copy(*this); --i_; return copy; } - - iterator &operator +=(N by) { i_+=by; return *this; } - - value_type operator[](const difference_type &i) const { return i_ + i; } - - difference_type operator-(const iterator &it) const { return i_ - it.i_; } - iterator operator+(const value_type v) const { return iterator(i_ + v); } - - bool operator ==(const iterator &other) const { return i_ == other.i_; } - bool operator !=(const iterator &other) const { return i_ != other.i_; } - bool operator < (const iterator &other) const { return i_ < other.i_; } - - protected: - explicit iterator(N start) : i_ (start) {} - - private: - N i_; - }; - - [[nodiscard]] iterator begin() const { return begin_; } - [[nodiscard]] iterator end() const { return end_; } - ranged(N begin, N end) : begin_(begin), end_(end) {} -private: - iterator begin_; - iterator end_; -}; - -template -class STDIndicesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // induction range - ranged range; - - // Device side pointers - T *a, *b, *c; - - public: - STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) noexcept; - ~STDIndicesStream(); - - void copy() override; - void add() override; - void mul() override; - void triad() override; - void nstream() override; - T dot() override; - - void get_arrays(T const*& a, T const*& b, T const*& c) override; - void init_arrays(T initA, T initB, T initC); -}; - diff --git a/src/std-indices/model.cmake b/src/std-indices/model.cmake deleted file mode 100644 index 60ef575f..00000000 --- a/src/std-indices/model.cmake +++ /dev/null @@ -1,53 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." - "OFF") - -macro(setup) - set(CMAKE_CXX_STANDARD 17) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp deleted file mode 100644 index 02bd56b2..00000000 --- a/src/std-ranges/STDRangesStream.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#include "STDRangesStream.hpp" -#include - -#ifndef ALIGNMENT -#define ALIGNMENT (2*1024*1024) // 2MB -#endif - -template -STDRangesStream::STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) - noexcept : array_size{array_size}, - a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) -{ - std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; -#ifdef USE_ONEDPL - std::cout << "Using oneDPL backend: "; -#if ONEDPL_USE_DPCPP_BACKEND - std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; -#elif ONEDPL_USE_TBB_BACKEND - std::cout << "TBB " TBB_VERSION_STRING; -#elif ONEDPL_USE_OPENMP_BACKEND - std::cout << "OpenMP"; -#else - std::cout << "Default"; -#endif - std::cout << std::endl; -#endif - init_arrays(initA, initB, initC); -} - -template -STDRangesStream::~STDRangesStream() { - dealloc_raw(a); - dealloc_raw(b); - dealloc_raw(c); -} - -template -void STDRangesStream::init_arrays(T initA, T initB, T initC) -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, // loop range - [=, this] (intptr_t i) { - a[i] = initA; - b[i] = initB; - c[i] = initC; - } - ); -} - -template -void STDRangesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) -{ - h_a = a; - h_b = b; - h_c = c; -} - -template -void STDRangesStream::copy() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i]; - } - ); -} - -template -void STDRangesStream::mul() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - b[i] = scalar * c[i]; - } - ); -} - -template -void STDRangesStream::add() -{ - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - c[i] = a[i] + b[i]; - } - ); -} - -template -void STDRangesStream::triad() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] = b[i] + scalar * c[i]; - } - ); -} - -template -void STDRangesStream::nstream() -{ - const T scalar = startScalar; - - std::for_each_n( - exe_policy, - std::views::iota((intptr_t)0).begin(), array_size, - [=, this] (intptr_t i) { - a[i] += b[i] + scalar * c[i]; - } - ); -} - -template -T STDRangesStream::dot() -{ - // sum += a[i] * b[i]; - return - std::transform_reduce( - exe_policy, - a, a + array_size, b, T{}); -} - -void listDevices(void) -{ - std::cout << "C++20 does not expose devices" << std::endl; -} - -std::string getDeviceName(const int) -{ - return std::string("Device name unavailable"); -} - -std::string getDeviceDriver(const int) -{ - return std::string("Device driver unavailable"); -} - -template class STDRangesStream; -template class STDRangesStream; diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp deleted file mode 100644 index da04f1f4..00000000 --- a/src/std-ranges/STDRangesStream.hpp +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2020 Tom Deakin -// University of Bristol HPC -// -// For full license terms please see the LICENSE file distributed with this -// source code - -#pragma once -#include "dpl_shim.h" - -#include -#include -#include "Stream.h" - -#define IMPLEMENTATION_STRING "STD C++ ranges" - -template -class STDRangesStream : public Stream -{ - protected: - // Size of arrays - intptr_t array_size; - - // Device side pointers - T *a, *b, *c; - - public: - STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, - T initA, T initB, T initC) noexcept; - ~STDRangesStream(); - - void copy() override; - void add() override; - void mul() override; - void triad() override; - void nstream() override; - T dot() override; - - void get_arrays(T const*& a, T const*& b, T const*& c) override; - void init_arrays(T initA, T initB, T initC); -}; - diff --git a/src/std-ranges/model.cmake b/src/std-ranges/model.cmake deleted file mode 100644 index d7fd6a8b..00000000 --- a/src/std-ranges/model.cmake +++ /dev/null @@ -1,68 +0,0 @@ - -register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and supports C++20 Ranges" - "c++") - -register_flag_optional(NVHPC_OFFLOAD - "Enable offloading support (via the non-standard `-stdpar=gpu`) for the new NVHPC SDK. - The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) - - Possible values are: - cc35 - Compile for compute capability 3.5 - cc50 - Compile for compute capability 5.0 - cc60 - Compile for compute capability 6.0 - cc62 - Compile for compute capability 6.2 - cc70 - Compile for compute capability 7.0 - cc72 - Compile for compute capability 7.2 - cc75 - Compile for compute capability 7.5 - cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") - -register_flag_optional(USE_TBB - "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." - "OFF") - -register_flag_optional(USE_ONEDPL - "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. - - Possible values are: - OPENMP - Implements policies using OpenMP. - CMake will handle any flags needed to enable OpenMP if the compiler supports it. - TBB - Implements policies using TBB. - TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. - DPCPP - Implements policies through SYCL2020. - This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." - "OFF") - -macro(setup) - - # TODO this needs to eventually be removed when CMake adds proper C++20 support or at least update the flag used here - - # C++ 2a is too new, disable CMake's std flags completely: - set(CMAKE_CXX_EXTENSIONS OFF) - set(CMAKE_CXX_STANDARD_REQUIRED OFF) - unset(CMAKE_CXX_STANDARD) # drop any existing standard we have set by default - # and append our own: - register_append_cxx_flags(ANY -std=c++20) - if (NVHPC_OFFLOAD) - set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) - # propagate flags to linker so that it links with the gpu stuff as well - register_append_cxx_flags(ANY ${NVHPC_FLAGS}) - register_append_link_flags(${NVHPC_FLAGS}) - endif () - if (USE_TBB) - register_link_library(TBB::tbb) - endif () - if (USE_ONEDPL) - register_definitions(USE_ONEDPL) - register_link_library(oneDPL) - endif () -endmacro() - -macro(setup_target NAME) - if (USE_ONEDPL) - target_compile_features(${NAME} INTERFACE cxx_std_20) - target_compile_features(oneDPL INTERFACE cxx_std_20) - endif () -endmacro() diff --git a/src/std/STDStream.cpp b/src/std/STDStream.cpp new file mode 100644 index 00000000..678457ae --- /dev/null +++ b/src/std/STDStream.cpp @@ -0,0 +1,209 @@ +// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +// Updated 2021 by University of Bristol +// +// For full license terms please see the LICENSE file distributed with this +// source code + +#include "STDStream.h" +#include +#include + +#if defined(DATA20) || defined(INDICES) +#include +#endif + + // OneDPL workaround; TODO: remove this eventually +#include "dpl_shim.h" + +#ifdef INDICES +// NVHPC workaround: TODO: remove this eventually +#if defined(__NVCOMPILER) && defined(_NVHPC_STDPAR_GPU) +#define WORKAROUND +#include +auto counting_iter(intptr_t i) { return thrust::counting_iterator(i); } +auto counting_range(intptr_t b, intptr_t e) { + struct R { + thrust::counting_iterator b, e; + thrust::counting_iterator begin() { return b; } + thrust::counting_iterator end() { return e; } + }; + return R { .b = counting_iter(b), .e = counting_iter(e) }; +} +#else // NVHPC Workaround +auto counting_iter(intptr_t i) { return std::views::iota(i).begin(); } +auto counting_range(intptr_t b, intptr_t e) { return std::views::iota(b, e); } +#endif // NVHPC Workaround +#endif // INDICES + +template +STDStream::STDStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) +{ + std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; +#ifdef USE_ONEDPL + std::cout << "Using oneDPL backend: "; +#if ONEDPL_USE_DPCPP_BACKEND + std::cout << "SYCL USM (device=" << exe_policy.queue().get_device().get_info() << ")"; +#elif ONEDPL_USE_TBB_BACKEND + std::cout << "TBB " TBB_VERSION_STRING; +#elif ONEDPL_USE_OPENMP_BACKEND + std::cout << "OpenMP"; +#else + std::cout << "Default"; +#endif + std::cout << std::endl; +#endif + +#ifdef WORKAROUND + std::cout << "Non-conforming implementation: requires non-portable workarounds to run STREAM" << std::endl; +#endif + init_arrays(initA, initB, initC); +} + +template +STDStream::~STDStream() { + dealloc_raw(a); + dealloc_raw(b); + dealloc_raw(c); +} + +template +void STDStream::init_arrays(T initA, T initB, T initC) +{ + std::fill_n(exe_policy, a, array_size, initA); + std::fill_n(exe_policy, b, array_size, initB); + std::fill_n(exe_policy, c, array_size, initC); +} + +template +void STDStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) +{ + h_a = a; + h_b = b; + h_c = c; +} + +template +void STDStream::copy() +{ + // c[i] = a[i] +#if defined(DATA17) || defined(DATA20) + std::copy(exe_policy, a, a + array_size, c); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,c=c](intptr_t i) { + c[i] = a[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::mul() +{ + // b[i] = scalar * c[i]; +#if defined(DATA17) || defined(DATA20) + std::transform(exe_policy, c, c + array_size, b, [](T ci){ return startScalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [b=b, c=c](intptr_t i) { + b[i] = startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::add() +{ + // c[i] = a[i] + b[i]; +#if defined(DATA17) || defined(DATA20) + std::transform(exe_policy, a, a + array_size, b, c, std::plus()); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + c[i] = a[i] + b[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::triad() +{ + // a[i] = b[i] + scalar * c[i]; +#if defined(DATA17) || defined(DATA20) + std::transform(exe_policy, b, b + array_size, c, a, [scalar = startScalar](T bi, T ci){ return bi+scalar*ci; }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a, b=b, c=c](intptr_t i) { + a[i] = b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + +template +void STDStream::nstream() +{ + // a[i] += b[i] + scalar * c[i]; +#if defined(DATA17) || defined(DATA20) // Until we can require GCC 14.1 + // Need to do in two round-trips with C++17 STL. + // 1: a[i] += b[i] + // 2: a[i] += scalar * c[i]; + std::transform(exe_policy, a, a + array_size, b, a, [](T ai, T bi){ return ai + bi; }); + std::transform(exe_policy, a, a + array_size, c, a, [](T ai, T ci){ return ai + startScalar*ci; }); +#elif DATA20 + // Requires GCC 14.1 (Ubuntu 24.04): + auto as = std::ranges::subrange(a, a + array_size); + auto bs = std::ranges::subrange(b, b + array_size); + auto cs = std::ranges::subrange(c, c + array_size); + auto r = std::views::zip(as, bs, cs); + std::transform(exe_policy, r.begin(), r.end(), a, [](auto vs) { + auto [a, b, c] = vs; + return a + b + startScalar * c; + }); +#elif INDICES + std::for_each_n(exe_policy, counting_iter(0), array_size, [a=a,b=b,c=c](intptr_t i) { + a[i] += b[i] + startScalar * c[i]; + }); +#else + #error unimplemented +#endif +} + + +template +T STDStream::dot() +{ +#if defined(DATA17) || defined(DATA20) + // sum = 0; sum += a[i] * b[i]; return sum; + return std::transform_reduce(exe_policy, a, a + array_size, b, T{0}); +#elif INDICES + auto r = counting_range(intptr_t(0), array_size); + return std::transform_reduce(exe_policy, r.begin(), r.end(), T{0}, std::plus{}, [a=a, b=b](intptr_t i) { + return a[i] * b[i]; + }); +#else + #error unimplemented +#endif +} + +void listDevices(void) +{ + std::cout << "Listing devices is not supported by the Parallel STL" << std::endl; +} + +std::string getDeviceName(const int) +{ + return std::string("Device name unavailable"); +} + +std::string getDeviceDriver(const int) +{ + return std::string("Device driver unavailable"); +} +template class STDStream; +template class STDStream; diff --git a/src/std-data/STDDataStream.h b/src/std/STDStream.h similarity index 69% rename from src/std-data/STDDataStream.h rename to src/std/STDStream.h index 6db998b2..aa26eb33 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std/STDStream.h @@ -5,17 +5,26 @@ // source code #pragma once -#include "dpl_shim.h" #include #include #include "Stream.h" -#define IMPLEMENTATION_STRING "STD (data-oriented)" +#ifdef DATA17 +#define STDIMPL "DATA17" +#elif DATA20 +#define STDIMPL "DATA20" +#elif INDICES +#define STDIMPL "INDICES" +#else +#error unimplemented +#endif + +#define IMPLEMENTATION_STRING "STD (" STDIMPL ")" template -class STDDataStream : public Stream +class STDStream : public Stream { protected: // Size of arrays @@ -25,9 +34,9 @@ class STDDataStream : public Stream T *a, *b, *c; public: - STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, + STDStream(BenchId bs, const intptr_t array_size, const int device_id, T initA, T initB, T initC) noexcept; - ~STDDataStream(); + ~STDStream(); void copy() override; void add() override; diff --git a/src/std-data/model.cmake b/src/std/model.cmake similarity index 76% rename from src/std-data/model.cmake rename to src/std/model.cmake index 837d26bf..b133a42e 100644 --- a/src/std-data/model.cmake +++ b/src/std/model.cmake @@ -1,4 +1,3 @@ - register_flag_optional(CMAKE_CXX_COMPILER "Any CXX compiler that is supported by CMake detection" "c++") @@ -16,8 +15,10 @@ register_flag_optional(NVHPC_OFFLOAD cc72 - Compile for compute capability 7.2 cc75 - Compile for compute capability 7.5 cc80 - Compile for compute capability 8.0 - ccall - Compile for all supported compute capabilities" - "") + cc90 - Compile for compute capability 8.0 + ccall - Compile for all supported compute capabilities + ccnative - Compiles for compute capability of current device" + "") register_flag_optional(USE_TBB "No-op if ONE_TBB_DIR is set. Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." @@ -35,8 +36,23 @@ register_flag_optional(USE_ONEDPL This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." "OFF") +register_flag_optional(STDIMPL + "Implementation strategy (default = DATA20): + DATA17 - Parallel algorithms over data (requires C++17). + DATA23 - (default) Parallel algorithms over data (requires C++20). + INDICES - Parallel algorithms over indices (requires C++20)." + "DATA20" +) + macro(setup) - set(CMAKE_CXX_STANDARD 17) + register_definitions(${STDIMPL}) + if (${STDIMPL} STREQUAL "DATA17") + set(CMAKE_CXX_STANDARD 17) + else if (${STDIMPL} STREQUAL "INDICES") + set(CMAKE_CXX_STANDARD 20) + else if (${STDIMPL} STREQUAL "DATA23") + set(CMAKE_CXX_STANDARD 23) + endif () if (NVHPC_OFFLOAD) set(NVHPC_FLAGS -stdpar=gpu -gpu=${NVHPC_OFFLOAD}) # propagate flags to linker so that it links with the gpu stuff as well