diff --git a/CHANGELOG.md b/CHANGELOG.md index 972b2edca..44439a339 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,7 +36,12 @@ Documentation for rocPRIM is available at * `rocprim::thread_load` and `rocprim::thread_store`, use dereference instead. Not all of those functions are available on every device architecture, and their usage can hurt performance, because inline assembly inhibits optimizations. -## Unreleased rocPRIM-3.2.0 for ROCm 6.2.0 +## rocPRIM-3.2.1 for ROCm 6.2.1 + +### Optimizations +* Improved performance of block_reduce_warp_reduce when warp size == block size. + +## rocPRIM-3.2.0 for ROCm 6.2.0 ### Additions diff --git a/CMakeLists.txt b/CMakeLists.txt index cba75281c..a5b9b1274 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,11 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +if(DEFINED BUILD_SHARED_LIBS) + set(PKG_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) +else() + set(PKG_BUILD_SHARED_LIBS ON) +endif() set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared if(NOT USE_HIP_CPU) # Get dependencies (required here to get rocm-cmake) @@ -172,16 +177,22 @@ if(BUILD_DOCS AND NOT ONLY_INSTALL) add_subdirectory(docs) endif() +# set BUILD_SHARED_LIBS for packaging +set(BUILD_SHARED_LIBS ${PKG_BUILD_SHARED_LIBS}) # Package if (ROCPRIM_PROJECT_IS_TOP_LEVEL) - set(BUILD_SHARED_LIBS ON) # Build as though shared library for naming + # add dependency on HIP runtime + set(HIP_RUNTIME_MINIMUM 4.5.0) if(BUILD_ADDRESS_SANITIZER) set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" ) else() set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" ) endif() - rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0") + rocm_package_add_dependencies(SHARED_DEPENDS "${DEPENDS_HIP_RUNTIME} >= ${HIP_RUNTIME_MINIMUM}") + rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}") + rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}") + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") set(CPACK_RPM_PACKAGE_LICENSE "MIT") diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index b2367ef22..38696bb10 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -36,7 +36,7 @@ docutils==0.21.2 # myst-parser # pydata-sphinx-theme # sphinx -fastjsonschema==2.19.1 +fastjsonschema==2.20.0 # via rocm-docs-core gitdb==4.0.11 # via gitpython @@ -62,13 +62,13 @@ mdurl==0.1.2 # via markdown-it-py myst-parser==3.0.1 # via rocm-docs-core -packaging==24.0 +packaging==24.1 # via # pydata-sphinx-theme # sphinx pycparser==2.22 # via cffi -pydata-sphinx-theme==0.15.3 +pydata-sphinx-theme==0.15.4 # via # rocm-docs-core # sphinx-book-theme @@ -111,7 +111,7 @@ sphinx==7.3.7 # sphinx-design # sphinx-external-toc # sphinx-notfound-page -sphinx-book-theme==1.1.2 +sphinx-book-theme==1.1.3 # via rocm-docs-core sphinx-copybutton==0.5.2 # via rocm-docs-core @@ -135,7 +135,7 @@ sphinxcontrib-serializinghtml==1.1.10 # via sphinx tomli==2.0.1 # via sphinx -typing-extensions==4.12.0 +typing-extensions==4.12.2 # via # pydata-sphinx-theme # pygithub diff --git a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp index 11bf18cdb..2ffc7437d 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp @@ -180,21 +180,25 @@ class block_reduce_warp_reduce input, output, num_valid, reduce_op ); - // i-th warp will have its partial stored in storage_.warp_partials[i-1] - if(lane_id == 0) + // Final reduction across warps is only required if there is more than 1 warp + if ROCPRIM_IF_CONSTEXPR (warps_no_ > 1) { - storage_.warp_partials[warp_id] = output; - } - ::rocprim::syncthreads(); - - if(flat_tid < warps_no_) - { - // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; - - warp_reduce( - warp_partial, output, warps_no_, reduce_op - ); + // i-th warp will have its partial stored in storage_.warp_partials[i-1] + if(lane_id == 0) + { + storage_.warp_partials[warp_id] = output; + } + ::rocprim::syncthreads(); + + if(flat_tid < warps_no_) + { + // Use warp partial to calculate the final reduce results for every thread + auto warp_partial = storage_.warp_partials[lane_id]; + + warp_reduce( + warp_partial, output, warps_no_, reduce_op + ); + } } } @@ -246,22 +250,26 @@ class block_reduce_warp_reduce input, output, num_valid, reduce_op ); - // i-th warp will have its partial stored in storage_.warp_partials[i-1] - if(lane_id == 0) + // Final reduction across warps is only required if there is more than 1 warp + if ROCPRIM_IF_CONSTEXPR (warps_no_ > 1) { - storage_.warp_partials[warp_id] = output; - } - ::rocprim::syncthreads(); - - if(flat_tid < warps_no_) - { - // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; - - unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_; - warp_reduce_output_type().reduce( - warp_partial, output, valid_warps_no, reduce_op - ); + // i-th warp will have its partial stored in storage_.warp_partials[i-1] + if(lane_id == 0) + { + storage_.warp_partials[warp_id] = output; + } + ::rocprim::syncthreads(); + + if(flat_tid < warps_no_) + { + // Use warp partial to calculate the final reduce results for every thread + auto warp_partial = storage_.warp_partials[lane_id]; + + unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_; + warp_reduce_output_type().reduce( + warp_partial, output, valid_warps_no, reduce_op + ); + } } } };