diff --git a/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh b/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh index c861a41..a90571d 100644 --- a/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh +++ b/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh @@ -41,6 +41,7 @@ #include "../block/block_store.cuh" #include "../block/block_radix_rank.cuh" #include "../block/block_exchange.cuh" +#include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_type.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -56,16 +57,6 @@ namespace cub { * Tuning policy types ******************************************************************************/ -/** - * Radix ranking algorithm - */ -enum RadixRankAlgorithm -{ - RADIX_RANK_BASIC, - RADIX_RANK_MEMOIZE, - RADIX_RANK_MATCH -}; - /** * Parameterizable tuning policy type for AgentRadixSortDownsweep */ @@ -137,6 +128,9 @@ struct AgentRadixSortDownsweep RADIX_DIGITS = 1 << RADIX_BITS, KEYS_ONLY = Equals::VALUE, + LOAD_WARP_STRIPED = RANK_ALGORITHM == RADIX_RANK_MATCH || + RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY || + RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, }; // Input iterator wrapper type (for applying cache modifier)s @@ -148,10 +142,22 @@ struct AgentRadixSortDownsweep BlockRadixRank, typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), BlockRadixRank, - BlockRadixRankMatch + typename If<(RANK_ALGORITHM == RADIX_RANK_MATCH), + BlockRadixRankMatch, + typename If<(RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY), + BlockRadixRankMatchEarlyCounts, + BlockRadixRankMatchEarlyCounts + >::Type + >::Type >::Type >::Type BlockRadixRankT; + // Digit extractor type + typedef BFEDigitExtractor DigitExtractorT; + + enum { /// Number of bin-starting offsets tracked per thread @@ -184,11 +190,11 @@ struct AgentRadixSortDownsweep typename BlockLoadValuesT::TempStorage load_values; typename BlockRadixRankT::TempStorage radix_rank; - struct + struct KeysAndOffsets { UnsignedBits exchange_keys[TILE_ITEMS]; OffsetT relative_bin_offsets[RADIX_DIGITS]; - }; + } keys_and_offsets; Uninitialized exchange_values; @@ -216,11 +222,8 @@ struct AgentRadixSortDownsweep // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; - // The least-significant bit position of the current digit to extract - int current_bit; - - // Number of bits in current digit - int num_bits; + // Digit extractor + DigitExtractorT digit_extractor; // Whether to short-cirucit int short_circuit; @@ -243,7 +246,7 @@ struct AgentRadixSortDownsweep #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { - temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } CTA_SYNC(); @@ -251,9 +254,9 @@ struct AgentRadixSortDownsweep #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { - UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; - UnsignedBits digit = BFE(key, current_bit, num_bits); - relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + UnsignedBits key = temp_storage.keys_and_offsets.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = digit_extractor.Digit(key); + relative_bin_offsets[ITEM] = temp_storage.keys_and_offsets.relative_bin_offsets[digit]; // Un-twiddle key = Traits::TwiddleOut(key); @@ -303,16 +306,15 @@ struct AgentRadixSortDownsweep } 
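The hunks above fold the loose (current_bit, num_bits) pair into a single digit_extractor member and route every digit extraction through its Digit() method, which keeps the ranking and scatter paths agnostic of how the digit is pulled out of the key. A minimal sketch of the interface these call sites rely on, assuming the BFEDigitExtractor added in radix_rank_sort_operations.cuh takes the same constructor arguments and exposes a Digit() member as shown above (the real definition may differ in detail):

// Sketch only: stands in for cub::BFEDigitExtractor from
// block/radix_rank_sort_operations.cuh; not the actual implementation.
template <typename UnsignedBits>
struct BFEDigitExtractorSketch
{
    unsigned int bit_start;  // least-significant bit of the current digit
    unsigned int num_bits;   // number of bits in the current digit

    __device__ __forceinline__
    BFEDigitExtractorSketch(unsigned int bit_start, unsigned int num_bits)
        : bit_start(bit_start), num_bits(num_bits) {}

    // Replaces the former BFE(key, current_bit, num_bits) call sites.
    __device__ __forceinline__ unsigned int Digit(UnsignedBits key) const
    {
        return static_cast<unsigned int>(
            (key >> bit_start) & ((UnsignedBits(1) << num_bits) - 1));
    }
};

Any extractor with this shape can be passed to the DigitExtractorT-templated RankKeys() overloads introduced later in this patch, which is what allows the same ranking code to serve multiple extraction schemes.
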
/** - * Load a tile of keys (specialized for full tile, any ranking algorithm) + * Load a tile of keys (specialized for full tile, block load) */ - template __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { BlockLoadKeysT(temp_storage.load_keys).Load( d_keys_in + block_offset, keys); @@ -322,16 +324,15 @@ struct AgentRadixSortDownsweep /** - * Load a tile of keys (specialized for partial tile, any ranking algorithm) + * Load a tile of keys (specialized for partial tile, block load) */ - template __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -345,7 +346,7 @@ struct AgentRadixSortDownsweep /** - * Load a tile of keys (specialized for full tile, match ranking algorithm) + * Load a tile of keys (specialized for full tile, warp-striped load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], @@ -353,14 +354,13 @@ struct AgentRadixSortDownsweep OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); } - /** - * Load a tile of keys (specialized for partial tile, match ranking algorithm) + * Load a tile of keys (specialized for partial tile, warp-striped load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], @@ -368,7 +368,7 @@ struct AgentRadixSortDownsweep OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -377,17 +377,15 @@ struct AgentRadixSortDownsweep LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); } - /** - * Load a tile of values (specialized for full tile, any ranking algorithm) + * Load a tile of values (specialized for full tile, block load) */ - template __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { BlockLoadValuesT(temp_storage.load_values).Load( d_values_in + block_offset, values); @@ -397,15 +395,14 @@ struct AgentRadixSortDownsweep /** - * Load a tile of values (specialized for partial tile, any ranking algorithm) + * Load a tile of values (specialized for partial tile, block load) */ - template __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -419,28 +416,27 @@ struct AgentRadixSortDownsweep /** - * Load a tile of items (specialized for full tile, match ranking algorithm) + * Load a tile of items (specialized for full tile, 
warp-striped load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); } - /** - * Load a tile of items (specialized for partial tile, match ranking algorithm) + * Load a tile of items (specialized for partial tile, warp-striped load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -449,7 +445,6 @@ struct AgentRadixSortDownsweep LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); } - /** * Truck along associated values */ @@ -470,7 +465,7 @@ struct AgentRadixSortDownsweep block_offset, valid_items, Int2Type(), - Int2Type()); + Int2Type()); ScatterValues( values, @@ -515,7 +510,7 @@ struct AgentRadixSortDownsweep valid_items, default_key, Int2Type(), - Int2Type()); + Int2Type()); // Twiddle key bits if necessary #pragma unroll @@ -529,8 +524,7 @@ struct AgentRadixSortDownsweep BlockRadixRankT(temp_storage.radix_rank).RankKeys( keys, ranks, - current_bit, - num_bits, + digit_extractor, exclusive_digit_prefix); CTA_SYNC(); @@ -586,7 +580,7 @@ struct AgentRadixSortDownsweep if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] -= exclusive_digit_prefix[track]; - temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + temp_storage.keys_and_offsets.relative_bin_offsets[bin_idx] = bin_offset[track]; bin_offset[track] += inclusive_digit_prefix[track]; } } @@ -677,8 +671,7 @@ struct AgentRadixSortDownsweep d_values_in(d_values_in), d_keys_out(reinterpret_cast(d_keys_out)), d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), + digit_extractor(current_bit, num_bits), short_circuit(1) { #pragma unroll @@ -717,8 +710,7 @@ struct AgentRadixSortDownsweep d_values_in(d_values_in), d_keys_out(reinterpret_cast(d_keys_out)), d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), + digit_extractor(current_bit, num_bits), short_circuit(1) { #pragma unroll diff --git a/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh b/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh index c65773f..5865a60 100644 --- a/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh +++ b/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh @@ -37,6 +37,7 @@ #include "../thread/thread_load.cuh" #include "../warp/warp_reduce.cuh" #include "../block/block_load.cuh" +#include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_type.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -121,7 +122,7 @@ struct AgentRadixSortUpsweep PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // To prevent counter overflow, we must periodically unpack and aggregate the @@ -139,6 +140,9 @@ struct AgentRadixSortUpsweep // Input iterator wrapper type (for applying cache modifier)s typedef CacheModifiedInputIterator KeysItr; + // Digit extractor type + 
typedef BFEDigitExtractor DigitExtractorT; + /** * Shared memory storage layout */ @@ -167,12 +171,8 @@ struct AgentRadixSortUpsweep // Input and output device pointers KeysItr d_keys_in; - // The least-significant bit position of the current digit to extract - int current_bit; - - // Number of bits in current digit - int num_bits; - + // Digit extractor + DigitExtractorT digit_extractor; //--------------------------------------------------------------------- @@ -217,7 +217,7 @@ struct AgentRadixSortUpsweep UnsignedBits converted_key = Traits::TwiddleIn(key); // Extract current digit bits - UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + UnsignedBits digit = digit_extractor.Digit(converted_key); // Get sub-counter offset UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); @@ -342,8 +342,7 @@ struct AgentRadixSortUpsweep : temp_storage(temp_storage.Alias()), d_keys_in(reinterpret_cast(d_keys_in)), - current_bit(current_bit), - num_bits(num_bits) + digit_extractor(current_bit, num_bits) {} diff --git a/src/3rdparty/cub/agent/agent_scan.cuh b/src/3rdparty/cub/agent/agent_scan.cuh index 0781b3e..0abdb2b 100644 --- a/src/3rdparty/cub/agent/agent_scan.cuh +++ b/src/3rdparty/cub/agent/agent_scan.cuh @@ -100,12 +100,13 @@ struct AgentScan //--------------------------------------------------------------------- // The input value type - typedef typename std::iterator_traits::value_type InputT; + using InputT = typename std::iterator_traits::value_type; - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + // The output value type -- used as the intermediate accumulator + // Per https://wg21.link/P0571, use InitValueT if provided, otherwise the + // input iterator's value type. 
+ using OutputT = + typename If::VALUE, InputT, InitValueT>::Type; // Tile status descriptor interface type typedef ScanTileState ScanTileStateT; @@ -167,11 +168,11 @@ struct AgentScan typename BlockLoadT::TempStorage load; // Smem needed for tile loading typename BlockStoreT::TempStorage store; // Smem needed for tile storing - struct + struct ScanStorage { typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - }; + } scan_storage; }; // Alias wrapper allowing storage to be unioned @@ -204,7 +205,7 @@ struct AgentScan OutputT &block_aggregate, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); block_aggregate = scan_op(init_value, block_aggregate); } @@ -220,7 +221,7 @@ struct AgentScan OutputT &block_aggregate, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + BlockScanT(temp_storage.scan_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); } @@ -235,7 +236,7 @@ struct AgentScan PrefixCallback &prefix_op, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); } @@ -250,7 +251,7 @@ struct AgentScan PrefixCallback &prefix_op, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + BlockScanT(temp_storage.scan_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); } @@ -293,9 +294,19 @@ struct AgentScan OutputT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + { + // Fill last element with the first element because collectives are + // not suffix guarded. + BlockLoadT(temp_storage.load) + .Load(d_in + tile_offset, + items, + num_remaining, + *(d_in + tile_offset)); + } else + { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + } CTA_SYNC(); @@ -311,7 +322,7 @@ struct AgentScan else { // Scan non-first tile - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); ScanTile(items, scan_op, prefix_op, Int2Type()); } @@ -329,7 +340,7 @@ struct AgentScan * Scan tiles of items as part of a dynamic chained scan */ __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items + OffsetT num_items, ///< Total number of input items ScanTileStateT& tile_state, ///< Global tile state descriptor int start_tile) ///< The starting tile for the current grid { @@ -370,9 +381,19 @@ struct AgentScan OutputT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + { + // Fill last element with the first element because collectives are + // not suffix guarded. 
+ BlockLoadT(temp_storage.load) + .Load(d_in + tile_offset, + items, + valid_items, + *(d_in + tile_offset)); + } else + { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + } CTA_SYNC(); diff --git a/src/3rdparty/cub/block/block_exchange.cuh b/src/3rdparty/cub/block/block_exchange.cuh index 35a0333..73bc629 100644 --- a/src/3rdparty/cub/block/block_exchange.cuh +++ b/src/3rdparty/cub/block/block_exchange.cuh @@ -102,6 +102,13 @@ namespace cub { * \par Performance Considerations * - Proper device-specific padding ensures zero bank conflicts for most types. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockExchange. */ template < typename InputT, @@ -472,7 +479,7 @@ private: { int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; + new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]); } WARP_SYNC(0xffffffff); @@ -482,7 +489,7 @@ private: { int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; + new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]); } } diff --git a/src/3rdparty/cub/block/block_load.cuh b/src/3rdparty/cub/block/block_load.cuh index fc91f11..d8dd961 100644 --- a/src/3rdparty/cub/block/block_load.cuh +++ b/src/3rdparty/cub/block/block_load.cuh @@ -34,6 +34,7 @@ #pragma once #include +#include #include "block_exchange.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -364,7 +365,7 @@ __device__ __forceinline__ void LoadDirectWarpStriped( #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]; + new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } @@ -401,7 +402,7 @@ __device__ __forceinline__ void LoadDirectWarpStriped( { if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) { - items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]; + new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } } @@ -472,6 +473,18 @@ enum BlockLoadAlgorithm */ BLOCK_LOAD_DIRECT, + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_STRIPED, + /** * \par Overview * @@ -507,7 +520,6 @@ enum BlockLoadAlgorithm */ BLOCK_LOAD_TRANSPOSE, - /** * \par Overview * @@ -528,7 +540,6 @@ enum BlockLoadAlgorithm */ BLOCK_LOAD_WARP_TRANSPOSE, - /** * \par Overview * @@ -572,6 +583,8 @@ enum BlockLoadAlgorithm * - BlockLoad can be optionally specialized by different data movement strategies: * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) +* -# cub::BLOCK_LOAD_STRIPED,. 
A [striped arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory using CUDA's built-in vectorized loads as a * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) @@ -616,6 +629,13 @@ enum BlockLoadAlgorithm * The set of \p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockLoad. */ template < typename InputT, @@ -703,6 +723,59 @@ private: }; + /** + * BLOCK_LOAD_STRIPED specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + /** * BLOCK_LOAD_VECTORIZE specialization of load helper */ @@ -865,7 +938,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -940,7 +1013,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -1223,6 +1296,17 @@ public: }; +template ::value_type> +struct BlockLoadType +{ + using type = 
cub::BlockLoad; +}; + } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/src/3rdparty/cub/block/block_radix_rank.cuh b/src/3rdparty/cub/block/block_radix_rank.cuh index a98976f..cf54d5f 100644 --- a/src/3rdparty/cub/block/block_radix_rank.cuh +++ b/src/3rdparty/cub/block/block_radix_rank.cuh @@ -38,6 +38,7 @@ #include "../thread/thread_reduce.cuh" #include "../thread/thread_scan.cuh" #include "../block/block_scan.cuh" +#include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -49,6 +50,52 @@ CUB_NS_PREFIX /// CUB namespace namespace cub { + +/** + * \brief Radix ranking algorithm, the algorithm used to implement stable ranking of the + * keys from a single tile. Note that different ranking algorithms require different + * initial arrangements of keys to function properly. + */ +enum RadixRankAlgorithm +{ + /** Ranking using the BlockRadixRank algorithm with MEMOIZE_OUTER_SCAN == false. It + * uses thread-private histograms, and thus uses more shared memory. Requires blocked + * arrangement of keys. Does not support count callbacks. */ + RADIX_RANK_BASIC, + /** Ranking using the BlockRadixRank algorithm with MEMOIZE_OUTER_SCAN == + * true. Similar to RADIX_RANK BASIC, it requires blocked arrangement of + * keys and does not support count callbacks.*/ + RADIX_RANK_MEMOIZE, + /** Ranking using the BlockRadixRankMatch algorithm. It uses warp-private + * histograms and matching for ranking the keys in a single warp. Therefore, + * it uses less shared memory compared to RADIX_RANK_BASIC. It requires + * warp-striped key arrangement and supports count callbacks. */ + RADIX_RANK_MATCH, + /** Ranking using the BlockRadixRankMatchEarlyCounts algorithm with + * MATCH_ALGORITHM == WARP_MATCH_ANY. An alternative implementation of + * match-based ranking that computes bin counts early. Because of this, it + * works better with onesweep sorting, which requires bin counts for + * decoupled look-back. Assumes warp-striped key arrangement and supports + * count callbacks.*/ + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, + /** Ranking using the BlockRadixRankEarlyCounts algorithm with + * MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR. It uses extra space in shared + * memory to generate warp match masks using atomicOr(). This is faster when + * there are few matches, but can lead to slowdowns if the number of + * matching keys among warp lanes is high. Assumes warp-striped key + * arrangement and supports count callbacks. */ + RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR +}; + + +/** Empty callback implementation */ +template +struct BlockRadixRankEmptyCallback +{ + __device__ __forceinline__ void operator()(int (&bins)[BINS_PER_THREAD]) {} +}; + + /** * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. * \ingroup BlockModule @@ -82,6 +129,14 @@ namespace cub { * { * * \endcode + * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockRadixRank. 
*/ template < int BLOCK_DIM_X, @@ -126,7 +181,7 @@ private: PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) @@ -346,12 +401,12 @@ public: */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit + DigitExtractorT digit_extractor) ///< [in] The digit extractor { DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem @@ -363,7 +418,7 @@ public: for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Get digit - unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + unsigned int digit = digit_extractor.Digit(keys[ITEM]); // Get sub-counter unsigned int sub_counter = digit >> LOG_COUNTER_LANES; @@ -395,6 +450,7 @@ public: CTA_SYNC(); // Extract the local ranks of each key + #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Add in thread block exclusive prefix @@ -408,16 +464,16 @@ public: */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit + DigitExtractorT digit_extractor, ///< [in] The digit extractor int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] { // Rank keys - RankKeys(keys, ranks, current_bit, num_bits); + RankKeys(keys, ranks, digit_extractor); // Get the inclusive and exclusive digit totals corresponding to the calling thread. #pragma unroll @@ -565,17 +621,63 @@ public: *********************************************************************/ //@{ + /** \brief Computes the count of keys for each digit value, and calls the + * callback with the array of key counts. + + * @tparam CountsCallback The callback type. It should implement an instance + * overload of operator()(int (&bins)[BINS_TRACKED_PER_THREAD]), where bins + * is an array of key counts for each digit value distributed in block + * distribution among the threads of the thread block. Key counts can be + * used, to update other data structures in global or shared + * memory. 
Depending on the implementation of the ranking algoirhtm + * (see BlockRadixRankMatchEarlyCounts), key counts may become available + * early, therefore, they are returned through a callback rather than a + * separate output parameter of RankKeys(). + */ + template + __device__ __forceinline__ void CallBack(CountsCallback callback) + { + int bins[BINS_TRACKED_PER_THREAD]; + // Get count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + const int TILE_ITEMS = KEYS_PER_THREAD * BLOCK_THREADS; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + bin_idx = RADIX_DIGITS - bin_idx - 1; + bins[track] = (bin_idx > 0 ? + temp_storage.aliasable.warp_digit_counters[bin_idx - 1][0] : TILE_ITEMS) - + temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + else + { + bins[track] = (bin_idx < RADIX_DIGITS - 1 ? + temp_storage.aliasable.warp_digit_counters[bin_idx + 1][0] : TILE_ITEMS) - + temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } + callback(bins); + } + /** * \brief Rank keys. */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT, + typename CountsCallback> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit + DigitExtractorT digit_extractor, ///< [in] The digit extractor + CountsCallback callback) { // Initialize shared digit counters @@ -595,7 +697,7 @@ public: for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // My digit - uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + uint32_t digit = digit_extractor.Digit(keys[ITEM]); if (IS_DESCENDING) digit = RADIX_DIGITS - digit - 1; @@ -648,6 +750,10 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; CTA_SYNC(); + if (!Equals>::VALUE) + { + CallBack(callback); + } // Seed ranks with counter values from previous warps #pragma unroll @@ -655,21 +761,34 @@ public: ranks[ITEM] += *digit_counters[ITEM]; } + template < + typename UnsignedBits, + int KEYS_PER_THREAD, + typename DigitExtractorT> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor) + { + RankKeys(keys, ranks, digit_extractor, + BlockRadixRankEmptyCallback()); + } /** * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT, + typename CountsCallback> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + DigitExtractorT digit_extractor, ///< [in] The digit extractor + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + CountsCallback callback) { - RankKeys(keys, ranks, current_bit, num_bits); + RankKeys(keys, ranks, digit_extractor, callback); // Get exclusive count for each digit #pragma unroll @@ -686,6 +805,326 @@ public: } } } + + template < + typename UnsignedBits, + int KEYS_PER_THREAD, + typename DigitExtractorT> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix, + BlockRadixRankEmptyCallback()); + } +}; + +enum WarpMatchAlgorithm +{ + WARP_MATCH_ANY, + WARP_MATCH_ATOMIC_OR +}; + +/** + * Radix-rank using matching which computes the counts of keys for each digit + * value early, at the expense of doing more work. This may be useful e.g. for + * decoupled look-back, where it reduces the time other thread blocks need to + * wait for digit counts to become available. + */ +template +struct BlockRadixRankMatchEarlyCounts +{ + // constants + enum + { + BLOCK_THREADS = BLOCK_DIM_X, + RADIX_DIGITS = 1 << RADIX_BITS, + BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, + BINS_TRACKED_PER_THREAD = BINS_PER_THREAD, + FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, + WARP_THREADS = CUB_PTX_WARP_THREADS, + BLOCK_WARPS = BLOCK_THREADS / WARP_THREADS, + WARP_MASK = ~0, + NUM_MATCH_MASKS = MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR ? BLOCK_WARPS : 0, + // Guard against declaring zero-sized array: + MATCH_MASKS_ALLOC_SIZE = NUM_MATCH_MASKS < 1 ? 1 : NUM_MATCH_MASKS, + }; + + // types + typedef cub::BlockScan BlockScan; + + + + // temporary storage + struct TempStorage + { + union + { + int warp_offsets[BLOCK_WARPS][RADIX_DIGITS]; + int warp_histograms[BLOCK_WARPS][RADIX_DIGITS][NUM_PARTS]; + }; + + int match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS]; + + typename BlockScan::TempStorage prefix_tmp; + }; + + TempStorage& temp_storage; + + // internal ranking implementation + template + struct BlockRadixRankMatchInternal + { + TempStorage& s; + DigitExtractorT digit_extractor; + CountsCallback callback; + int warp; + int lane; + + __device__ __forceinline__ int Digit(UnsignedBits key) + { + int digit = digit_extractor.Digit(key); + return IS_DESCENDING ? RADIX_DIGITS - 1 - digit : digit; + } + + __device__ __forceinline__ int ThreadBin(int u) + { + int bin = threadIdx.x * BINS_PER_THREAD + u; + return IS_DESCENDING ? 
RADIX_DIGITS - 1 - bin : bin; + } + + __device__ __forceinline__ + void ComputeHistogramsWarp(UnsignedBits (&keys)[KEYS_PER_THREAD]) + { + //int* warp_offsets = &s.warp_offsets[warp][0]; + int (&warp_histograms)[RADIX_DIGITS][NUM_PARTS] = s.warp_histograms[warp]; + // compute warp-private histograms + #pragma unroll + for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) + { + #pragma unroll + for (int part = 0; part < NUM_PARTS; ++part) + { + warp_histograms[bin][part] = 0; + } + } + if (MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR) + { + int* match_masks = &s.match_masks[warp][0]; + #pragma unroll + for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) + { + match_masks[bin] = 0; + } + } + WARP_SYNC(WARP_MASK); + + // compute private per-part histograms + int part = lane % NUM_PARTS; + #pragma unroll + for (int u = 0; u < KEYS_PER_THREAD; ++u) + { + atomicAdd(&warp_histograms[Digit(keys[u])][part], 1); + } + + // sum different parts; + // no extra work is necessary if NUM_PARTS == 1 + if (NUM_PARTS > 1) + { + WARP_SYNC(WARP_MASK); + // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary + const int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; + int bins[WARP_BINS_PER_THREAD]; + #pragma unroll + for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) + { + int bin = lane + u * WARP_THREADS; + bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum()); + } + CTA_SYNC(); + + // store the resulting histogram in shared memory + int* warp_offsets = &s.warp_offsets[warp][0]; + #pragma unroll + for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) + { + int bin = lane + u * WARP_THREADS; + warp_offsets[bin] = bins[u]; + } + } + } + + __device__ __forceinline__ + void ComputeOffsetsWarpUpsweep(int (&bins)[BINS_PER_THREAD]) + { + // sum up warp-private histograms + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + bins[u] = 0; + int bin = ThreadBin(u); + if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) + { + #pragma unroll + for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) + { + int warp_offset = s.warp_offsets[j_warp][bin]; + s.warp_offsets[j_warp][bin] = bins[u]; + bins[u] += warp_offset; + } + } + } + } + + __device__ __forceinline__ + void ComputeOffsetsWarpDownsweep(int (&offsets)[BINS_PER_THREAD]) + { + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + int bin = ThreadBin(u); + if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) + { + int digit_offset = offsets[u]; + #pragma unroll + for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) + { + s.warp_offsets[j_warp][bin] += digit_offset; + } + } + } + } + + __device__ __forceinline__ + void ComputeRanksItem( + UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], + Int2Type) + { + // compute key ranks + int lane_mask = 1 << lane; + int* warp_offsets = &s.warp_offsets[warp][0]; + int* match_masks = &s.match_masks[warp][0]; + #pragma unroll + for (int u = 0; u < KEYS_PER_THREAD; ++u) + { + int bin = Digit(keys[u]); + int* p_match_mask = &match_masks[bin]; + atomicOr(p_match_mask, lane_mask); + WARP_SYNC(WARP_MASK); + int bin_mask = *p_match_mask; + int leader = (WARP_THREADS - 1) - __clz(bin_mask); + int warp_offset = 0; + int popc = __popc(bin_mask & LaneMaskLe()); + if (lane == leader) + { + // atomic is a bit faster + warp_offset = atomicAdd(&warp_offsets[bin], popc); + } + warp_offset = SHFL_IDX_SYNC(warp_offset, leader, bin_mask); + if (lane == leader) *p_match_mask = 0; + WARP_SYNC(WARP_MASK); + ranks[u] = warp_offset + popc - 1; + } + } + + __device__ 
__forceinline__ + void ComputeRanksItem( + UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], + Int2Type) + { + // compute key ranks + int* warp_offsets = &s.warp_offsets[warp][0]; + #pragma unroll + for (int u = 0; u < KEYS_PER_THREAD; ++u) + { + int bin = Digit(keys[u]); + int bin_mask = MatchAny(bin); + int leader = (WARP_THREADS - 1) - __clz(bin_mask); + int warp_offset = 0; + int popc = __popc(bin_mask & LaneMaskLe()); + if (lane == leader) + { + // atomic is a bit faster + warp_offset = atomicAdd(&warp_offsets[bin], popc); + } + warp_offset = SHFL_IDX_SYNC(warp_offset, leader, bin_mask); + ranks[u] = warp_offset + popc - 1; + } + } + + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + int (&exclusive_digit_prefix)[BINS_PER_THREAD]) + { + ComputeHistogramsWarp(keys); + + CTA_SYNC(); + int bins[BINS_PER_THREAD]; + ComputeOffsetsWarpUpsweep(bins); + callback(bins); + + BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); + + ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); + CTA_SYNC(); + ComputeRanksItem(keys, ranks, Int2Type()); + } + + __device__ __forceinline__ BlockRadixRankMatchInternal + (TempStorage& temp_storage, DigitExtractorT digit_extractor, CountsCallback callback) + : s(temp_storage), digit_extractor(digit_extractor), + callback(callback), warp(threadIdx.x / WARP_THREADS), lane(LaneId()) + {} + }; + + __device__ __forceinline__ BlockRadixRankMatchEarlyCounts + (TempStorage& temp_storage) : temp_storage(temp_storage) {} + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_PER_THREAD], + CountsCallback callback) + { + BlockRadixRankMatchInternal + internal(temp_storage, digit_extractor, callback); + internal.RankKeys(keys, ranks, exclusive_digit_prefix); + } + + template + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_PER_THREAD]) + { + typedef BlockRadixRankEmptyCallback CountsCallback; + BlockRadixRankMatchInternal + internal(temp_storage, digit_extractor, CountsCallback()); + internal.RankKeys(keys, ranks, exclusive_digit_prefix); + } + + template + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor) + { + int exclusive_digit_prefix[BINS_PER_THREAD]; + RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix); + } }; diff --git a/src/3rdparty/cub/block/block_radix_sort.cuh b/src/3rdparty/cub/block/block_radix_sort.cuh index e666902..213b7f2 100644 --- a/src/3rdparty/cub/block/block_radix_sort.cuh +++ b/src/3rdparty/cub/block/block_radix_sort.cuh @@ -36,6 +36,7 @@ #include "block_exchange.cuh" #include "block_radix_rank.cuh" +#include "radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -63,21 +64,60 @@ namespace cub { * \tparam PTX_ARCH [optional] \ptxversion * * \par Overview - * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. 
It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - BlockRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Within each key, the implementation treats fixed-length - * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - \rowmajor + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \rowmajor + * + * \par Supported Types + * BlockRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. + * + * \par Floating-Point Special Cases + * + * - Positive and negative zeros are considered equivalent, and will be treated + * as such in the output. + * - No special handling is implemented for NaN values; these are sorted + * according to their bit representations after any transformations. + * + * \par Bitwise Key Transformations + * Although the direct radix sorting method can only be applied to unsigned + * integral types, BlockRadixSort is able to sort signed and floating-point + * types via simple bit-wise transformations that ensure lexicographic key + * ordering. + * + * These transformations must be considered when restricting the + * `[begin_bit, end_bit)` range, as the bitwise transformations will occur + * before the bit-range truncation. + * + * Any transformations applied to the keys prior to sorting are reversed + * while writing to the final output buffer. + * + * \par Type Specific Bitwise Transformations + * To convert the input values into a radix-sortable bitwise representation, + * the following transformations take place prior to sorting: + * + * - For unsigned integral values, the keys are used directly. + * - For signed integral values, the sign bit is inverted. + * - For positive floating point values, the sign bit is inverted. + * - For negative floating point values, the full key is inverted. + * + * \par No Descending Sort Transformations + * Unlike `DeviceRadixSort`, `BlockRadixSort` does not invert the input key bits + * when performing a descending sort. Instead, it has special logic to reverse + * the order of the keys while sorting. + * + * \par Stability + * BlockRadixSort is stable. For floating-point types -0.0 and +0.0 + * are considered equal and appear in the result in the same order as they + * appear in the input. 
+ * * * \par Performance Considerations * - \granularity @@ -115,6 +155,13 @@ namespace cub { * corresponding output \p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockRadixSort. */ template < typename KeyT, @@ -175,6 +222,9 @@ private: PTX_ARCH> DescendingBlockRadixRank; + /// Digit extractor type + typedef BFEDigitExtractor DigitExtractorT; + /// BlockExchange utility type for keys typedef BlockExchange BlockExchangeKeys; @@ -216,30 +266,26 @@ private: __device__ __forceinline__ void RankKeys( UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], - int begin_bit, - int pass_bits, + DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( - unsigned_keys, - ranks, - begin_bit, - pass_bits); + unsigned_keys, + ranks, + digit_extractor); } /// Rank keys (specialized for descending sort) __device__ __forceinline__ void RankKeys( UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], - int begin_bit, - int pass_bits, + DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( - unsigned_keys, - ranks, - begin_bit, - pass_bits); + unsigned_keys, + ranks, + digit_extractor); } /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) @@ -301,10 +347,11 @@ private: while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + DigitExtractorT digit_extractor(begin_bit, pass_bits); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); @@ -357,10 +404,11 @@ public: while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + DigitExtractorT digit_extractor(begin_bit, pass_bits); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); diff --git a/src/3rdparty/cub/block/block_scan.cuh b/src/3rdparty/cub/block/block_scan.cuh index 513ef35..208d5f8 100644 --- a/src/3rdparty/cub/block/block_scan.cuh +++ b/src/3rdparty/cub/block/block_scan.cuh @@ -181,6 +181,13 @@ enum BlockScanAlgorithm * The corresponding output \p thread_data in those threads will be * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockScan. 
*/ template < typename T, diff --git a/src/3rdparty/cub/block/block_store.cuh b/src/3rdparty/cub/block/block_store.cuh index 495a155..df654ea 100644 --- a/src/3rdparty/cub/block/block_store.cuh +++ b/src/3rdparty/cub/block/block_store.cuh @@ -34,6 +34,7 @@ #pragma once #include +#include #include "block_exchange.cuh" #include "../config.cuh" @@ -364,6 +365,17 @@ enum BlockStoreAlgorithm */ BLOCK_STORE_DIRECT, + /** + * \par Overview + * A [striped arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_STRIPED, + /** * \par Overview * @@ -432,7 +444,6 @@ enum BlockStoreAlgorithm * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. */ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - }; @@ -445,7 +456,6 @@ enum BlockStoreAlgorithm * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam PTX_ARCH [optional] \ptxversion @@ -457,6 +467,8 @@ enum BlockStoreAlgorithm * - BlockStore can be optionally specialized by different data movement strategies: * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3) + * of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is written directly to memory using CUDA's built-in vectorized stores as a * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) @@ -466,6 +478,10 @@ enum BlockStoreAlgorithm * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. To reduce the shared memory requireent, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. [More...](\ref cub::BlockStoreAlgorithm) * - \rowmajor * * \par A Simple Example @@ -502,6 +518,13 @@ enum BlockStoreAlgorithm * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... 
* + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockStore. */ template < typename T, @@ -576,6 +599,47 @@ private: }; + /** + * BLOCK_STORE_STRIPED specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectStriped(linear_tid, block_itr, items, valid_items); + } + }; + + /** * BLOCK_STORE_VECTORIZE specialization of store helper */ @@ -697,7 +761,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -765,7 +829,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -820,6 +884,7 @@ private: } }; + /****************************************************************************** * Type definitions ******************************************************************************/ @@ -993,6 +1058,16 @@ public: } }; +template ::value_type> +struct BlockStoreType +{ + using type = cub::BlockStore; +}; } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/src/3rdparty/cub/block/specializations/block_scan_raking.cuh b/src/3rdparty/cub/block/specializations/block_scan_raking.cuh index 1d6c2f7..8f20818 100644 --- a/src/3rdparty/cub/block/specializations/block_scan_raking.cuh +++ b/src/3rdparty/cub/block/specializations/block_scan_raking.cuh @@ -84,7 +84,7 @@ struct BlockScanRaking SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + WARP_SYNCHRONOUS = (int(BLOCK_THREADS) == int(RAKING_THREADS)), }; /// WarpScan utility type diff --git a/src/3rdparty/cub/device/device_segmented_radix_sort.cuh b/src/3rdparty/cub/device/device_segmented_radix_sort.cuh index 2ab2a7d..401bc1f 100644 --- 
a/src/3rdparty/cub/device/device_segmented_radix_sort.cuh +++ b/src/3rdparty/cub/device/device_segmented_radix_sort.cuh @@ -60,13 +60,9 @@ namespace cub { * of the symbolic alphabet, the radix sorting method produces a lexicographic * ordering of those keys. * - * \par - * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Although the direct radix sorting - * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. + * \par See Also + * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See + * that algorithm's documentation for more information. * * \par Usage Considerations * \cdp_class{DeviceSegmentedRadixSort} @@ -130,14 +126,16 @@ struct DeviceSegmentedRadixSort * * \endcode * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -148,8 +146,8 @@ struct DeviceSegmentedRadixSort ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. @@ -161,7 +159,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -241,12 +239,14 @@ struct DeviceSegmentedRadixSort * * \tparam KeyT [inferred] Key type * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -255,8 +255,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
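The SortPairs overload above follows CUB's usual two-phase temporary-storage convention: the first call with a NULL d_temp_storage only writes the required allocation size (as stated in the parameter documentation), and the second call does the actual sorting. A minimal sketch, illustrative only; all device arrays, including the num_segments + 1 offsets in d_offsets, are assumed to be allocated and populated by the caller:

    #include <cub/cub.cuh>

    cudaError_t SegmentedSortPairsExample(
        const int   *d_keys_in,   int   *d_keys_out,
        const float *d_values_in, float *d_values_out,
        int num_items, int num_segments, const int *d_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // First call: d_temp_storage == NULL, so only the required size is computed.
        cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out,
            num_items, num_segments, d_offsets, d_offsets + 1);

        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Second call: performs the segmented ascending sort of (key, value) pairs.
        cudaError_t error = cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out,
            num_items, num_segments, d_offsets, d_offsets + 1);

        cudaFree(d_temp_storage);
        return error;
    }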
@@ -265,7 +265,7 @@ struct DeviceSegmentedRadixSort // Signed integer type for global offsets typedef int OffsetT; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -334,12 +334,14 @@ struct DeviceSegmentedRadixSort * * \tparam KeyT [inferred] Key type * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -350,8 +352,8 @@ struct DeviceSegmentedRadixSort ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
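For the DoubleBuffer overload above, the caller supplies two device buffers per sequence and, upon return, each DoubleBuffer is updated so that Current() references the sorted output (either buffer of a pair may have been written to). A sketch under the same assumptions as before, with illustrative buffer names:

    cudaError_t SegmentedSortPairsDoubleBuffer(
        int   *d_key_buf,   int   *d_key_alt_buf,
        float *d_value_buf, float *d_value_alt_buf,
        int num_items, int num_segments, const int *d_offsets)
    {
        cub::DoubleBuffer<int>   d_keys(d_key_buf, d_key_alt_buf);
        cub::DoubleBuffer<float> d_values(d_value_buf, d_value_alt_buf);

        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query, then allocation, then the actual sort.
        cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes, d_keys, d_values,
            num_items, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        cudaError_t error = cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes, d_keys, d_values,
            num_items, num_segments, d_offsets, d_offsets + 1);

        // d_keys.Current() / d_values.Current() now reference the sorted sequences.
        cudaFree(d_temp_storage);
        return error;
    }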
@@ -363,7 +365,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -443,12 +445,14 @@ struct DeviceSegmentedRadixSort * * \tparam KeyT [inferred] Key type * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -457,8 +461,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
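One motivation for splitting OffsetIteratorT into BeginOffsetIteratorT and EndOffsetIteratorT, as in the overload above, is that the two offset sequences no longer need to share a type; for example, ending offsets can be computed on the fly instead of being materialized. A hedged sketch assuming fixed-length segments of seg_len keys each (names are illustrative):

    #include <cub/cub.cuh>
    #include <thrust/iterator/transform_iterator.h>

    // Functor that derives a segment's ending offset from its beginning offset.
    struct AddSegLen
    {
        int seg_len;
        __host__ __device__ int operator()(int begin) const { return begin + seg_len; }
    };

    cudaError_t SortPairsDescendingFixedLenSegments(
        void *d_temp_storage, size_t &temp_storage_bytes,
        const unsigned int *d_keys_in, unsigned int *d_keys_out,
        const int *d_values_in, int *d_values_out,
        int num_items, int num_segments, int *d_begin_offsets, int seg_len)
    {
        // Ending offsets are generated lazily from the beginning offsets.
        auto d_end_offsets =
            thrust::make_transform_iterator(d_begin_offsets, AddSegLen{seg_len});

        // BeginOffsetIteratorT deduces to int*, EndOffsetIteratorT to the transform
        // iterator type -- the two template parameters no longer have to match.
        // The usual two-phase temporary-storage pattern applies to this call as well.
        return cub::DeviceSegmentedRadixSort::SortPairsDescending(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out,
            num_items, num_segments, d_begin_offsets, d_end_offsets);
    }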
@@ -467,7 +471,7 @@ struct DeviceSegmentedRadixSort // Signed integer type for global offsets typedef int OffsetT; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -536,11 +540,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -549,8 +555,8 @@ struct DeviceSegmentedRadixSort KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
@@ -563,7 +569,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -635,11 +641,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -647,8 +655,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
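The optional begin_bit/end_bit parameters documented in the overloads above restrict sorting to a sub-range of key bits, skipping radix passes over bits that are known not to differ. A sketch assuming unsigned int keys whose meaningful bits lie in positions [0, 20); d_temp_storage and temp_storage_bytes are assumed to be managed by the caller with the usual two-phase pattern:

    cudaError_t SegmentedSortKeysLow20Bits(
        void *d_temp_storage, size_t &temp_storage_bytes,
        cub::DoubleBuffer<unsigned int> &d_keys,
        int num_items, int num_segments, const int *d_offsets)
    {
        // begin_bit is inclusive, end_bit is exclusive: only bits [0, 20) are sorted.
        return cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes, d_keys,
            num_items, num_segments, d_offsets, d_offsets + 1,
            0 /* begin_bit */, 20 /* end_bit */);
    }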
@@ -660,7 +668,7 @@ struct DeviceSegmentedRadixSort // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -724,11 +732,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -737,8 +747,8 @@ struct DeviceSegmentedRadixSort KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
@@ -750,7 +760,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -822,11 +832,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -834,8 +846,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
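The empty-segment rule repeated in the offset parameters above ("If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty") is easiest to see with concrete, made-up offsets:

    // Eight keys split into four segments, the third of which is empty:
    //   segment 0 -> items [0, 3)
    //   segment 1 -> items [3, 7)
    //   segment 2 -> items [7, 7)   (empty)
    //   segment 3 -> items [7, 8)
    int h_offsets[] = { 0, 3, 7, 7, 8 };   // num_segments + 1 = 5 entries

    // With d_begin_offsets = offsets and d_end_offsets = offsets + 1, segment 2
    // satisfies d_end_offsets[2] - 1 <= d_begin_offsets[2] (6 <= 7), so it is
    // treated as empty and contributes no items to the sort.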
@@ -847,7 +859,7 @@ struct DeviceSegmentedRadixSort // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, diff --git a/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh b/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh index 2b0919f..263d610 100644 --- a/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh @@ -37,6 +37,8 @@ #include #include +#include "../../agent/agent_radix_sort_histogram.cuh" +#include "../../agent/agent_radix_sort_onesweep.cuh" #include "../../agent/agent_radix_sort_upsweep.cuh" #include "../../agent/agent_radix_sort_downsweep.cuh" #include "../../agent/agent_scan.cuh" @@ -46,15 +48,21 @@ #include "../../util_type.cuh" #include "../../util_debug.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include -/// Optional outer namespace(s) -CUB_NS_PREFIX +// suppress warnings triggered by #pragma unroll: +// "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]" +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wpass-failed" +#endif /// CUB namespace namespace cub { + /****************************************************************************** * Kernel entry points *****************************************************************************/ @@ -69,8 +77,8 @@ template < typename KeyT, ///< Key type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) + int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : + int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortUpsweepKernel( const KeyT *d_keys, ///< [in] Input keys buffer OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) @@ -156,6 +164,13 @@ __global__ void RadixSortScanBinsKernel( block_scan.template ConsumeTile(block_offset, prefix_op); block_offset += AgentScanT::TILE_ITEMS; } + + // Process the remaining partial tile (if any). + if (block_offset < num_counts) + { + block_scan.template ConsumeTile(block_offset, prefix_op, + num_counts - block_offset); + } } @@ -170,8 +185,8 @@ template < typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? 
- ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) + int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : + int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortDownsweepKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer @@ -345,7 +360,8 @@ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator + typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : @@ -355,8 +371,8 @@ __global__ void DeviceSegmentedRadixSortKernel( KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer ValueT *d_values_out, ///< [in] Output values buffer - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data int current_bit, ///< [in] Bit position of current radix digit int pass_bits) ///< [in] Number of bits of current radix digit @@ -498,6 +514,96 @@ __global__ void DeviceSegmentedRadixSortKernel( } +/****************************************************************************** + * Onesweep kernels + ******************************************************************************/ + +/** + * Kernel for computing multiple histograms + */ + +/** + * Histogram kernel + */ +template < + typename ChainedPolicyT, + bool IS_DESCENDING, + typename KeyT, + typename OffsetT> +__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) +DeviceRadixSortHistogramKernel + (OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit) +{ + typedef typename ChainedPolicyT::ActivePolicy::HistogramPolicy HistogramPolicyT; + typedef AgentRadixSortHistogram AgentT; + __shared__ typename AgentT::TempStorage temp_storage; + AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit); + agent.Process(); +} + +template < + typename ChainedPolicyT, + bool IS_DESCENDING, + typename KeyT, + typename ValueT, + typename OffsetT, + typename AtomicOffsetT = OffsetT> +__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) +DeviceRadixSortOnesweepKernel + (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, + const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, + const ValueT* d_values_in, OffsetT num_items, int current_bit, int num_bits) +{ + typedef typename ChainedPolicyT::ActivePolicy::OnesweepPolicy OnesweepPolicyT; + typedef AgentRadixSortOnesweep AgentT; + __shared__ typename AgentT::TempStorage s; + + AgentT agent(s, d_lookback, d_ctrs, d_bins_out, d_bins_in, d_keys_out, d_keys_in, + d_values_out, d_values_in, num_items, current_bit, num_bits); + agent.Process(); +} + + +/** + * Exclusive sum kernel + */ +template < + typename ChainedPolicyT, + typename OffsetT> +__global__ void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) +{ + typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT; + const int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS; + const int RADIX_DIGITS = 1 << RADIX_BITS; + const int BLOCK_THREADS = ExclusiveSumPolicyT::BLOCK_THREADS; + const int BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS; + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + // load the bins + OffsetT bins[BINS_PER_THREAD]; + int bin_start = blockIdx.x * RADIX_DIGITS; + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + int bin = threadIdx.x * BINS_PER_THREAD + u; + if (bin >= RADIX_DIGITS) break; + bins[u] = d_bins[bin_start + bin]; + } + + // compute offsets + BlockScan(temp_storage).ExclusiveSum(bins, bins); + + // store the offsets + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + int bin = threadIdx.x * BINS_PER_THREAD + u; + if (bin >= RADIX_DIGITS) break; + d_bins[bin_start + bin] = bins[u]; + } +} + /****************************************************************************** * Policy @@ -529,102 +635,25 @@ struct DeviceRadixSortPolicy // Architecture-specific tuning policies //------------------------------------------------------------------------------ - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy200> - { 
- enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - - // Relative size of KeyT type to a 4-byte word - SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <512, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + ONESWEEP = false, + ONESWEEP_RADIX_BITS = 8, }; - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) - }; + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, + ONESWEEP_RADIX_BITS> OnesweepPolicy; // Scan policy typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; @@ -663,8 +692,21 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.1B 32b segmented keys/s (TitanX) + ONESWEEP = false, + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, + ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -692,8 +734,21 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 10.0B 32b keys/s (GP100, 64M random keys) + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -722,8 +777,21 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -750,8 +818,21 @@ struct DeviceRadixSortPolicy enum { PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -779,8 +860,65 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 8.7B 32b segmented keys/s (GV100) + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 15.8B 32b keys/s (V100-SXM2, 64M random keys) + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, + sizeof(KeyT) == 4 && sizeof(ValueT) == 4 ? 46 : 23, DominantT, 4, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + + + // ScanPolicy + typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <(sizeof(KeyT) > 1) ? 
256 : 128, 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy <256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// SM80 + struct Policy800 : ChainedPolicy<800, Policy800, Policy700> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, + }; + + // Histogram policy + typedef AgentRadixSortHistogramPolicy <128, 16, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <384, 21, DominantT, 1, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -802,7 +940,7 @@ struct DeviceRadixSortPolicy /// MaxPolicy - typedef Policy700 MaxPolicy; + typedef Policy800 MaxPolicy; }; @@ -968,7 +1106,7 @@ struct DispatchRadixSort : const ValueT *d_values_in, ValueT *d_values_out, OffsetT *d_spine, - int spine_length, + int /*spine_length*/, int ¤t_bit, PassConfigT &pass_config) { @@ -983,6 +1121,9 @@ struct DispatchRadixSort : pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + // Spine length written by the upsweep kernel in the current pass. 
+ int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits; + // Invoke upsweep_kernel with same grid size as downsweep_kernel thrust::cuda_cub::launcher::triple_chevron( pass_config.even_share.grid_size, @@ -1010,7 +1151,7 @@ struct DispatchRadixSort : 1, pass_config.scan_config.block_threads, 0, stream ).doit(pass_config.scan_kernel, d_spine, - spine_length); + pass_spine_length); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; @@ -1084,7 +1225,7 @@ struct DispatchRadixSort : DownsweepKernelT downsweep_kernel, int ptx_version, int sm_count, - int num_items) + OffsetT num_items) { cudaError error = cudaSuccess; do @@ -1113,6 +1254,137 @@ struct DispatchRadixSort : }; + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeOnesweep() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef OffsetT AtomicOffsetT; + + // compute temporary storage size + const int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS; + const int RADIX_DIGITS = 1 << RADIX_BITS; + const int ONESWEEP_ITEMS_PER_THREAD = ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD; + const int ONESWEEP_BLOCK_THREADS = ActivePolicyT::OnesweepPolicy::BLOCK_THREADS; + const int ONESWEEP_TILE_ITEMS = ONESWEEP_ITEMS_PER_THREAD * ONESWEEP_BLOCK_THREADS; + // parts handle inputs with >=2**30 elements, due to the way lookback works + // for testing purposes, one part is <= 2**28 elements + const int PART_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; + int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS); + int num_parts = static_cast(cub::DivideAndRoundUp(num_items, PART_SIZE)); + OffsetT max_num_blocks = cub::DivideAndRoundUp(CUB_MIN(num_items, PART_SIZE), ONESWEEP_TILE_ITEMS); + + size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT); + size_t allocation_sizes[] = + { + // bins + num_parts * num_passes * RADIX_DIGITS * sizeof(OffsetT), + // lookback + max_num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), + // extra key buffer + is_overwrite_okay || num_passes <= 1 ? 0 : num_items * sizeof(KeyT), + // extra value buffer + is_overwrite_okay || num_passes <= 1 ? 
0 : num_items * value_size, + // counters + num_parts * num_passes * sizeof(AtomicOffsetT), + }; + const int NUM_ALLOCATIONS = sizeof(allocation_sizes) / sizeof(allocation_sizes[0]); + void* allocations[NUM_ALLOCATIONS] = {}; + AliasTemporaries(d_temp_storage, temp_storage_bytes, + allocations, allocation_sizes); + + // just return if no temporary storage is provided + cudaError_t error = cudaSuccess; + if (d_temp_storage == NULL) return error; + + OffsetT* d_bins = (OffsetT*)allocations[0]; + AtomicOffsetT* d_lookback = (AtomicOffsetT*)allocations[1]; + KeyT* d_keys_tmp2 = (KeyT*)allocations[2]; + ValueT* d_values_tmp2 = (ValueT*)allocations[3]; + AtomicOffsetT* d_ctrs = (AtomicOffsetT*)allocations[4]; + + do { + // initialization + if (CubDebug(error = cudaMemsetAsync( + d_ctrs, 0, num_parts * num_passes * sizeof(AtomicOffsetT), stream))) break; + + // compute num_passes histograms with RADIX_DIGITS bins each + if (CubDebug(error = cudaMemsetAsync + (d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream))) break; + int device = -1; + int num_sms = 0; + if (CubDebug(error = cudaGetDevice(&device))) break; + if (CubDebug(error = cudaDeviceGetAttribute( + &num_sms, cudaDevAttrMultiProcessorCount, device))) break; + + const int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; + int histo_blocks_per_sm = 1; + auto histogram_kernel = DeviceRadixSortHistogramKernel< + MaxPolicyT, IS_DESCENDING, KeyT, OffsetT>; + if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break; + histogram_kernel<<>> + (d_bins, d_keys.Current(), num_items, begin_bit, end_bit); + if (CubDebug(error = cudaPeekAtLastError())) break; + + // exclusive sums to determine starts + const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; + DeviceRadixSortExclusiveSumKernel + <<>>(d_bins); + if (CubDebug(error = cudaPeekAtLastError())) break; + + // use the other buffer if no overwrite is allowed + KeyT* d_keys_tmp = d_keys.Alternate(); + ValueT* d_values_tmp = d_values.Alternate(); + if (!is_overwrite_okay && num_passes % 2 == 0) + { + d_keys.d_buffers[1] = d_keys_tmp2; + d_values.d_buffers[1] = d_values_tmp2; + } + + for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; + current_bit += RADIX_BITS, ++pass) + { + int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS); + for (int part = 0; part < num_parts; ++part) + { + int part_num_items = CUB_MIN(num_items - part * PART_SIZE, PART_SIZE); + int num_blocks = cub::DivideAndRoundUp(part_num_items, ONESWEEP_TILE_ITEMS); + if (CubDebug(error = cudaMemsetAsync( + d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), + stream))) break; + auto onesweep_kernel = DeviceRadixSortOnesweepKernel< + MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>; + onesweep_kernel<<>> + (d_lookback, d_ctrs + part * num_passes + pass, + part < num_parts - 1 ? + d_bins + ((part + 1) * num_passes + pass) * RADIX_DIGITS : NULL, + d_bins + (part * num_passes + pass) * RADIX_DIGITS, + d_keys.Alternate(), + d_keys.Current() + part * PART_SIZE, + d_values.Alternate(), + d_values.Current() + part * PART_SIZE, + part_num_items, current_bit, num_bits); + if (CubDebug(error = cudaPeekAtLastError())) break; + } + + // use the temporary buffers if no overwrite is allowed + if (!is_overwrite_okay && pass == 0) + { + d_keys = num_passes % 2 == 0 ? 
+ DoubleBuffer(d_keys_tmp, d_keys_tmp2) : + DoubleBuffer(d_keys_tmp2, d_keys_tmp); + d_values = num_passes % 2 == 0 ? + DoubleBuffer(d_values_tmp, d_values_tmp2) : + DoubleBuffer(d_values_tmp2, d_values_tmp); + } + d_keys.selector ^= 1; + d_values.selector ^= 1; + } + } while (0); + + return error; + } /// Invocation (run multiple digit passes) template < @@ -1152,17 +1424,33 @@ struct DispatchRadixSort : // Init regular and alternate-digit kernel configurations PassConfig pass_config, alt_pass_config; - if ((error = pass_config.template InitPassConfig< - typename ActivePolicyT::UpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::DownsweepPolicy>( - upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; - - if ((error = alt_pass_config.template InitPassConfig< - typename ActivePolicyT::AltUpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::AltDownsweepPolicy>( - alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>(upsweep_kernel, + scan_kernel, + downsweep_kernel, + ptx_version, + sm_count, + num_items); + if (error) + { + break; + } + + error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>(alt_upsweep_kernel, + scan_kernel, + alt_downsweep_kernel, + ptx_version, + sm_count, + num_items); + if (error) + { + break; + } // Get maximum spine length int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); @@ -1186,7 +1474,7 @@ struct DispatchRadixSort : // Pass planning. 
Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; + int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); @@ -1244,6 +1532,28 @@ struct DispatchRadixSort : // Chained policy invocation //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeManyTiles(Int2Type) + { + // Invoke upsweep-downsweep + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeManyTiles(Int2Type) + { + // Invoke onesweep + return InvokeOnesweep(); + } + /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ @@ -1262,12 +1572,7 @@ struct DispatchRadixSort : else { // Regular size - return InvokePasses( - DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, - DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, - RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + return InvokeManyTiles(Int2Type()); } } @@ -1330,7 +1635,8 @@ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator + typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT, ///< Signed integer type for global offsets typename SelectedPolicy = DeviceRadixSortPolicy > struct DispatchSegmentedRadixSort : @@ -1357,8 +1663,8 @@ struct DispatchSegmentedRadixSort : DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values OffsetT num_items; ///< [in] Number of items to sort OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. @@ -1380,8 +1686,8 @@ struct DispatchSegmentedRadixSort : DoubleBuffer &d_values, OffsetT num_items, OffsetT num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, @@ -1601,8 +1907,8 @@ struct DispatchSegmentedRadixSort : // Force kernel code-generation in all compiler passes return InvokePasses( - DeviceSegmentedRadixSortKernel, - DeviceSegmentedRadixSortKernel); + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); } @@ -1620,8 +1926,8 @@ struct DispatchSegmentedRadixSort : DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] Number of items to sort int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers @@ -1658,3 +1964,7 @@ struct DispatchSegmentedRadixSort : CUB_NS_POSTFIX // Optional outer namespace(s) +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + diff --git a/src/3rdparty/cub/grid/grid_even_share.cuh b/src/3rdparty/cub/grid/grid_even_share.cuh index d5f8b34..badbfd6 100644 --- a/src/3rdparty/cub/grid/grid_even_share.cuh +++ b/src/3rdparty/cub/grid/grid_even_share.cuh @@ -37,6 +37,7 @@ #include "../config.cuh" #include "../util_namespace.cuh" #include "../util_macro.cuh" +#include "../util_math.cuh" #include "../util_type.cuh" #include "grid_mapping.cuh" @@ -77,7 +78,7 @@ struct GridEvenShare { private: - OffsetT total_tiles; + int total_tiles; int big_shares; OffsetT big_share_items; OffsetT normal_share_items; @@ -122,17 +123,18 @@ public: * \brief Dispatch initializer. To be called prior prior to kernel launch. */ __host__ __device__ __forceinline__ void DispatchInit( - OffsetT num_items, ///< Total number of input items + OffsetT num_items_, ///< Total number of input items int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) int tile_items) ///< Number of data items per input tile { - this->block_offset = num_items; // Initialize past-the-end - this->block_end = num_items; // Initialize past-the-end - this->num_items = num_items; - this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->block_offset = num_items_; // Initialize past-the-end + this->block_end = num_items_; // Initialize past-the-end + this->num_items = num_items_; + this->total_tiles = static_cast(cub::DivideAndRoundUp(num_items_, tile_items)); this->grid_size = CUB_MIN(total_tiles, max_grid_size); - OffsetT avg_tiles_per_block = total_tiles / grid_size; - this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + int avg_tiles_per_block = total_tiles / grid_size; + // leftover grains go to big blocks: + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); this->normal_share_items = avg_tiles_per_block * tile_items; this->normal_base_offset = big_shares * tile_items; this->big_share_items = normal_share_items + tile_items; diff --git a/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh b/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh index 5219e50..a3d8272 100644 --- a/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh +++ b/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
- * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -66,7 +66,7 @@ namespace cub { * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. * * \par Overview - * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * - CacheModifiedInputIterator is a random-access input iterator that wraps a native * device pointer of type ValueType*. \p ValueType references are * made by reading \p ValueType values through loads modified by \p MODIFIER. * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", @@ -76,7 +76,7 @@ namespace cub { * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * The code snippet below illustrates the use of \p CacheModifiedInputIterator to * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). * \par @@ -117,9 +117,9 @@ public: #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, + typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< + THRUST_NS_QUALIFIER::device_system_tag, + THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category diff --git a/src/3rdparty/cub/util_arch.cuh b/src/3rdparty/cub/util_arch.cuh index 58d0c73..82acfef 100644 --- a/src/3rdparty/cub/util_arch.cuh +++ b/src/3rdparty/cub/util_arch.cuh @@ -45,9 +45,10 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \ - !defined(CUB_USE_COOPERATIVE_GROUPS) - #define CUB_USE_COOPERATIVE_GROUPS +#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__) || \ + CUDA_VERSION >= 9000) && \ + !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS #endif /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are diff --git a/src/3rdparty/cub/util_compiler.cuh b/src/3rdparty/cub/util_compiler.cuh index 9be9492..6239d0e 100644 --- a/src/3rdparty/cub/util_compiler.cuh +++ b/src/3rdparty/cub/util_compiler.cuh @@ -63,7 +63,7 @@ #endif // CUB_HOST_COMPILER // figure out which device compiler we're using -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC diff --git a/src/3rdparty/cub/util_cpp_dialect.cuh b/src/3rdparty/cub/util_cpp_dialect.cuh index b4cbe92..23adf8e 100644 --- a/src/3rdparty/cub/util_cpp_dialect.cuh +++ b/src/3rdparty/cub/util_cpp_dialect.cuh @@ -108,27 +108,43 @@ # define CUB_COMP_DEPR_IMPL1 /* intentionally blank */ #endif -#define CUB_COMPILER_DEPRECATION(REQ, FIX) \ - CUB_COMP_DEPR_IMPL(CUB requires REQ. Please FIX. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) +#define CUB_COMPILER_DEPRECATION(REQ) \ + CUB_COMP_DEPR_IMPL(CUB requires at least REQ. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) 
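The CacheModifiedInputIterator documentation touched above refers to a usage snippet; a condensed version of that pattern, assuming a device array d_in has already been allocated and populated, looks roughly like this:

#include <cub/iterator/cache_modified_input_iterator.cuh>

// Wrap a native device pointer so that dereferences compile to LDG
// (read-only / texture-path) loads instead of ordinary global loads.
__device__ double Sum7(double *d_in)   // d_in: e.g. [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
{
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);

    double sum = 0.0;
    for (int i = 0; i < 7; ++i)
        sum += itr[i];                 // each read goes through the LDG path
    return sum;
}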
+ +#define CUB_COMPILER_DEPRECATION_SOFT(REQ, CUR) \ + CUB_COMP_DEPR_IMPL(CUB requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a future release. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) -// Minimum required compiler checks: #ifndef CUB_IGNORE_DEPRECATED_COMPILER + +// Compiler checks: # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000 - CUB_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler); + CUB_COMPILER_DEPRECATION(GCC 5.0); +# elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 70000 + CUB_COMPILER_DEPRECATION(Clang 7.0); +# elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 + // <2017. Hard upgrade message: + CUB_COMPILER_DEPRECATION(MSVC 2019 (19.20/16.0/14.20)); +# elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1920 + // >=2017, <2019. Soft deprecation message: + CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019 (19.20/16.0/14.20), MSVC 2017); # endif -# if CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 60000 - CUB_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler); -# endif -# if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 - CUB_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler); + +#endif // CUB_IGNORE_DEPRECATED_COMPILER + +#ifndef CUB_IGNORE_DEPRECATED_DIALECT + +// Dialect checks: +# if CUB_CPP_DIALECT < 2011 + // = 2014 +# define CUB_DEPRECATED [[deprecated]] +#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_DEPRECATED __declspec(deprecated) #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG # define CUB_DEPRECATED __attribute__((deprecated)) @@ -43,4 +52,3 @@ #else # define CUB_DEPRECATED #endif - diff --git a/src/3rdparty/cub/util_device.cuh b/src/3rdparty/cub/util_device.cuh index df0ee07..10da757 100644 --- a/src/3rdparty/cub/util_device.cuh +++ b/src/3rdparty/cub/util_device.cuh @@ -33,6 +33,8 @@ #pragma once +#include "detail/device_synchronize.cuh" + #include "util_type.cuh" #include "util_arch.cuh" #include "util_debug.cuh" @@ -121,7 +123,7 @@ __global__ void EmptyKernel(void) { } /** * \brief Returns the current device or -1 if an error occurred. */ -CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice() +CUB_RUNTIME_FUNCTION inline int CurrentDevice() { #if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. @@ -147,14 +149,14 @@ private: int const old_device; bool const needs_reset; public: - __host__ __forceinline__ SwitchDevice(int new_device) + __host__ inline SwitchDevice(int new_device) : old_device(CurrentDevice()), needs_reset(old_device != new_device) { if (needs_reset) CubDebug(cudaSetDevice(new_device)); } - __host__ __forceinline__ ~SwitchDevice() + __host__ inline ~SwitchDevice() { if (needs_reset) CubDebug(cudaSetDevice(old_device)); @@ -165,7 +167,7 @@ public: * \brief Returns the number of CUDA devices available or -1 if an error * occurred. */ -CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCountUncached() +CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() { #if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. @@ -198,11 +200,22 @@ struct ValueCache * \brief Call the nullary function to produce the value and construct the * cache. 
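SwitchDevice, used by the per-device helpers above, is a small RAII guard: it sets the requested device on construction and restores the previous one on scope exit. A self-contained mirror of that pattern is sketched below; the name ScopedDevice and the usage comment are assumptions, and the real class additionally routes errors through CubDebug.

#include <cuda_runtime.h>

struct ScopedDevice
{
    int  old_device  = -1;
    bool needs_reset = false;

    explicit ScopedDevice(int new_device)
    {
        cudaGetDevice(&old_device);            // what CurrentDevice() wraps
        needs_reset = (old_device != new_device);
        if (needs_reset)
            cudaSetDevice(new_device);
    }

    ~ScopedDevice()
    {
        if (needs_reset)
            cudaSetDevice(old_device);         // restore the caller's device on scope exit
    }
};

// Usage: temporarily operate on device 1, then fall back automatically.
// {
//     ScopedDevice guard(1);
//     cudaMemGetInfo(&free_bytes, &total_bytes);
// }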
*/ - __host__ __forceinline__ ValueCache() : value(Function()) {} + __host__ inline ValueCache() : value(Function()) {} }; #endif +#if CUB_CPP_DIALECT >= 2011 +// Host code, only safely usable in C++11 or newer, where thread-safe +// initialization of static locals is guaranteed. This is a separate function +// to avoid defining a local static in a host/device function. +__host__ inline int DeviceCountCachedValue() +{ + static ValueCache cache; + return cache.value; +} +#endif + /** * \brief Returns the number of CUDA devices available. * @@ -210,17 +223,14 @@ struct ValueCache * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCount() +CUB_RUNTIME_FUNCTION inline int DeviceCount() { int result = -1; if (CUB_IS_HOST_CODE) { #if CUB_INCLUDE_HOST_CODE #if CUB_CPP_DIALECT >= 2011 // Host code and C++11. - // C++11 guarantees that initialization of static locals is thread safe. - static ValueCache cache; - - result = cache.value; + result = DeviceCountCachedValue(); #else // Host code and C++98. result = DeviceCountUncached(); @@ -273,7 +283,7 @@ public: /** * \brief Construct the cache. */ - __host__ __forceinline__ PerDeviceAttributeCache() : entries_() + __host__ inline PerDeviceAttributeCache() : entries_() { assert(DeviceCount() <= CUB_MAX_DEVICES); } @@ -312,7 +322,8 @@ public: // We don't use `CubDebug` here because we let the user code // decide whether or not errors are hard errors. - if (payload.error = std::forward(f)(payload.attribute)) + payload.error = std::forward(f)(payload.attribute); + if (payload.error) // Clear the global CUDA error state which may have been // set by the last call. Otherwise, errors may "leak" to // unrelated kernel launches. @@ -350,7 +361,7 @@ public: /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version) +CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) { // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. @@ -390,15 +401,16 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_ver /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). */ -__host__ __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version, int device) +__host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device) { SwitchDevice sd(device); + (void)sd; return PtxVersionUncached(ptx_version); } #if CUB_CPP_DIALECT >= 2011 // C++11 and later. template -__host__ __forceinline__ PerDeviceAttributeCache& GetPerDeviceAttributeCache() +__host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { // C++11 guarantees that initialization of static locals is thread safe. static PerDeviceAttributeCache cache; @@ -416,7 +428,7 @@ struct SmVersionCacheTag {}; * * \note This function is thread safe. */ -__host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device) +__host__ inline cudaError_t PtxVersion(int& ptx_version, int device) { #if CUB_CPP_DIALECT >= 2011 // C++11 and later. @@ -445,7 +457,7 @@ __host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device) * * \note This function is thread safe. 
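DeviceCountCachedValue, added above, leans on the C++11 guarantee that a function-local static is initialized exactly once even under concurrent callers, which is why the static was hoisted out of the host/device DeviceCount into a host-only helper. A stand-alone illustration of the same pattern (CachedDeviceCount is an assumed name, not CUB API):

#include <cuda_runtime.h>

inline int CachedDeviceCount()
{
    // The lambda runs exactly once, on the first call from any host thread;
    // subsequent calls simply read the already-initialized value.
    static const int count = [] {
        int n = 0;
        return (cudaGetDeviceCount(&n) == cudaSuccess) ? n : -1;
    }();
    return count;
}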
*/ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version) +CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) { cudaError_t result = cudaErrorUnknown; if (CUB_IS_HOST_CODE) { @@ -481,7 +493,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version) /** * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) +CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { #if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. @@ -515,7 +527,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_versi * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) +CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) { cudaError_t result = cudaErrorUnknown; if (CUB_IS_HOST_CODE) { @@ -548,7 +560,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int /** * Synchronize the specified \p stream. */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) +CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { cudaError_t result = cudaErrorUnknown; if (CUB_IS_HOST_CODE) { @@ -560,7 +572,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. (void)stream; // Device can't yet sync on a specific stream - result = CubDebug(cudaDeviceSynchronize()); + result = CubDebug(cub::detail::device_synchronize()); #else // Device code without the CUDA runtime. (void)stream; // CUDA API calls are not supported from this device. @@ -604,7 +616,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) * */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy( int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy diff --git a/src/3rdparty/cub/util_macro.cuh b/src/3rdparty/cub/util_macro.cuh index ff86365..697944f 100644 --- a/src/3rdparty/cub/util_macro.cuh +++ b/src/3rdparty/cub/util_macro.cuh @@ -34,6 +34,8 @@ #include "util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -56,6 +58,24 @@ namespace cub { #endif #endif +#define CUB_PREVENT_MACRO_SUBSTITUTION + +template +constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, + U &&u) + -> decltype(t < u ? std::forward(t) : std::forward(u)) +{ + return t < u ? std::forward(t) : std::forward(u); +} + +template +constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, + U &&u) + -> decltype(t < u ? std::forward(u) : std::forward(t)) +{ + return t < u ? std::forward(u) : std::forward(t); +} + #ifndef CUB_MAX /// Select maximum(a, b) #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) diff --git a/src/3rdparty/cub/util_namespace.cuh b/src/3rdparty/cub/util_namespace.cuh index 4488d97..2a1bb38 100644 --- a/src/3rdparty/cub/util_namespace.cuh +++ b/src/3rdparty/cub/util_namespace.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. 
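In the util_macro.cuh hunk above, CUB_PREVENT_MACRO_SUBSTITUTION expands to nothing; its only job is to sit between the function name and the parameter list so that a function-like min/max macro (for example from <windows.h>) cannot fire at the definition. Callers can sidestep the same macros by parenthesizing the name, as in this small sketch (the include path assumes a conventional CUB install):

#include <cub/util_macro.cuh>

int SmallerOfTwo()
{
    // Parenthesizing the name suppresses function-like macro expansion,
    // mirroring the trick used in the definitions above.
    return (cub::min)(3, 7);   // -> 3; likewise (cub::max)(3, 7) -> 7
}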
All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,27 +27,110 @@ ******************************************************************************/ /** - * \file - * Place-holder for prefixing the cub namespace + * \file util_namespace.cuh + * \brief Utilities that allow `cub::` to be placed inside an + * application-specific namespace. */ + #pragma once +// This is not used by this file; this is a hack so that we can detect the +// CUB version from Thrust on older versions of CUB that did not have +// version.cuh. #include "version.cuh" -// For example: -//#define CUB_NS_PREFIX namespace thrust{ namespace detail { -//#define CUB_NS_POSTFIX } } +// Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users +// that they must now define the qualifier macro, too. +#if (defined(CUB_NS_PREFIX) || defined(CUB_NS_POSTFIX)) && !defined(CUB_NS_QUALIFIER) +#error CUB requires a definition of CUB_NS_QUALIFIER when CUB_NS_PREFIX/POSTFIX are defined. +#endif + +/** + * \def THRUST_CUB_WRAPPED_NAMESPACE + * If defined, this value will be used as the name of a namespace that wraps the + * `thrust::` and `cub::` namespaces. + * This macro should not be used with any other CUB namespace macros. + */ +#ifdef THRUST_CUB_WRAPPED_NAMESPACE +#define CUB_WRAPPED_NAMESPACE THRUST_CUB_WRAPPED_NAMESPACE +#endif + +/** + * \def CUB_WRAPPED_NAMESPACE + * If defined, this value will be used as the name of a namespace that wraps the + * `cub::` namespace. + * If THRUST_CUB_WRAPPED_NAMESPACE is set, this will inherit that macro's value. + * This macro should not be used with any other CUB namespace macros. + */ +#ifdef CUB_WRAPPED_NAMESPACE +#define CUB_NS_PREFIX \ + namespace CUB_WRAPPED_NAMESPACE \ + { +#define CUB_NS_POSTFIX } + +#define CUB_NS_QUALIFIER ::CUB_WRAPPED_NAMESPACE::cub +#endif + +/** + * \def CUB_NS_PREFIX + * This macro is inserted prior to all `namespace cub { ... }` blocks. It is + * derived from CUB_WRAPPED_NAMESPACE, if set, and will be empty otherwise. + * It may be defined by users, in which case CUB_NS_PREFIX, + * CUB_NS_POSTFIX, and CUB_NS_QUALIFIER must all be set consistently. + */ #ifndef CUB_NS_PREFIX #define CUB_NS_PREFIX #endif +/** + * \def CUB_NS_POSTFIX + * This macro is inserted following the closing braces of all + * `namespace cub { ... }` block. It is defined appropriately when + * CUB_WRAPPED_NAMESPACE is set, and will be empty otherwise. It may be + * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and + * CUB_NS_QUALIFIER must all be set consistently. + */ #ifndef CUB_NS_POSTFIX #define CUB_NS_POSTFIX #endif +/** + * \def CUB_NS_QUALIFIER + * This macro is used to qualify members of cub:: when accessing them from + * outside of their namespace. By default, this is just `::cub`, and will be + * set appropriately when CUB_WRAPPED_NAMESPACE is defined. This macro may be + * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and + * CUB_NS_QUALIFIER must all be set consistently. + */ +#ifndef CUB_NS_QUALIFIER +#define CUB_NS_QUALIFIER ::cub +#endif + +/** + * \def CUB_NAMESPACE_BEGIN + * This macro is used to open a `cub::` namespace block, along with any + * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. + * This macro is defined by CUB and may not be overridden. 
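How the wrapped-namespace machinery documented above is meant to be consumed, sketched with an assumed wrapper name myproject (every CUB symbol then lives under myproject::cub and is reached through CUB_NS_QUALIFIER):

// Define the wrapper before any CUB header is included
// (or pass -DCUB_WRAPPED_NAMESPACE=myproject on the compile line).
#define CUB_WRAPPED_NAMESPACE myproject
#include <cub/cub.cuh>

// CUB_NS_PREFIX/CUB_NS_POSTFIX now open and close `namespace myproject { ... }`
// around every CUB namespace block, and CUB_NS_QUALIFIER is ::myproject::cub.
CUB_NS_QUALIFIER::DoubleBuffer<int> keys;   // i.e. ::myproject::cub::DoubleBuffer<int>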
+ */ +#define CUB_NAMESPACE_BEGIN \ + CUB_NS_PREFIX \ + namespace cub \ + { + +/** + * \def CUB_NAMESPACE_END + * This macro is used to close a `cub::` namespace block, along with any + * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. + * This macro is defined by CUB and may not be overridden. + */ +#define CUB_NAMESPACE_END \ + } /* end namespace cub */ \ + CUB_NS_POSTFIX + // Declare these namespaces here for the purpose of Doxygenating them +CUB_NS_PREFIX /*! \namespace cub * \brief \p cub is the top-level namespace which contains all CUB @@ -55,5 +138,6 @@ */ namespace cub { - } + +CUB_NS_POSTFIX diff --git a/src/3rdparty/cub/util_ptx.cuh b/src/3rdparty/cub/util_ptx.cuh index 3f20c11..7b3ce7a 100644 --- a/src/3rdparty/cub/util_ptx.cuh +++ b/src/3rdparty/cub/util_ptx.cuh @@ -243,6 +243,15 @@ __device__ __forceinline__ int CTA_SYNC_AND(int p) } +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_OR(int p) +{ + return __syncthreads_or(p); +} + + /** * Warp barrier */ @@ -292,6 +301,7 @@ __device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_m #endif } + /** * Warp synchronous shfl_up */ @@ -340,6 +350,19 @@ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned return word; } +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __shfl_sync(member_mask, word, src_lane); +#else + return __shfl(word, src_lane); +#endif +} + /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ @@ -713,22 +736,5 @@ inline __device__ unsigned int MatchAny(unsigned int label) } - - - - - - - - - - - - - - - - - } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/src/3rdparty/cub/util_type.cuh b/src/3rdparty/cub/util_type.cuh index 0ba41e1..c380510 100644 --- a/src/3rdparty/cub/util_type.cuh +++ b/src/3rdparty/cub/util_type.cuh @@ -37,9 +37,12 @@ #include #include -#if (__CUDACC_VER_MAJOR__ >= 9) +#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ #include #endif +#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ + #include +#endif #include "util_macro.cuh" #include "util_arch.cuh" @@ -62,7 +65,7 @@ namespace cub { /****************************************************************************** - * Type equality + * Conditional types ******************************************************************************/ /** @@ -88,7 +91,7 @@ struct If /****************************************************************************** - * Conditional types + * Type equality ******************************************************************************/ /** @@ -358,7 +361,7 @@ struct UnitWord { enum { UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, - IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (int(ALIGN_BYTES) % int(UNIT_ALIGN_BYTES) == 0) }; }; @@ -1063,7 +1066,7 @@ struct FpLimits }; -#if (__CUDACC_VER_MAJOR__ >= 9) +#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ template <> struct FpLimits<__half> { @@ -1079,6 +1082,21 @@ struct FpLimits<__half> }; #endif +#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ +template <> +struct FpLimits<__nv_bfloat16> +{ + static __host__ __device__ __forceinline__ __nv_bfloat16 Max() { + unsigned 
short max_word = 0x7F7F; + return reinterpret_cast<__nv_bfloat16&>(max_word); + } + + static __host__ __device__ __forceinline__ __nv_bfloat16 Lowest() { + unsigned short lowest_word = 0xFF7F; + return reinterpret_cast<__nv_bfloat16&>(lowest_word); + } +}; +#endif /** * Basic type traits (fp primitive specialization) @@ -1143,9 +1161,12 @@ template <> struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; -#if (__CUDACC_VER_MAJOR__ >= 9) +#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ template <> struct NumericTraits<__half> : BaseTraits {}; #endif +#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ + template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; +#endif template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; diff --git a/src/3rdparty/cub/version.cuh b/src/3rdparty/cub/version.cuh index 122fb9a..0919222 100644 --- a/src/3rdparty/cub/version.cuh +++ b/src/3rdparty/cub/version.cuh @@ -43,7 +43,7 @@ * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 101000 +#define CUB_VERSION 101500 /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the