diff --git a/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh b/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh index c861a41..a90571d 100644 --- a/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh +++ b/src/3rdparty/cub/agent/agent_radix_sort_downsweep.cuh @@ -41,6 +41,7 @@ #include "../block/block_store.cuh" #include "../block/block_radix_rank.cuh" #include "../block/block_exchange.cuh" +#include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_type.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -56,16 +57,6 @@ namespace cub { * Tuning policy types ******************************************************************************/ -/** - * Radix ranking algorithm - */ -enum RadixRankAlgorithm -{ - RADIX_RANK_BASIC, - RADIX_RANK_MEMOIZE, - RADIX_RANK_MATCH -}; - /** * Parameterizable tuning policy type for AgentRadixSortDownsweep */ @@ -137,6 +128,9 @@ struct AgentRadixSortDownsweep RADIX_DIGITS = 1 << RADIX_BITS, KEYS_ONLY = Equals::VALUE, + LOAD_WARP_STRIPED = RANK_ALGORITHM == RADIX_RANK_MATCH || + RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY || + RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR, }; // Input iterator wrapper type (for applying cache modifier)s @@ -148,10 +142,22 @@ struct AgentRadixSortDownsweep BlockRadixRank, typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), BlockRadixRank, - BlockRadixRankMatch + typename If<(RANK_ALGORITHM == RADIX_RANK_MATCH), + BlockRadixRankMatch, + typename If<(RANK_ALGORITHM == RADIX_RANK_MATCH_EARLY_COUNTS_ANY), + BlockRadixRankMatchEarlyCounts, + BlockRadixRankMatchEarlyCounts + >::Type + >::Type >::Type >::Type BlockRadixRankT; + // Digit extractor type + typedef BFEDigitExtractor DigitExtractorT; + + enum { /// Number of bin-starting offsets tracked per thread @@ -184,11 +190,11 @@ struct AgentRadixSortDownsweep typename BlockLoadValuesT::TempStorage load_values; typename BlockRadixRankT::TempStorage radix_rank; - struct + struct KeysAndOffsets { UnsignedBits exchange_keys[TILE_ITEMS]; OffsetT relative_bin_offsets[RADIX_DIGITS]; - }; + } keys_and_offsets; Uninitialized exchange_values; @@ -216,11 +222,8 @@ struct AgentRadixSortDownsweep // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; - // The least-significant bit position of the current digit to extract - int current_bit; - - // Number of bits in current digit - int num_bits; + // Digit extractor + DigitExtractorT digit_extractor; // Whether to short-cirucit int short_circuit; @@ -243,7 +246,7 @@ struct AgentRadixSortDownsweep #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { - temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + temp_storage.keys_and_offsets.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; } CTA_SYNC(); @@ -251,9 +254,9 @@ struct AgentRadixSortDownsweep #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) { - UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; - UnsignedBits digit = BFE(key, current_bit, num_bits); - relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + UnsignedBits key = temp_storage.keys_and_offsets.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = digit_extractor.Digit(key); + relative_bin_offsets[ITEM] = temp_storage.keys_and_offsets.relative_bin_offsets[digit]; // Un-twiddle key = Traits::TwiddleOut(key); @@ -303,16 +306,15 @@ struct AgentRadixSortDownsweep } 
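The hunks above fold the loose (current_bit, num_bits) pair into a single digit_extractor member and route every digit extraction through its Digit() method, which keeps the ranking and scatter paths agnostic of how the digit is pulled out of the key. A minimal sketch of the interface these call sites rely on, assuming the BFEDigitExtractor added in radix_rank_sort_operations.cuh takes the same constructor arguments and exposes a Digit() member as shown above (the real definition may differ in detail):

// Sketch only: stands in for cub::BFEDigitExtractor from
// block/radix_rank_sort_operations.cuh; not the actual implementation.
template <typename UnsignedBits>
struct BFEDigitExtractorSketch
{
    unsigned int bit_start;  // least-significant bit of the current digit
    unsigned int num_bits;   // number of bits in the current digit

    __device__ __forceinline__
    BFEDigitExtractorSketch(unsigned int bit_start, unsigned int num_bits)
        : bit_start(bit_start), num_bits(num_bits) {}

    // Replaces the former BFE(key, current_bit, num_bits) call sites.
    __device__ __forceinline__ unsigned int Digit(UnsignedBits key) const
    {
        return static_cast<unsigned int>(
            (key >> bit_start) & ((UnsignedBits(1) << num_bits) - 1));
    }
};

Any extractor with this shape can be passed to the DigitExtractorT-templated RankKeys() overloads introduced later in this patch, which is what allows the same ranking code to serve multiple extraction schemes.
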
/** - * Load a tile of keys (specialized for full tile, any ranking algorithm) + * Load a tile of keys (specialized for full tile, block load) */ - template __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { BlockLoadKeysT(temp_storage.load_keys).Load( d_keys_in + block_offset, keys); @@ -322,16 +324,15 @@ struct AgentRadixSortDownsweep /** - * Load a tile of keys (specialized for partial tile, any ranking algorithm) + * Load a tile of keys (specialized for partial tile, block load) */ - template __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -345,7 +346,7 @@ struct AgentRadixSortDownsweep /** - * Load a tile of keys (specialized for full tile, match ranking algorithm) + * Load a tile of keys (specialized for full tile, warp-striped load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], @@ -353,14 +354,13 @@ struct AgentRadixSortDownsweep OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); } - /** - * Load a tile of keys (specialized for partial tile, match ranking algorithm) + * Load a tile of keys (specialized for partial tile, warp-striped load) */ __device__ __forceinline__ void LoadKeys( UnsignedBits (&keys)[ITEMS_PER_THREAD], @@ -368,7 +368,7 @@ struct AgentRadixSortDownsweep OffsetT valid_items, UnsignedBits oob_item, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -377,17 +377,15 @@ struct AgentRadixSortDownsweep LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); } - /** - * Load a tile of values (specialized for full tile, any ranking algorithm) + * Load a tile of values (specialized for full tile, block load) */ - template __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { BlockLoadValuesT(temp_storage.load_values).Load( d_values_in + block_offset, values); @@ -397,15 +395,14 @@ struct AgentRadixSortDownsweep /** - * Load a tile of values (specialized for partial tile, any ranking algorithm) + * Load a tile of values (specialized for partial tile, block load) */ - template __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -419,28 +416,27 @@ struct AgentRadixSortDownsweep /** - * Load a tile of items (specialized for full tile, match ranking algorithm) + * Load a tile of items (specialized for full tile, 
warp-striped load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); } - /** - * Load a tile of items (specialized for partial tile, match ranking algorithm) + * Load a tile of items (specialized for partial tile, warp-striped load) */ __device__ __forceinline__ void LoadValues( ValueT (&values)[ITEMS_PER_THREAD], OffsetT block_offset, OffsetT valid_items, Int2Type is_full_tile, - Int2Type rank_algorithm) + Int2Type warp_striped) { // Register pressure work-around: moving valid_items through shfl prevents compiler // from reusing guards/addressing from prior guarded loads @@ -449,7 +445,6 @@ struct AgentRadixSortDownsweep LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); } - /** * Truck along associated values */ @@ -470,7 +465,7 @@ struct AgentRadixSortDownsweep block_offset, valid_items, Int2Type(), - Int2Type()); + Int2Type()); ScatterValues( values, @@ -515,7 +510,7 @@ struct AgentRadixSortDownsweep valid_items, default_key, Int2Type(), - Int2Type()); + Int2Type()); // Twiddle key bits if necessary #pragma unroll @@ -529,8 +524,7 @@ struct AgentRadixSortDownsweep BlockRadixRankT(temp_storage.radix_rank).RankKeys( keys, ranks, - current_bit, - num_bits, + digit_extractor, exclusive_digit_prefix); CTA_SYNC(); @@ -586,7 +580,7 @@ struct AgentRadixSortDownsweep if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) { bin_offset[track] -= exclusive_digit_prefix[track]; - temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + temp_storage.keys_and_offsets.relative_bin_offsets[bin_idx] = bin_offset[track]; bin_offset[track] += inclusive_digit_prefix[track]; } } @@ -677,8 +671,7 @@ struct AgentRadixSortDownsweep d_values_in(d_values_in), d_keys_out(reinterpret_cast(d_keys_out)), d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), + digit_extractor(current_bit, num_bits), short_circuit(1) { #pragma unroll @@ -717,8 +710,7 @@ struct AgentRadixSortDownsweep d_values_in(d_values_in), d_keys_out(reinterpret_cast(d_keys_out)), d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), + digit_extractor(current_bit, num_bits), short_circuit(1) { #pragma unroll diff --git a/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh b/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh index c65773f..5865a60 100644 --- a/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh +++ b/src/3rdparty/cub/agent/agent_radix_sort_upsweep.cuh @@ -37,6 +37,7 @@ #include "../thread/thread_load.cuh" #include "../warp/warp_reduce.cuh" #include "../block/block_load.cuh" +#include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_type.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -121,7 +122,7 @@ struct AgentRadixSortUpsweep PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + LOG_COUNTER_LANES = CUB_MAX(0, int(RADIX_BITS) - int(LOG_PACKING_RATIO)), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // To prevent counter overflow, we must periodically unpack and aggregate the @@ -139,6 +140,9 @@ struct AgentRadixSortUpsweep // Input iterator wrapper type (for applying cache modifier)s typedef CacheModifiedInputIterator KeysItr; + // Digit extractor type + 
typedef BFEDigitExtractor DigitExtractorT; + /** * Shared memory storage layout */ @@ -167,12 +171,8 @@ struct AgentRadixSortUpsweep // Input and output device pointers KeysItr d_keys_in; - // The least-significant bit position of the current digit to extract - int current_bit; - - // Number of bits in current digit - int num_bits; - + // Digit extractor + DigitExtractorT digit_extractor; //--------------------------------------------------------------------- @@ -217,7 +217,7 @@ struct AgentRadixSortUpsweep UnsignedBits converted_key = Traits::TwiddleIn(key); // Extract current digit bits - UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + UnsignedBits digit = digit_extractor.Digit(converted_key); // Get sub-counter offset UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); @@ -342,8 +342,7 @@ struct AgentRadixSortUpsweep : temp_storage(temp_storage.Alias()), d_keys_in(reinterpret_cast(d_keys_in)), - current_bit(current_bit), - num_bits(num_bits) + digit_extractor(current_bit, num_bits) {} diff --git a/src/3rdparty/cub/agent/agent_scan.cuh b/src/3rdparty/cub/agent/agent_scan.cuh index 0781b3e..0abdb2b 100644 --- a/src/3rdparty/cub/agent/agent_scan.cuh +++ b/src/3rdparty/cub/agent/agent_scan.cuh @@ -100,12 +100,13 @@ struct AgentScan //--------------------------------------------------------------------- // The input value type - typedef typename std::iterator_traits::value_type InputT; + using InputT = typename std::iterator_traits::value_type; - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + // The output value type -- used as the intermediate accumulator + // Per https://wg21.link/P0571, use InitValueT if provided, otherwise the + // input iterator's value type. 
+ using OutputT = + typename If::VALUE, InputT, InitValueT>::Type; // Tile status descriptor interface type typedef ScanTileState ScanTileStateT; @@ -167,11 +168,11 @@ struct AgentScan typename BlockLoadT::TempStorage load; // Smem needed for tile loading typename BlockStoreT::TempStorage store; // Smem needed for tile storing - struct + struct ScanStorage { typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - }; + } scan_storage; }; // Alias wrapper allowing storage to be unioned @@ -204,7 +205,7 @@ struct AgentScan OutputT &block_aggregate, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); block_aggregate = scan_op(init_value, block_aggregate); } @@ -220,7 +221,7 @@ struct AgentScan OutputT &block_aggregate, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + BlockScanT(temp_storage.scan_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); } @@ -235,7 +236,7 @@ struct AgentScan PrefixCallback &prefix_op, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + BlockScanT(temp_storage.scan_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); } @@ -250,7 +251,7 @@ struct AgentScan PrefixCallback &prefix_op, Int2Type /*is_inclusive*/) { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + BlockScanT(temp_storage.scan_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); } @@ -293,9 +294,19 @@ struct AgentScan OutputT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + { + // Fill last element with the first element because collectives are + // not suffix guarded. + BlockLoadT(temp_storage.load) + .Load(d_in + tile_offset, + items, + num_remaining, + *(d_in + tile_offset)); + } else + { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + } CTA_SYNC(); @@ -311,7 +322,7 @@ struct AgentScan else { // Scan non-first tile - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.scan_storage.prefix, scan_op, tile_idx); ScanTile(items, scan_op, prefix_op, Int2Type()); } @@ -329,7 +340,7 @@ struct AgentScan * Scan tiles of items as part of a dynamic chained scan */ __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items + OffsetT num_items, ///< Total number of input items ScanTileStateT& tile_state, ///< Global tile state descriptor int start_tile) ///< The starting tile for the current grid { @@ -370,9 +381,19 @@ struct AgentScan OutputT items[ITEMS_PER_THREAD]; if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + { + // Fill last element with the first element because collectives are + // not suffix guarded. 
+ BlockLoadT(temp_storage.load) + .Load(d_in + tile_offset, + items, + valid_items, + *(d_in + tile_offset)); + } else + { BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + } CTA_SYNC(); diff --git a/src/3rdparty/cub/block/block_exchange.cuh b/src/3rdparty/cub/block/block_exchange.cuh index 35a0333..73bc629 100644 --- a/src/3rdparty/cub/block/block_exchange.cuh +++ b/src/3rdparty/cub/block/block_exchange.cuh @@ -102,6 +102,13 @@ namespace cub { * \par Performance Considerations * - Proper device-specific padding ensures zero bank conflicts for most types. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockExchange. */ template < typename InputT, @@ -472,7 +479,7 @@ private: { int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; + new (&temp_storage.buff[item_offset]) InputT (input_items[ITEM]); } WARP_SYNC(0xffffffff); @@ -482,7 +489,7 @@ private: { int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; + new(&output_items[ITEM]) OutputT(temp_storage.buff[item_offset]); } } diff --git a/src/3rdparty/cub/block/block_load.cuh b/src/3rdparty/cub/block/block_load.cuh index fc91f11..d8dd961 100644 --- a/src/3rdparty/cub/block/block_load.cuh +++ b/src/3rdparty/cub/block/block_load.cuh @@ -34,6 +34,7 @@ #pragma once #include +#include #include "block_exchange.cuh" #include "../iterator/cache_modified_input_iterator.cuh" @@ -364,7 +365,7 @@ __device__ __forceinline__ void LoadDirectWarpStriped( #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { - items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]; + new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } @@ -401,7 +402,7 @@ __device__ __forceinline__ void LoadDirectWarpStriped( { if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) { - items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]; + new(&items[ITEM]) InputT(block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)]); } } } @@ -472,6 +473,18 @@ enum BlockLoadAlgorithm */ BLOCK_LOAD_DIRECT, + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_STRIPED, + /** * \par Overview * @@ -507,7 +520,6 @@ enum BlockLoadAlgorithm */ BLOCK_LOAD_TRANSPOSE, - /** * \par Overview * @@ -528,7 +540,6 @@ enum BlockLoadAlgorithm */ BLOCK_LOAD_WARP_TRANSPOSE, - /** * \par Overview * @@ -572,6 +583,8 @@ enum BlockLoadAlgorithm * - BlockLoad can be optionally specialized by different data movement strategies: * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) +* -# cub::BLOCK_LOAD_STRIPED,. 
A [striped arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory using CUDA's built-in vectorized loads as a * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) @@ -616,6 +629,13 @@ enum BlockLoadAlgorithm * The set of \p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockLoad. */ template < typename InputT, @@ -703,6 +723,59 @@ private: }; + /** + * BLOCK_LOAD_STRIPED specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + /** * BLOCK_LOAD_VECTORIZE specialization of load helper */ @@ -865,7 +938,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -940,7 +1013,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -1223,6 +1296,17 @@ public: }; +template ::value_type> +struct BlockLoadType +{ + using type = 
cub::BlockLoad; +}; + } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/src/3rdparty/cub/block/block_radix_rank.cuh b/src/3rdparty/cub/block/block_radix_rank.cuh index a98976f..cf54d5f 100644 --- a/src/3rdparty/cub/block/block_radix_rank.cuh +++ b/src/3rdparty/cub/block/block_radix_rank.cuh @@ -38,6 +38,7 @@ #include "../thread/thread_reduce.cuh" #include "../thread/thread_scan.cuh" #include "../block/block_scan.cuh" +#include "../block/radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -49,6 +50,52 @@ CUB_NS_PREFIX /// CUB namespace namespace cub { + +/** + * \brief Radix ranking algorithm, the algorithm used to implement stable ranking of the + * keys from a single tile. Note that different ranking algorithms require different + * initial arrangements of keys to function properly. + */ +enum RadixRankAlgorithm +{ + /** Ranking using the BlockRadixRank algorithm with MEMOIZE_OUTER_SCAN == false. It + * uses thread-private histograms, and thus uses more shared memory. Requires blocked + * arrangement of keys. Does not support count callbacks. */ + RADIX_RANK_BASIC, + /** Ranking using the BlockRadixRank algorithm with MEMOIZE_OUTER_SCAN == + * true. Similar to RADIX_RANK BASIC, it requires blocked arrangement of + * keys and does not support count callbacks.*/ + RADIX_RANK_MEMOIZE, + /** Ranking using the BlockRadixRankMatch algorithm. It uses warp-private + * histograms and matching for ranking the keys in a single warp. Therefore, + * it uses less shared memory compared to RADIX_RANK_BASIC. It requires + * warp-striped key arrangement and supports count callbacks. */ + RADIX_RANK_MATCH, + /** Ranking using the BlockRadixRankMatchEarlyCounts algorithm with + * MATCH_ALGORITHM == WARP_MATCH_ANY. An alternative implementation of + * match-based ranking that computes bin counts early. Because of this, it + * works better with onesweep sorting, which requires bin counts for + * decoupled look-back. Assumes warp-striped key arrangement and supports + * count callbacks.*/ + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, + /** Ranking using the BlockRadixRankEarlyCounts algorithm with + * MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR. It uses extra space in shared + * memory to generate warp match masks using atomicOr(). This is faster when + * there are few matches, but can lead to slowdowns if the number of + * matching keys among warp lanes is high. Assumes warp-striped key + * arrangement and supports count callbacks. */ + RADIX_RANK_MATCH_EARLY_COUNTS_ATOMIC_OR +}; + + +/** Empty callback implementation */ +template +struct BlockRadixRankEmptyCallback +{ + __device__ __forceinline__ void operator()(int (&bins)[BINS_PER_THREAD]) {} +}; + + /** * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. * \ingroup BlockModule @@ -82,6 +129,14 @@ namespace cub { * { * * \endcode + * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockRadixRank. 
*/ template < int BLOCK_DIM_X, @@ -126,7 +181,7 @@ private: PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) @@ -346,12 +401,12 @@ public: */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit + DigitExtractorT digit_extractor) ///< [in] The digit extractor { DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem @@ -363,7 +418,7 @@ public: for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Get digit - unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + unsigned int digit = digit_extractor.Digit(keys[ITEM]); // Get sub-counter unsigned int sub_counter = digit >> LOG_COUNTER_LANES; @@ -395,6 +450,7 @@ public: CTA_SYNC(); // Extract the local ranks of each key + #pragma unroll for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // Add in thread block exclusive prefix @@ -408,16 +464,16 @@ public: */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit + DigitExtractorT digit_extractor, ///< [in] The digit extractor int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] { // Rank keys - RankKeys(keys, ranks, current_bit, num_bits); + RankKeys(keys, ranks, digit_extractor); // Get the inclusive and exclusive digit totals corresponding to the calling thread. #pragma unroll @@ -565,17 +621,63 @@ public: *********************************************************************/ //@{ + /** \brief Computes the count of keys for each digit value, and calls the + * callback with the array of key counts. + + * @tparam CountsCallback The callback type. It should implement an instance + * overload of operator()(int (&bins)[BINS_TRACKED_PER_THREAD]), where bins + * is an array of key counts for each digit value distributed in block + * distribution among the threads of the thread block. Key counts can be + * used, to update other data structures in global or shared + * memory. 
Depending on the implementation of the ranking algoirhtm + * (see BlockRadixRankMatchEarlyCounts), key counts may become available + * early, therefore, they are returned through a callback rather than a + * separate output parameter of RankKeys(). + */ + template + __device__ __forceinline__ void CallBack(CountsCallback callback) + { + int bins[BINS_TRACKED_PER_THREAD]; + // Get count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + const int TILE_ITEMS = KEYS_PER_THREAD * BLOCK_THREADS; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + bin_idx = RADIX_DIGITS - bin_idx - 1; + bins[track] = (bin_idx > 0 ? + temp_storage.aliasable.warp_digit_counters[bin_idx - 1][0] : TILE_ITEMS) - + temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + else + { + bins[track] = (bin_idx < RADIX_DIGITS - 1 ? + temp_storage.aliasable.warp_digit_counters[bin_idx + 1][0] : TILE_ITEMS) - + temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } + callback(bins); + } + /** * \brief Rank keys. */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT, + typename CountsCallback> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit + DigitExtractorT digit_extractor, ///< [in] The digit extractor + CountsCallback callback) { // Initialize shared digit counters @@ -595,7 +697,7 @@ public: for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) { // My digit - uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + uint32_t digit = digit_extractor.Digit(keys[ITEM]); if (IS_DESCENDING) digit = RADIX_DIGITS - digit - 1; @@ -648,6 +750,10 @@ public: temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; CTA_SYNC(); + if (!Equals>::VALUE) + { + CallBack(callback); + } // Seed ranks with counter values from previous warps #pragma unroll @@ -655,21 +761,34 @@ public: ranks[ITEM] += *digit_counters[ITEM]; } + template < + typename UnsignedBits, + int KEYS_PER_THREAD, + typename DigitExtractorT> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor) + { + RankKeys(keys, ranks, digit_extractor, + BlockRadixRankEmptyCallback()); + } /** * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. */ template < typename UnsignedBits, - int KEYS_PER_THREAD> + int KEYS_PER_THREAD, + typename DigitExtractorT, + typename CountsCallback> __device__ __forceinline__ void RankKeys( UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + DigitExtractorT digit_extractor, ///< [in] The digit extractor + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + CountsCallback callback) { - RankKeys(keys, ranks, current_bit, num_bits); + RankKeys(keys, ranks, digit_extractor, callback); // Get exclusive count for each digit #pragma unroll @@ -686,6 +805,326 @@ public: } } } + + template < + typename UnsignedBits, + int KEYS_PER_THREAD, + typename DigitExtractorT> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix, + BlockRadixRankEmptyCallback()); + } +}; + +enum WarpMatchAlgorithm +{ + WARP_MATCH_ANY, + WARP_MATCH_ATOMIC_OR +}; + +/** + * Radix-rank using matching which computes the counts of keys for each digit + * value early, at the expense of doing more work. This may be useful e.g. for + * decoupled look-back, where it reduces the time other thread blocks need to + * wait for digit counts to become available. + */ +template +struct BlockRadixRankMatchEarlyCounts +{ + // constants + enum + { + BLOCK_THREADS = BLOCK_DIM_X, + RADIX_DIGITS = 1 << RADIX_BITS, + BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS, + BINS_TRACKED_PER_THREAD = BINS_PER_THREAD, + FULL_BINS = BINS_PER_THREAD * BLOCK_THREADS == RADIX_DIGITS, + WARP_THREADS = CUB_PTX_WARP_THREADS, + BLOCK_WARPS = BLOCK_THREADS / WARP_THREADS, + WARP_MASK = ~0, + NUM_MATCH_MASKS = MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR ? BLOCK_WARPS : 0, + // Guard against declaring zero-sized array: + MATCH_MASKS_ALLOC_SIZE = NUM_MATCH_MASKS < 1 ? 1 : NUM_MATCH_MASKS, + }; + + // types + typedef cub::BlockScan BlockScan; + + + + // temporary storage + struct TempStorage + { + union + { + int warp_offsets[BLOCK_WARPS][RADIX_DIGITS]; + int warp_histograms[BLOCK_WARPS][RADIX_DIGITS][NUM_PARTS]; + }; + + int match_masks[MATCH_MASKS_ALLOC_SIZE][RADIX_DIGITS]; + + typename BlockScan::TempStorage prefix_tmp; + }; + + TempStorage& temp_storage; + + // internal ranking implementation + template + struct BlockRadixRankMatchInternal + { + TempStorage& s; + DigitExtractorT digit_extractor; + CountsCallback callback; + int warp; + int lane; + + __device__ __forceinline__ int Digit(UnsignedBits key) + { + int digit = digit_extractor.Digit(key); + return IS_DESCENDING ? RADIX_DIGITS - 1 - digit : digit; + } + + __device__ __forceinline__ int ThreadBin(int u) + { + int bin = threadIdx.x * BINS_PER_THREAD + u; + return IS_DESCENDING ? 
RADIX_DIGITS - 1 - bin : bin; + } + + __device__ __forceinline__ + void ComputeHistogramsWarp(UnsignedBits (&keys)[KEYS_PER_THREAD]) + { + //int* warp_offsets = &s.warp_offsets[warp][0]; + int (&warp_histograms)[RADIX_DIGITS][NUM_PARTS] = s.warp_histograms[warp]; + // compute warp-private histograms + #pragma unroll + for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) + { + #pragma unroll + for (int part = 0; part < NUM_PARTS; ++part) + { + warp_histograms[bin][part] = 0; + } + } + if (MATCH_ALGORITHM == WARP_MATCH_ATOMIC_OR) + { + int* match_masks = &s.match_masks[warp][0]; + #pragma unroll + for (int bin = lane; bin < RADIX_DIGITS; bin += WARP_THREADS) + { + match_masks[bin] = 0; + } + } + WARP_SYNC(WARP_MASK); + + // compute private per-part histograms + int part = lane % NUM_PARTS; + #pragma unroll + for (int u = 0; u < KEYS_PER_THREAD; ++u) + { + atomicAdd(&warp_histograms[Digit(keys[u])][part], 1); + } + + // sum different parts; + // no extra work is necessary if NUM_PARTS == 1 + if (NUM_PARTS > 1) + { + WARP_SYNC(WARP_MASK); + // TODO: handle RADIX_DIGITS % WARP_THREADS != 0 if it becomes necessary + const int WARP_BINS_PER_THREAD = RADIX_DIGITS / WARP_THREADS; + int bins[WARP_BINS_PER_THREAD]; + #pragma unroll + for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) + { + int bin = lane + u * WARP_THREADS; + bins[u] = internal::ThreadReduce(warp_histograms[bin], Sum()); + } + CTA_SYNC(); + + // store the resulting histogram in shared memory + int* warp_offsets = &s.warp_offsets[warp][0]; + #pragma unroll + for (int u = 0; u < WARP_BINS_PER_THREAD; ++u) + { + int bin = lane + u * WARP_THREADS; + warp_offsets[bin] = bins[u]; + } + } + } + + __device__ __forceinline__ + void ComputeOffsetsWarpUpsweep(int (&bins)[BINS_PER_THREAD]) + { + // sum up warp-private histograms + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + bins[u] = 0; + int bin = ThreadBin(u); + if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) + { + #pragma unroll + for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) + { + int warp_offset = s.warp_offsets[j_warp][bin]; + s.warp_offsets[j_warp][bin] = bins[u]; + bins[u] += warp_offset; + } + } + } + } + + __device__ __forceinline__ + void ComputeOffsetsWarpDownsweep(int (&offsets)[BINS_PER_THREAD]) + { + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + int bin = ThreadBin(u); + if (FULL_BINS || (bin >= 0 && bin < RADIX_DIGITS)) + { + int digit_offset = offsets[u]; + #pragma unroll + for (int j_warp = 0; j_warp < BLOCK_WARPS; ++j_warp) + { + s.warp_offsets[j_warp][bin] += digit_offset; + } + } + } + } + + __device__ __forceinline__ + void ComputeRanksItem( + UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], + Int2Type) + { + // compute key ranks + int lane_mask = 1 << lane; + int* warp_offsets = &s.warp_offsets[warp][0]; + int* match_masks = &s.match_masks[warp][0]; + #pragma unroll + for (int u = 0; u < KEYS_PER_THREAD; ++u) + { + int bin = Digit(keys[u]); + int* p_match_mask = &match_masks[bin]; + atomicOr(p_match_mask, lane_mask); + WARP_SYNC(WARP_MASK); + int bin_mask = *p_match_mask; + int leader = (WARP_THREADS - 1) - __clz(bin_mask); + int warp_offset = 0; + int popc = __popc(bin_mask & LaneMaskLe()); + if (lane == leader) + { + // atomic is a bit faster + warp_offset = atomicAdd(&warp_offsets[bin], popc); + } + warp_offset = SHFL_IDX_SYNC(warp_offset, leader, bin_mask); + if (lane == leader) *p_match_mask = 0; + WARP_SYNC(WARP_MASK); + ranks[u] = warp_offset + popc - 1; + } + } + + __device__ 
__forceinline__ + void ComputeRanksItem( + UnsignedBits (&keys)[KEYS_PER_THREAD], int (&ranks)[KEYS_PER_THREAD], + Int2Type) + { + // compute key ranks + int* warp_offsets = &s.warp_offsets[warp][0]; + #pragma unroll + for (int u = 0; u < KEYS_PER_THREAD; ++u) + { + int bin = Digit(keys[u]); + int bin_mask = MatchAny(bin); + int leader = (WARP_THREADS - 1) - __clz(bin_mask); + int warp_offset = 0; + int popc = __popc(bin_mask & LaneMaskLe()); + if (lane == leader) + { + // atomic is a bit faster + warp_offset = atomicAdd(&warp_offsets[bin], popc); + } + warp_offset = SHFL_IDX_SYNC(warp_offset, leader, bin_mask); + ranks[u] = warp_offset + popc - 1; + } + } + + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + int (&exclusive_digit_prefix)[BINS_PER_THREAD]) + { + ComputeHistogramsWarp(keys); + + CTA_SYNC(); + int bins[BINS_PER_THREAD]; + ComputeOffsetsWarpUpsweep(bins); + callback(bins); + + BlockScan(s.prefix_tmp).ExclusiveSum(bins, exclusive_digit_prefix); + + ComputeOffsetsWarpDownsweep(exclusive_digit_prefix); + CTA_SYNC(); + ComputeRanksItem(keys, ranks, Int2Type()); + } + + __device__ __forceinline__ BlockRadixRankMatchInternal + (TempStorage& temp_storage, DigitExtractorT digit_extractor, CountsCallback callback) + : s(temp_storage), digit_extractor(digit_extractor), + callback(callback), warp(threadIdx.x / WARP_THREADS), lane(LaneId()) + {} + }; + + __device__ __forceinline__ BlockRadixRankMatchEarlyCounts + (TempStorage& temp_storage) : temp_storage(temp_storage) {} + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_PER_THREAD], + CountsCallback callback) + { + BlockRadixRankMatchInternal + internal(temp_storage, digit_extractor, callback); + internal.RankKeys(keys, ranks, exclusive_digit_prefix); + } + + template + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_PER_THREAD]) + { + typedef BlockRadixRankEmptyCallback CountsCallback; + BlockRadixRankMatchInternal + internal(temp_storage, digit_extractor, CountsCallback()); + internal.RankKeys(keys, ranks, exclusive_digit_prefix); + } + + template + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor) + { + int exclusive_digit_prefix[BINS_PER_THREAD]; + RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix); + } }; diff --git a/src/3rdparty/cub/block/block_radix_sort.cuh b/src/3rdparty/cub/block/block_radix_sort.cuh index e666902..213b7f2 100644 --- a/src/3rdparty/cub/block/block_radix_sort.cuh +++ b/src/3rdparty/cub/block/block_radix_sort.cuh @@ -36,6 +36,7 @@ #include "block_exchange.cuh" #include "block_radix_rank.cuh" +#include "radix_rank_sort_operations.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" @@ -63,21 +64,60 @@ namespace cub { * \tparam PTX_ARCH [optional] \ptxversion * * \par Overview - * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. 
It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - BlockRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Within each key, the implementation treats fixed-length - * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - \rowmajor + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \rowmajor + * + * \par Supported Types + * BlockRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. + * + * \par Floating-Point Special Cases + * + * - Positive and negative zeros are considered equivalent, and will be treated + * as such in the output. + * - No special handling is implemented for NaN values; these are sorted + * according to their bit representations after any transformations. + * + * \par Bitwise Key Transformations + * Although the direct radix sorting method can only be applied to unsigned + * integral types, BlockRadixSort is able to sort signed and floating-point + * types via simple bit-wise transformations that ensure lexicographic key + * ordering. + * + * These transformations must be considered when restricting the + * `[begin_bit, end_bit)` range, as the bitwise transformations will occur + * before the bit-range truncation. + * + * Any transformations applied to the keys prior to sorting are reversed + * while writing to the final output buffer. + * + * \par Type Specific Bitwise Transformations + * To convert the input values into a radix-sortable bitwise representation, + * the following transformations take place prior to sorting: + * + * - For unsigned integral values, the keys are used directly. + * - For signed integral values, the sign bit is inverted. + * - For positive floating point values, the sign bit is inverted. + * - For negative floating point values, the full key is inverted. + * + * \par No Descending Sort Transformations + * Unlike `DeviceRadixSort`, `BlockRadixSort` does not invert the input key bits + * when performing a descending sort. Instead, it has special logic to reverse + * the order of the keys while sorting. + * + * \par Stability + * BlockRadixSort is stable. For floating-point types -0.0 and +0.0 + * are considered equal and appear in the result in the same order as they + * appear in the input. 
+ * * * \par Performance Considerations * - \granularity @@ -115,6 +155,13 @@ namespace cub { * corresponding output \p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockRadixSort. */ template < typename KeyT, @@ -175,6 +222,9 @@ private: PTX_ARCH> DescendingBlockRadixRank; + /// Digit extractor type + typedef BFEDigitExtractor DigitExtractorT; + /// BlockExchange utility type for keys typedef BlockExchange BlockExchangeKeys; @@ -216,30 +266,26 @@ private: __device__ __forceinline__ void RankKeys( UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], - int begin_bit, - int pass_bits, + DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( - unsigned_keys, - ranks, - begin_bit, - pass_bits); + unsigned_keys, + ranks, + digit_extractor); } /// Rank keys (specialized for descending sort) __device__ __forceinline__ void RankKeys( UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], int (&ranks)[ITEMS_PER_THREAD], - int begin_bit, - int pass_bits, + DigitExtractorT digit_extractor, Int2Type /*is_descending*/) { DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( - unsigned_keys, - ranks, - begin_bit, - pass_bits); + unsigned_keys, + ranks, + digit_extractor); } /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) @@ -301,10 +347,11 @@ private: while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + DigitExtractorT digit_extractor(begin_bit, pass_bits); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); @@ -357,10 +404,11 @@ public: while (true) { int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + DigitExtractorT digit_extractor(begin_bit, pass_bits); // Rank the blocked keys int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + RankKeys(unsigned_keys, ranks, digit_extractor, is_descending); begin_bit += RADIX_BITS; CTA_SYNC(); diff --git a/src/3rdparty/cub/block/block_scan.cuh b/src/3rdparty/cub/block/block_scan.cuh index 513ef35..208d5f8 100644 --- a/src/3rdparty/cub/block/block_scan.cuh +++ b/src/3rdparty/cub/block/block_scan.cuh @@ -181,6 +181,13 @@ enum BlockScanAlgorithm * The corresponding output \p thread_data in those threads will be * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. * + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockScan. 
*/ template < typename T, diff --git a/src/3rdparty/cub/block/block_store.cuh b/src/3rdparty/cub/block/block_store.cuh index 495a155..df654ea 100644 --- a/src/3rdparty/cub/block/block_store.cuh +++ b/src/3rdparty/cub/block/block_store.cuh @@ -34,6 +34,7 @@ #pragma once #include +#include #include "block_exchange.cuh" #include "../config.cuh" @@ -364,6 +365,17 @@ enum BlockStoreAlgorithm */ BLOCK_STORE_DIRECT, + /** + * \par Overview + * A [striped arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_STRIPED, + /** * \par Overview * @@ -432,7 +444,6 @@ enum BlockStoreAlgorithm * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. */ BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - }; @@ -445,7 +456,6 @@ enum BlockStoreAlgorithm * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam PTX_ARCH [optional] \ptxversion @@ -457,6 +467,8 @@ enum BlockStoreAlgorithm * - BlockStore can be optionally specialized by different data movement strategies: * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3) + * of data is written directly to memory. [More...](\ref cub::BlockStoreAlgorithm) * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) * of data is written directly to memory using CUDA's built-in vectorized stores as a * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) @@ -466,6 +478,10 @@ enum BlockStoreAlgorithm * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. To reduce the shared memory requireent, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. [More...](\ref cub::BlockStoreAlgorithm) * - \rowmajor * * \par A Simple Example @@ -502,6 +518,13 @@ enum BlockStoreAlgorithm * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... 
* + * \par Re-using dynamically allocating shared memory + * The following example under the examples/block folder illustrates usage of + * dynamically shared memory with BlockReduce and how to re-purpose + * the same memory region: + * example_block_reduce_dyn_smem.cu + * + * This example can be easily adapted to the storage required by BlockStore. */ template < typename T, @@ -576,6 +599,47 @@ private: }; + /** + * BLOCK_STORE_STRIPED specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectStriped(linear_tid, block_itr, items, valid_items); + } + }; + + /** * BLOCK_STORE_VECTORIZE specialization of store helper */ @@ -697,7 +761,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -765,7 +829,7 @@ private: }; // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + CUB_STATIC_ASSERT((int(BLOCK_THREADS) % int(WARP_THREADS) == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys typedef BlockExchange BlockExchange; @@ -820,6 +884,7 @@ private: } }; + /****************************************************************************** * Type definitions ******************************************************************************/ @@ -993,6 +1058,16 @@ public: } }; +template ::value_type> +struct BlockStoreType +{ + using type = cub::BlockStore; +}; } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/src/3rdparty/cub/block/specializations/block_scan_raking.cuh b/src/3rdparty/cub/block/specializations/block_scan_raking.cuh index 1d6c2f7..8f20818 100644 --- a/src/3rdparty/cub/block/specializations/block_scan_raking.cuh +++ b/src/3rdparty/cub/block/specializations/block_scan_raking.cuh @@ -84,7 +84,7 @@ struct BlockScanRaking SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + WARP_SYNCHRONOUS = (int(BLOCK_THREADS) == int(RAKING_THREADS)), }; /// WarpScan utility type diff --git a/src/3rdparty/cub/device/device_segmented_radix_sort.cuh b/src/3rdparty/cub/device/device_segmented_radix_sort.cuh index 2ab2a7d..401bc1f 100644 --- 
a/src/3rdparty/cub/device/device_segmented_radix_sort.cuh +++ b/src/3rdparty/cub/device/device_segmented_radix_sort.cuh @@ -60,13 +60,9 @@ namespace cub { * of the symbolic alphabet, the radix sorting method produces a lexicographic * ordering of those keys. * - * \par - * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Although the direct radix sorting - * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. + * \par See Also + * DeviceSegmentedRadixSort shares its implementation with DeviceRadixSort. See + * that algorithm's documentation for more information. * * \par Usage Considerations * \cdp_class{DeviceSegmentedRadixSort} @@ -130,14 +126,16 @@ struct DeviceSegmentedRadixSort * * \endcode * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -148,8 +146,8 @@ struct DeviceSegmentedRadixSort ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. @@ -161,7 +159,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -241,12 +239,14 @@ struct DeviceSegmentedRadixSort * * \tparam KeyT [inferred] Key type * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairs( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -255,8 +255,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
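The SortPairs overload above follows CUB's usual two-phase temporary-storage convention: the first call with a NULL d_temp_storage only writes the required allocation size (as stated in the parameter documentation), and the second call does the actual sorting. A minimal sketch, illustrative only; all device arrays, including the num_segments + 1 offsets in d_offsets, are assumed to be allocated and populated by the caller:

    #include <cub/cub.cuh>

    cudaError_t SegmentedSortPairsExample(
        const int   *d_keys_in,   int   *d_keys_out,
        const float *d_values_in, float *d_values_out,
        int num_items, int num_segments, const int *d_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // First call: d_temp_storage == NULL, so only the required size is computed.
        cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out,
            num_items, num_segments, d_offsets, d_offsets + 1);

        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Second call: performs the segmented ascending sort of (key, value) pairs.
        cudaError_t error = cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out,
            num_items, num_segments, d_offsets, d_offsets + 1);

        cudaFree(d_temp_storage);
        return error;
    }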
@@ -265,7 +265,7 @@ struct DeviceSegmentedRadixSort // Signed integer type for global offsets typedef int OffsetT; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -334,12 +334,14 @@ struct DeviceSegmentedRadixSort * * \tparam KeyT [inferred] Key type * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -350,8 +352,8 @@ struct DeviceSegmentedRadixSort ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
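For the DoubleBuffer overload above, the caller supplies two device buffers per sequence and, upon return, each DoubleBuffer is updated so that Current() references the sorted output (either buffer of a pair may have been written to). A sketch under the same assumptions as before, with illustrative buffer names:

    cudaError_t SegmentedSortPairsDoubleBuffer(
        int   *d_key_buf,   int   *d_key_alt_buf,
        float *d_value_buf, float *d_value_alt_buf,
        int num_items, int num_segments, const int *d_offsets)
    {
        cub::DoubleBuffer<int>   d_keys(d_key_buf, d_key_alt_buf);
        cub::DoubleBuffer<float> d_values(d_value_buf, d_value_alt_buf);

        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query, then allocation, then the actual sort.
        cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes, d_keys, d_values,
            num_items, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        cudaError_t error = cub::DeviceSegmentedRadixSort::SortPairs(
            d_temp_storage, temp_storage_bytes, d_keys, d_values,
            num_items, num_segments, d_offsets, d_offsets + 1);

        // d_keys.Current() / d_values.Current() now reference the sorted sequences.
        cudaFree(d_temp_storage);
        return error;
    }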
@@ -363,7 +365,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -443,12 +445,14 @@ struct DeviceSegmentedRadixSort * * \tparam KeyT [inferred] Key type * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, typename ValueT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortPairsDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -457,8 +461,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
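One motivation for splitting OffsetIteratorT into BeginOffsetIteratorT and EndOffsetIteratorT, as in the overload above, is that the two offset sequences no longer need to share a type; for example, ending offsets can be computed on the fly instead of being materialized. A hedged sketch assuming fixed-length segments of seg_len keys each (names are illustrative):

    #include <cub/cub.cuh>
    #include <thrust/iterator/transform_iterator.h>

    // Functor that derives a segment's ending offset from its beginning offset.
    struct AddSegLen
    {
        int seg_len;
        __host__ __device__ int operator()(int begin) const { return begin + seg_len; }
    };

    cudaError_t SortPairsDescendingFixedLenSegments(
        void *d_temp_storage, size_t &temp_storage_bytes,
        const unsigned int *d_keys_in, unsigned int *d_keys_out,
        const int *d_values_in, int *d_values_out,
        int num_items, int num_segments, int *d_begin_offsets, int seg_len)
    {
        // Ending offsets are generated lazily from the beginning offsets.
        auto d_end_offsets =
            thrust::make_transform_iterator(d_begin_offsets, AddSegLen{seg_len});

        // BeginOffsetIteratorT deduces to int*, EndOffsetIteratorT to the transform
        // iterator type -- the two template parameters no longer have to match.
        // The usual two-phase temporary-storage pattern applies to this call as well.
        return cub::DeviceSegmentedRadixSort::SortPairsDescending(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out, d_values_in, d_values_out,
            num_items, num_segments, d_begin_offsets, d_end_offsets);
    }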
@@ -467,7 +471,7 @@ struct DeviceSegmentedRadixSort // Signed integer type for global offsets typedef int OffsetT; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -536,11 +540,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -549,8 +555,8 @@ struct DeviceSegmentedRadixSort KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
@@ -563,7 +569,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -635,11 +641,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeys( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -647,8 +655,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
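The optional begin_bit/end_bit parameters documented in the overloads above restrict sorting to a sub-range of key bits, skipping radix passes over bits that are known not to differ. A sketch assuming unsigned int keys whose meaningful bits lie in positions [0, 20); d_temp_storage and temp_storage_bytes are assumed to be managed by the caller with the usual two-phase pattern:

    cudaError_t SegmentedSortKeysLow20Bits(
        void *d_temp_storage, size_t &temp_storage_bytes,
        cub::DoubleBuffer<unsigned int> &d_keys,
        int num_items, int num_segments, const int *d_offsets)
    {
        // begin_bit is inclusive, end_bit is exclusive: only bits [0, 20) are sorted.
        return cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes, d_keys,
            num_items, num_segments, d_offsets, d_offsets + 1,
            0 /* begin_bit */, 20 /* end_bit */);
    }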
@@ -660,7 +668,7 @@ struct DeviceSegmentedRadixSort // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -724,11 +732,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -737,8 +747,8 @@ struct DeviceSegmentedRadixSort KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
@@ -750,7 +760,7 @@ struct DeviceSegmentedRadixSort DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, @@ -822,11 +832,13 @@ struct DeviceSegmentedRadixSort * \endcode * * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam BeginOffsetIteratorT [inferred] Random-access input iterator type for reading segment beginning offsets \iterator + * \tparam EndOffsetIteratorT [inferred] Random-access input iterator type for reading segment ending offsets \iterator */ template < typename KeyT, - typename OffsetIteratorT> + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT> CUB_RUNTIME_FUNCTION static cudaError_t SortKeysDescending( void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. @@ -834,8 +846,8 @@ struct DeviceSegmentedRadixSort DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys int num_items, ///< [in] The total number of items to sort (across all segments) int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
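The empty-segment rule repeated in the offset parameters above ("If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty") is easiest to see with concrete, made-up offsets:

    // Eight keys split into four segments, the third of which is empty:
    //   segment 0 -> items [0, 3)
    //   segment 1 -> items [3, 7)
    //   segment 2 -> items [7, 7)   (empty)
    //   segment 3 -> items [7, 8)
    int h_offsets[] = { 0, 3, 7, 7, 8 };   // num_segments + 1 = 5 entries

    // With d_begin_offsets = offsets and d_end_offsets = offsets + 1, segment 2
    // satisfies d_end_offsets[2] - 1 <= d_begin_offsets[2] (6 <= 7), so it is
    // treated as empty and contributes no items to the sort.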
@@ -847,7 +859,7 @@ struct DeviceSegmentedRadixSort // Null value type DoubleBuffer d_values; - return DispatchSegmentedRadixSort::Dispatch( + return DispatchSegmentedRadixSort::Dispatch( d_temp_storage, temp_storage_bytes, d_keys, diff --git a/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh b/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh index 2b0919f..263d610 100644 --- a/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/src/3rdparty/cub/device/dispatch/dispatch_radix_sort.cuh @@ -37,6 +37,8 @@ #include #include +#include "../../agent/agent_radix_sort_histogram.cuh" +#include "../../agent/agent_radix_sort_onesweep.cuh" #include "../../agent/agent_radix_sort_upsweep.cuh" #include "../../agent/agent_radix_sort_downsweep.cuh" #include "../../agent/agent_scan.cuh" @@ -46,15 +48,21 @@ #include "../../util_type.cuh" #include "../../util_debug.cuh" #include "../../util_device.cuh" +#include "../../util_math.cuh" #include -/// Optional outer namespace(s) -CUB_NS_PREFIX +// suppress warnings triggered by #pragma unroll: +// "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]" +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wpass-failed" +#endif /// CUB namespace namespace cub { + /****************************************************************************** * Kernel entry points *****************************************************************************/ @@ -69,8 +77,8 @@ template < typename KeyT, ///< Key type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) + int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : + int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortUpsweepKernel( const KeyT *d_keys, ///< [in] Input keys buffer OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) @@ -156,6 +164,13 @@ __global__ void RadixSortScanBinsKernel( block_scan.template ConsumeTile(block_offset, prefix_op); block_offset += AgentScanT::TILE_ITEMS; } + + // Process the remaining partial tile (if any). + if (block_offset < num_counts) + { + block_scan.template ConsumeTile(block_offset, prefix_op, + num_counts - block_offset); + } } @@ -170,8 +185,8 @@ template < typename ValueT, ///< Value type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? 
- ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) + int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : + int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) __global__ void DeviceRadixSortDownsweepKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer @@ -345,7 +360,8 @@ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator + typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : @@ -355,8 +371,8 @@ __global__ void DeviceSegmentedRadixSortKernel( KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer ValueT *d_values_out, ///< [in] Output values buffer - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data int current_bit, ///< [in] Bit position of current radix digit int pass_bits) ///< [in] Number of bits of current radix digit @@ -498,6 +514,96 @@ __global__ void DeviceSegmentedRadixSortKernel( } +/****************************************************************************** + * Onesweep kernels + ******************************************************************************/ + +/** + * Kernel for computing multiple histograms + */ + +/** + * Histogram kernel + */ +template < + typename ChainedPolicyT, + bool IS_DESCENDING, + typename KeyT, + typename OffsetT> +__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) +DeviceRadixSortHistogramKernel + (OffsetT* d_bins_out, const KeyT* d_keys_in, OffsetT num_items, int start_bit, int end_bit) +{ + typedef typename ChainedPolicyT::ActivePolicy::HistogramPolicy HistogramPolicyT; + typedef AgentRadixSortHistogram AgentT; + __shared__ typename AgentT::TempStorage temp_storage; + AgentT agent(temp_storage, d_bins_out, d_keys_in, num_items, start_bit, end_bit); + agent.Process(); +} + +template < + typename ChainedPolicyT, + bool IS_DESCENDING, + typename KeyT, + typename ValueT, + typename OffsetT, + typename AtomicOffsetT = OffsetT> +__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) +DeviceRadixSortOnesweepKernel + (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, + const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, + const ValueT* d_values_in, OffsetT num_items, int current_bit, int num_bits) +{ + typedef typename ChainedPolicyT::ActivePolicy::OnesweepPolicy OnesweepPolicyT; + typedef AgentRadixSortOnesweep AgentT; + __shared__ typename AgentT::TempStorage s; + + AgentT agent(s, d_lookback, d_ctrs, d_bins_out, d_bins_in, d_keys_out, d_keys_in, + d_values_out, d_values_in, num_items, current_bit, num_bits); + agent.Process(); +} + + +/** + * Exclusive sum kernel + */ +template < + typename ChainedPolicyT, + typename OffsetT> +__global__ void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) +{ + typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT; + const int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS; + const int RADIX_DIGITS = 1 << RADIX_BITS; + const int BLOCK_THREADS = ExclusiveSumPolicyT::BLOCK_THREADS; + const int BINS_PER_THREAD = (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS; + typedef cub::BlockScan BlockScan; + __shared__ typename BlockScan::TempStorage temp_storage; + + // load the bins + OffsetT bins[BINS_PER_THREAD]; + int bin_start = blockIdx.x * RADIX_DIGITS; + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + int bin = threadIdx.x * BINS_PER_THREAD + u; + if (bin >= RADIX_DIGITS) break; + bins[u] = d_bins[bin_start + bin]; + } + + // compute offsets + BlockScan(temp_storage).ExclusiveSum(bins, bins); + + // store the offsets + #pragma unroll + for (int u = 0; u < BINS_PER_THREAD; ++u) + { + int bin = threadIdx.x * BINS_PER_THREAD + u; + if (bin >= RADIX_DIGITS) break; + d_bins[bin_start + bin] = bins[u]; + } +} + /****************************************************************************** * Policy @@ -529,102 +635,25 @@ struct DeviceRadixSortPolicy // Architecture-specific tuning policies //------------------------------------------------------------------------------ - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy200> - { 
- enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - - // Relative size of KeyT type to a 4-byte word - SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <64, 18, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <128, 13, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <512, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <64, 18, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, 13, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + ONESWEEP = false, + ONESWEEP_RADIX_BITS = 8, }; - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <256, 7, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <256, 5, DominantT, LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, 14, DominantT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, 10, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) - }; + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, + ONESWEEP_RADIX_BITS> OnesweepPolicy; // Scan policy typedef AgentScanPolicy <1024, 4, OffsetT, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; @@ -663,8 +692,21 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.1B 32b segmented keys/s (TitanX) + ONESWEEP = false, + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 21, DominantT, 1, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_STORE_DIRECT, + ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -692,8 +734,21 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 10.0B 32b keys/s (GP100, 64M random keys) + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -722,8 +777,21 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -750,8 +818,21 @@ struct DeviceRadixSortPolicy enum { PRIMARY_RADIX_BITS = 5, ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, 30, DominantT, 2, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -779,8 +860,65 @@ struct DeviceRadixSortPolicy PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 8.7B 32b segmented keys/s (GV100) + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), // 15.8B 32b keys/s (V100-SXM2, 64M random keys) + ONESWEEP_RADIX_BITS = 8, }; + // Histogram policy + typedef AgentRadixSortHistogramPolicy <256, 8, 8, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <256, + sizeof(KeyT) == 4 && sizeof(ValueT) == 4 ? 46 : 23, DominantT, 4, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_WARP_SCANS, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + + + // ScanPolicy + typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy <512, 23, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy <(sizeof(KeyT) > 1) ? 
256 : 128, 47, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, 23, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy <256, 47, DominantT, LOAD_DEFAULT, PRIMARY_RADIX_BITS - 1> AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy <256, 19, DominantT, BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy <192, 39, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS> SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy <384, 11, DominantT, BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy; + }; + + + /// SM80 + struct Policy800 : ChainedPolicy<800, Policy800, Policy700> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + ONESWEEP = sizeof(KeyT) >= sizeof(uint32_t), + ONESWEEP_RADIX_BITS = 8, + }; + + // Histogram policy + typedef AgentRadixSortHistogramPolicy <128, 16, 1, KeyT, ONESWEEP_RADIX_BITS> HistogramPolicy; + + // Exclusive sum policy + typedef AgentRadixSortExclusiveSumPolicy <256, ONESWEEP_RADIX_BITS> ExclusiveSumPolicy; + + // Onesweep policy + typedef AgentRadixSortOnesweepPolicy <384, 21, DominantT, 1, + RADIX_RANK_MATCH_EARLY_COUNTS_ANY, BLOCK_SCAN_RAKING_MEMOIZE, + RADIX_SORT_STORE_DIRECT, ONESWEEP_RADIX_BITS> OnesweepPolicy; + // ScanPolicy typedef AgentScanPolicy <512, 23, OffsetT, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; @@ -802,7 +940,7 @@ struct DeviceRadixSortPolicy /// MaxPolicy - typedef Policy700 MaxPolicy; + typedef Policy800 MaxPolicy; }; @@ -968,7 +1106,7 @@ struct DispatchRadixSort : const ValueT *d_values_in, ValueT *d_values_out, OffsetT *d_spine, - int spine_length, + int /*spine_length*/, int ¤t_bit, PassConfigT &pass_config) { @@ -983,6 +1121,9 @@ struct DispatchRadixSort : pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + // Spine length written by the upsweep kernel in the current pass. 
+ int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits; + // Invoke upsweep_kernel with same grid size as downsweep_kernel thrust::cuda_cub::launcher::triple_chevron( pass_config.even_share.grid_size, @@ -1010,7 +1151,7 @@ struct DispatchRadixSort : 1, pass_config.scan_config.block_threads, 0, stream ).doit(pass_config.scan_kernel, d_spine, - spine_length); + pass_spine_length); // Check for failure to launch if (CubDebug(error = cudaPeekAtLastError())) break; @@ -1084,7 +1225,7 @@ struct DispatchRadixSort : DownsweepKernelT downsweep_kernel, int ptx_version, int sm_count, - int num_items) + OffsetT num_items) { cudaError error = cudaSuccess; do @@ -1113,6 +1254,137 @@ struct DispatchRadixSort : }; + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeOnesweep() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef OffsetT AtomicOffsetT; + + // compute temporary storage size + const int RADIX_BITS = ActivePolicyT::ONESWEEP_RADIX_BITS; + const int RADIX_DIGITS = 1 << RADIX_BITS; + const int ONESWEEP_ITEMS_PER_THREAD = ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD; + const int ONESWEEP_BLOCK_THREADS = ActivePolicyT::OnesweepPolicy::BLOCK_THREADS; + const int ONESWEEP_TILE_ITEMS = ONESWEEP_ITEMS_PER_THREAD * ONESWEEP_BLOCK_THREADS; + // parts handle inputs with >=2**30 elements, due to the way lookback works + // for testing purposes, one part is <= 2**28 elements + const int PART_SIZE = ((1 << 28) - 1) / ONESWEEP_TILE_ITEMS * ONESWEEP_TILE_ITEMS; + int num_passes = cub::DivideAndRoundUp(end_bit - begin_bit, RADIX_BITS); + int num_parts = static_cast(cub::DivideAndRoundUp(num_items, PART_SIZE)); + OffsetT max_num_blocks = cub::DivideAndRoundUp(CUB_MIN(num_items, PART_SIZE), ONESWEEP_TILE_ITEMS); + + size_t value_size = KEYS_ONLY ? 0 : sizeof(ValueT); + size_t allocation_sizes[] = + { + // bins + num_parts * num_passes * RADIX_DIGITS * sizeof(OffsetT), + // lookback + max_num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), + // extra key buffer + is_overwrite_okay || num_passes <= 1 ? 0 : num_items * sizeof(KeyT), + // extra value buffer + is_overwrite_okay || num_passes <= 1 ? 
0 : num_items * value_size, + // counters + num_parts * num_passes * sizeof(AtomicOffsetT), + }; + const int NUM_ALLOCATIONS = sizeof(allocation_sizes) / sizeof(allocation_sizes[0]); + void* allocations[NUM_ALLOCATIONS] = {}; + AliasTemporaries(d_temp_storage, temp_storage_bytes, + allocations, allocation_sizes); + + // just return if no temporary storage is provided + cudaError_t error = cudaSuccess; + if (d_temp_storage == NULL) return error; + + OffsetT* d_bins = (OffsetT*)allocations[0]; + AtomicOffsetT* d_lookback = (AtomicOffsetT*)allocations[1]; + KeyT* d_keys_tmp2 = (KeyT*)allocations[2]; + ValueT* d_values_tmp2 = (ValueT*)allocations[3]; + AtomicOffsetT* d_ctrs = (AtomicOffsetT*)allocations[4]; + + do { + // initialization + if (CubDebug(error = cudaMemsetAsync( + d_ctrs, 0, num_parts * num_passes * sizeof(AtomicOffsetT), stream))) break; + + // compute num_passes histograms with RADIX_DIGITS bins each + if (CubDebug(error = cudaMemsetAsync + (d_bins, 0, num_passes * RADIX_DIGITS * sizeof(OffsetT), stream))) break; + int device = -1; + int num_sms = 0; + if (CubDebug(error = cudaGetDevice(&device))) break; + if (CubDebug(error = cudaDeviceGetAttribute( + &num_sms, cudaDevAttrMultiProcessorCount, device))) break; + + const int HISTO_BLOCK_THREADS = ActivePolicyT::HistogramPolicy::BLOCK_THREADS; + int histo_blocks_per_sm = 1; + auto histogram_kernel = DeviceRadixSortHistogramKernel< + MaxPolicyT, IS_DESCENDING, KeyT, OffsetT>; + if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break; + histogram_kernel<<>> + (d_bins, d_keys.Current(), num_items, begin_bit, end_bit); + if (CubDebug(error = cudaPeekAtLastError())) break; + + // exclusive sums to determine starts + const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; + DeviceRadixSortExclusiveSumKernel + <<>>(d_bins); + if (CubDebug(error = cudaPeekAtLastError())) break; + + // use the other buffer if no overwrite is allowed + KeyT* d_keys_tmp = d_keys.Alternate(); + ValueT* d_values_tmp = d_values.Alternate(); + if (!is_overwrite_okay && num_passes % 2 == 0) + { + d_keys.d_buffers[1] = d_keys_tmp2; + d_values.d_buffers[1] = d_values_tmp2; + } + + for (int current_bit = begin_bit, pass = 0; current_bit < end_bit; + current_bit += RADIX_BITS, ++pass) + { + int num_bits = CUB_MIN(end_bit - current_bit, RADIX_BITS); + for (int part = 0; part < num_parts; ++part) + { + int part_num_items = CUB_MIN(num_items - part * PART_SIZE, PART_SIZE); + int num_blocks = cub::DivideAndRoundUp(part_num_items, ONESWEEP_TILE_ITEMS); + if (CubDebug(error = cudaMemsetAsync( + d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), + stream))) break; + auto onesweep_kernel = DeviceRadixSortOnesweepKernel< + MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>; + onesweep_kernel<<>> + (d_lookback, d_ctrs + part * num_passes + pass, + part < num_parts - 1 ? + d_bins + ((part + 1) * num_passes + pass) * RADIX_DIGITS : NULL, + d_bins + (part * num_passes + pass) * RADIX_DIGITS, + d_keys.Alternate(), + d_keys.Current() + part * PART_SIZE, + d_values.Alternate(), + d_values.Current() + part * PART_SIZE, + part_num_items, current_bit, num_bits); + if (CubDebug(error = cudaPeekAtLastError())) break; + } + + // use the temporary buffers if no overwrite is allowed + if (!is_overwrite_okay && pass == 0) + { + d_keys = num_passes % 2 == 0 ? 
+ DoubleBuffer(d_keys_tmp, d_keys_tmp2) : + DoubleBuffer(d_keys_tmp2, d_keys_tmp); + d_values = num_passes % 2 == 0 ? + DoubleBuffer(d_values_tmp, d_values_tmp2) : + DoubleBuffer(d_values_tmp2, d_values_tmp); + } + d_keys.selector ^= 1; + d_values.selector ^= 1; + } + } while (0); + + return error; + } /// Invocation (run multiple digit passes) template < @@ -1152,17 +1424,33 @@ struct DispatchRadixSort : // Init regular and alternate-digit kernel configurations PassConfig pass_config, alt_pass_config; - if ((error = pass_config.template InitPassConfig< - typename ActivePolicyT::UpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::DownsweepPolicy>( - upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; - - if ((error = alt_pass_config.template InitPassConfig< - typename ActivePolicyT::AltUpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::AltDownsweepPolicy>( - alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>(upsweep_kernel, + scan_kernel, + downsweep_kernel, + ptx_version, + sm_count, + num_items); + if (error) + { + break; + } + + error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>(alt_upsweep_kernel, + scan_kernel, + alt_downsweep_kernel, + ptx_version, + sm_count, + num_items); + if (error) + { + break; + } // Get maximum spine length int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); @@ -1186,7 +1474,7 @@ struct DispatchRadixSort : // Pass planning. 
Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; + int num_passes = cub::DivideAndRoundUp(num_bits, pass_config.radix_bits); bool is_num_passes_odd = num_passes & 1; int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); @@ -1244,6 +1532,28 @@ struct DispatchRadixSort : // Chained policy invocation //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeManyTiles(Int2Type) + { + // Invoke upsweep-downsweep + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeManyTiles(Int2Type) + { + // Invoke onesweep + return InvokeOnesweep(); + } + /// Invocation template CUB_RUNTIME_FUNCTION __forceinline__ @@ -1262,12 +1572,7 @@ struct DispatchRadixSort : else { // Regular size - return InvokePasses( - DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, - DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, - RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + return InvokeManyTiles(Int2Type()); } } @@ -1330,7 +1635,8 @@ template < bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low typename KeyT, ///< Key type typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator + typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator typename OffsetT, ///< Signed integer type for global offsets typename SelectedPolicy = DeviceRadixSortPolicy > struct DispatchSegmentedRadixSort : @@ -1357,8 +1663,8 @@ struct DispatchSegmentedRadixSort : DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values OffsetT num_items; ///< [in] Number of items to sort OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. @@ -1380,8 +1686,8 @@ struct DispatchSegmentedRadixSort : DoubleBuffer &d_values, OffsetT num_items, OffsetT num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, int begin_bit, int end_bit, bool is_overwrite_okay, @@ -1601,8 +1907,8 @@ struct DispatchSegmentedRadixSort : // Force kernel code-generation in all compiler passes return InvokePasses( - DeviceSegmentedRadixSortKernel, - DeviceSegmentedRadixSortKernel); + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); } @@ -1620,8 +1926,8 @@ struct DispatchSegmentedRadixSort : DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values int num_items, ///< [in] Number of items to sort int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers @@ -1658,3 +1964,7 @@ struct DispatchSegmentedRadixSort : CUB_NS_POSTFIX // Optional outer namespace(s) +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + diff --git a/src/3rdparty/cub/grid/grid_even_share.cuh b/src/3rdparty/cub/grid/grid_even_share.cuh index d5f8b34..badbfd6 100644 --- a/src/3rdparty/cub/grid/grid_even_share.cuh +++ b/src/3rdparty/cub/grid/grid_even_share.cuh @@ -37,6 +37,7 @@ #include "../config.cuh" #include "../util_namespace.cuh" #include "../util_macro.cuh" +#include "../util_math.cuh" #include "../util_type.cuh" #include "grid_mapping.cuh" @@ -77,7 +78,7 @@ struct GridEvenShare { private: - OffsetT total_tiles; + int total_tiles; int big_shares; OffsetT big_share_items; OffsetT normal_share_items; @@ -122,17 +123,18 @@ public: * \brief Dispatch initializer. To be called prior prior to kernel launch. */ __host__ __device__ __forceinline__ void DispatchInit( - OffsetT num_items, ///< Total number of input items + OffsetT num_items_, ///< Total number of input items int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) int tile_items) ///< Number of data items per input tile { - this->block_offset = num_items; // Initialize past-the-end - this->block_end = num_items; // Initialize past-the-end - this->num_items = num_items; - this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->block_offset = num_items_; // Initialize past-the-end + this->block_end = num_items_; // Initialize past-the-end + this->num_items = num_items_; + this->total_tiles = static_cast(cub::DivideAndRoundUp(num_items_, tile_items)); this->grid_size = CUB_MIN(total_tiles, max_grid_size); - OffsetT avg_tiles_per_block = total_tiles / grid_size; - this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + int avg_tiles_per_block = total_tiles / grid_size; + // leftover grains go to big blocks: + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); this->normal_share_items = avg_tiles_per_block * tile_items; this->normal_base_offset = big_shares * tile_items; this->big_share_items = normal_share_items + tile_items; diff --git a/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh b/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh index 5219e50..a3d8272 100644 --- a/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh +++ b/src/3rdparty/cub/iterator/cache_modified_input_iterator.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright @@ -12,7 +12,7 @@ * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
- * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -66,7 +66,7 @@ namespace cub { * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. * * \par Overview - * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * - CacheModifiedInputIterator is a random-access input iterator that wraps a native * device pointer of type ValueType*. \p ValueType references are * made by reading \p ValueType values through loads modified by \p MODIFIER. * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", @@ -76,7 +76,7 @@ namespace cub { * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * The code snippet below illustrates the use of \p CacheModifiedInputIterator to * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). * \par @@ -117,9 +117,9 @@ public: #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, + typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< + THRUST_NS_QUALIFIER::device_system_tag, + THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference >::type iterator_category; ///< The iterator category diff --git a/src/3rdparty/cub/util_arch.cuh b/src/3rdparty/cub/util_arch.cuh index 58d0c73..82acfef 100644 --- a/src/3rdparty/cub/util_arch.cuh +++ b/src/3rdparty/cub/util_arch.cuh @@ -45,9 +45,10 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \ - !defined(CUB_USE_COOPERATIVE_GROUPS) - #define CUB_USE_COOPERATIVE_GROUPS +#if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__) || \ + CUDA_VERSION >= 9000) && \ + !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS #endif /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are diff --git a/src/3rdparty/cub/util_compiler.cuh b/src/3rdparty/cub/util_compiler.cuh index 9be9492..6239d0e 100644 --- a/src/3rdparty/cub/util_compiler.cuh +++ b/src/3rdparty/cub/util_compiler.cuh @@ -63,7 +63,7 @@ #endif // CUB_HOST_COMPILER // figure out which device compiler we're using -#if defined(__CUDACC__) +#if defined(__CUDACC__) || defined(__NVCOMPILER_CUDA__) # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC diff --git a/src/3rdparty/cub/util_cpp_dialect.cuh b/src/3rdparty/cub/util_cpp_dialect.cuh index b4cbe92..23adf8e 100644 --- a/src/3rdparty/cub/util_cpp_dialect.cuh +++ b/src/3rdparty/cub/util_cpp_dialect.cuh @@ -108,27 +108,43 @@ # define CUB_COMP_DEPR_IMPL1 /* intentionally blank */ #endif -#define CUB_COMPILER_DEPRECATION(REQ, FIX) \ - CUB_COMP_DEPR_IMPL(CUB requires REQ. Please FIX. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) +#define CUB_COMPILER_DEPRECATION(REQ) \ + CUB_COMP_DEPR_IMPL(CUB requires at least REQ. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) 
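The CacheModifiedInputIterator documentation touched above refers to a usage snippet; a condensed version of that pattern, assuming a device array d_in has already been allocated and populated, looks roughly like this:

#include <cub/iterator/cache_modified_input_iterator.cuh>

// Wrap a native device pointer so that dereferences compile to LDG
// (read-only / texture-path) loads instead of ordinary global loads.
__device__ double Sum7(double *d_in)   // d_in: e.g. [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
{
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);

    double sum = 0.0;
    for (int i = 0; i < 7; ++i)
        sum += itr[i];                 // each read goes through the LDG path
    return sum;
}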
+ +#define CUB_COMPILER_DEPRECATION_SOFT(REQ, CUR) \ + CUB_COMP_DEPR_IMPL(CUB requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a future release. Define CUB_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.) -// Minimum required compiler checks: #ifndef CUB_IGNORE_DEPRECATED_COMPILER + +// Compiler checks: # if CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC && CUB_GCC_VERSION < 50000 - CUB_COMPILER_DEPRECATION(GCC 5.0, upgrade your compiler); + CUB_COMPILER_DEPRECATION(GCC 5.0); +# elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 70000 + CUB_COMPILER_DEPRECATION(Clang 7.0); +# elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 + // <2017. Hard upgrade message: + CUB_COMPILER_DEPRECATION(MSVC 2019 (19.20/16.0/14.20)); +# elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1920 + // >=2017, <2019. Soft deprecation message: + CUB_COMPILER_DEPRECATION_SOFT(MSVC 2019 (19.20/16.0/14.20), MSVC 2017); # endif -# if CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG && CUB_CLANG_VERSION < 60000 - CUB_COMPILER_DEPRECATION(Clang 6.0, upgrade your compiler); -# endif -# if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC && CUB_MSVC_VERSION < 1910 - CUB_COMPILER_DEPRECATION(MSVC 2017, upgrade your compiler); + +#endif // CUB_IGNORE_DEPRECATED_COMPILER + +#ifndef CUB_IGNORE_DEPRECATED_DIALECT + +// Dialect checks: +# if CUB_CPP_DIALECT < 2011 + // = 2014 +# define CUB_DEPRECATED [[deprecated]] +#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC # define CUB_DEPRECATED __declspec(deprecated) #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG # define CUB_DEPRECATED __attribute__((deprecated)) @@ -43,4 +52,3 @@ #else # define CUB_DEPRECATED #endif - diff --git a/src/3rdparty/cub/util_device.cuh b/src/3rdparty/cub/util_device.cuh index df0ee07..10da757 100644 --- a/src/3rdparty/cub/util_device.cuh +++ b/src/3rdparty/cub/util_device.cuh @@ -33,6 +33,8 @@ #pragma once +#include "detail/device_synchronize.cuh" + #include "util_type.cuh" #include "util_arch.cuh" #include "util_debug.cuh" @@ -121,7 +123,7 @@ __global__ void EmptyKernel(void) { } /** * \brief Returns the current device or -1 if an error occurred. */ -CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice() +CUB_RUNTIME_FUNCTION inline int CurrentDevice() { #if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. @@ -147,14 +149,14 @@ private: int const old_device; bool const needs_reset; public: - __host__ __forceinline__ SwitchDevice(int new_device) + __host__ inline SwitchDevice(int new_device) : old_device(CurrentDevice()), needs_reset(old_device != new_device) { if (needs_reset) CubDebug(cudaSetDevice(new_device)); } - __host__ __forceinline__ ~SwitchDevice() + __host__ inline ~SwitchDevice() { if (needs_reset) CubDebug(cudaSetDevice(old_device)); @@ -165,7 +167,7 @@ public: * \brief Returns the number of CUDA devices available or -1 if an error * occurred. */ -CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCountUncached() +CUB_RUNTIME_FUNCTION inline int DeviceCountUncached() { #if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. @@ -198,11 +200,22 @@ struct ValueCache * \brief Call the nullary function to produce the value and construct the * cache. 
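SwitchDevice, used by the per-device helpers above, is a small RAII guard: it sets the requested device on construction and restores the previous one on scope exit. A self-contained mirror of that pattern is sketched below; the name ScopedDevice and the usage comment are assumptions, and the real class additionally routes errors through CubDebug.

#include <cuda_runtime.h>

struct ScopedDevice
{
    int  old_device  = -1;
    bool needs_reset = false;

    explicit ScopedDevice(int new_device)
    {
        cudaGetDevice(&old_device);            // what CurrentDevice() wraps
        needs_reset = (old_device != new_device);
        if (needs_reset)
            cudaSetDevice(new_device);
    }

    ~ScopedDevice()
    {
        if (needs_reset)
            cudaSetDevice(old_device);         // restore the caller's device on scope exit
    }
};

// Usage: temporarily operate on device 1, then fall back automatically.
// {
//     ScopedDevice guard(1);
//     cudaMemGetInfo(&free_bytes, &total_bytes);
// }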
*/ - __host__ __forceinline__ ValueCache() : value(Function()) {} + __host__ inline ValueCache() : value(Function()) {} }; #endif +#if CUB_CPP_DIALECT >= 2011 +// Host code, only safely usable in C++11 or newer, where thread-safe +// initialization of static locals is guaranteed. This is a separate function +// to avoid defining a local static in a host/device function. +__host__ inline int DeviceCountCachedValue() +{ + static ValueCache cache; + return cache.value; +} +#endif + /** * \brief Returns the number of CUDA devices available. * @@ -210,17 +223,14 @@ struct ValueCache * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCount() +CUB_RUNTIME_FUNCTION inline int DeviceCount() { int result = -1; if (CUB_IS_HOST_CODE) { #if CUB_INCLUDE_HOST_CODE #if CUB_CPP_DIALECT >= 2011 // Host code and C++11. - // C++11 guarantees that initialization of static locals is thread safe. - static ValueCache cache; - - result = cache.value; + result = DeviceCountCachedValue(); #else // Host code and C++98. result = DeviceCountUncached(); @@ -273,7 +283,7 @@ public: /** * \brief Construct the cache. */ - __host__ __forceinline__ PerDeviceAttributeCache() : entries_() + __host__ inline PerDeviceAttributeCache() : entries_() { assert(DeviceCount() <= CUB_MAX_DEVICES); } @@ -312,7 +322,8 @@ public: // We don't use `CubDebug` here because we let the user code // decide whether or not errors are hard errors. - if (payload.error = std::forward(f)(payload.attribute)) + payload.error = std::forward(f)(payload.attribute); + if (payload.error) // Clear the global CUDA error state which may have been // set by the last call. Otherwise, errors may "leak" to // unrelated kernel launches. @@ -350,7 +361,7 @@ public: /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10). */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version) +CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersionUncached(int& ptx_version) { // Instantiate `EmptyKernel` in both host and device code to ensure // it can be called. @@ -390,15 +401,16 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_ver /** * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10). */ -__host__ __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version, int device) +__host__ inline cudaError_t PtxVersionUncached(int& ptx_version, int device) { SwitchDevice sd(device); + (void)sd; return PtxVersionUncached(ptx_version); } #if CUB_CPP_DIALECT >= 2011 // C++11 and later. template -__host__ __forceinline__ PerDeviceAttributeCache& GetPerDeviceAttributeCache() +__host__ inline PerDeviceAttributeCache& GetPerDeviceAttributeCache() { // C++11 guarantees that initialization of static locals is thread safe. static PerDeviceAttributeCache cache; @@ -416,7 +428,7 @@ struct SmVersionCacheTag {}; * * \note This function is thread safe. */ -__host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device) +__host__ inline cudaError_t PtxVersion(int& ptx_version, int device) { #if CUB_CPP_DIALECT >= 2011 // C++11 and later. @@ -445,7 +457,7 @@ __host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device) * * \note This function is thread safe. 
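DeviceCountCachedValue, added above, leans on the C++11 guarantee that a function-local static is initialized exactly once even under concurrent callers, which is why the static was hoisted out of the host/device DeviceCount into a host-only helper. A stand-alone illustration of the same pattern (CachedDeviceCount is an assumed name, not CUB API):

#include <cuda_runtime.h>

inline int CachedDeviceCount()
{
    // The lambda runs exactly once, on the first call from any host thread;
    // subsequent calls simply read the already-initialized value.
    static const int count = [] {
        int n = 0;
        return (cudaGetDeviceCount(&n) == cudaSuccess) ? n : -1;
    }();
    return count;
}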
*/ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version) +CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version) { cudaError_t result = cudaErrorUnknown; if (CUB_IS_HOST_CODE) { @@ -481,7 +493,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version) /** * \brief Retrieves the SM version of \p device (major * 100 + minor * 10) */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) +CUB_RUNTIME_FUNCTION inline cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice()) { #if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime. @@ -515,7 +527,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_versi * * \note This function is thread safe. */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) +CUB_RUNTIME_FUNCTION inline cudaError_t SmVersion(int& sm_version, int device = CurrentDevice()) { cudaError_t result = cudaErrorUnknown; if (CUB_IS_HOST_CODE) { @@ -548,7 +560,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int /** * Synchronize the specified \p stream. */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) +CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream(cudaStream_t stream) { cudaError_t result = cudaErrorUnknown; if (CUB_IS_HOST_CODE) { @@ -560,7 +572,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime. (void)stream; // Device can't yet sync on a specific stream - result = CubDebug(cudaDeviceSynchronize()); + result = CubDebug(cub::detail::device_synchronize()); #else // Device code without the CUDA runtime. (void)stream; // CUDA API calls are not supported from this device. @@ -604,7 +616,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream) * */ template -CUB_RUNTIME_FUNCTION __forceinline__ +CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy( int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy diff --git a/src/3rdparty/cub/util_macro.cuh b/src/3rdparty/cub/util_macro.cuh index ff86365..697944f 100644 --- a/src/3rdparty/cub/util_macro.cuh +++ b/src/3rdparty/cub/util_macro.cuh @@ -34,6 +34,8 @@ #include "util_namespace.cuh" +#include + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -56,6 +58,24 @@ namespace cub { #endif #endif +#define CUB_PREVENT_MACRO_SUBSTITUTION + +template +constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, + U &&u) + -> decltype(t < u ? std::forward(t) : std::forward(u)) +{ + return t < u ? std::forward(t) : std::forward(u); +} + +template +constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, + U &&u) + -> decltype(t < u ? std::forward(u) : std::forward(t)) +{ + return t < u ? std::forward(u) : std::forward(t); +} + #ifndef CUB_MAX /// Select maximum(a, b) #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) diff --git a/src/3rdparty/cub/util_namespace.cuh b/src/3rdparty/cub/util_namespace.cuh index 4488d97..2a1bb38 100644 --- a/src/3rdparty/cub/util_namespace.cuh +++ b/src/3rdparty/cub/util_namespace.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. 
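In the util_macro.cuh hunk above, CUB_PREVENT_MACRO_SUBSTITUTION expands to nothing; its only job is to sit between the function name and the parameter list so that a function-like min/max macro (for example from <windows.h>) cannot fire at the definition. Callers can sidestep the same macros by parenthesizing the name, as in this small sketch (the include path assumes a conventional CUB install):

#include <cub/util_macro.cuh>

int SmallerOfTwo()
{
    // Parenthesizing the name suppresses function-like macro expansion,
    // mirroring the trick used in the definitions above.
    return (cub::min)(3, 7);   // -> 3; likewise (cub::max)(3, 7) -> 7
}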
All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -27,27 +27,110 @@ ******************************************************************************/ /** - * \file - * Place-holder for prefixing the cub namespace + * \file util_namespace.cuh + * \brief Utilities that allow `cub::` to be placed inside an + * application-specific namespace. */ + #pragma once +// This is not used by this file; this is a hack so that we can detect the +// CUB version from Thrust on older versions of CUB that did not have +// version.cuh. #include "version.cuh" -// For example: -//#define CUB_NS_PREFIX namespace thrust{ namespace detail { -//#define CUB_NS_POSTFIX } } +// Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users +// that they must now define the qualifier macro, too. +#if (defined(CUB_NS_PREFIX) || defined(CUB_NS_POSTFIX)) && !defined(CUB_NS_QUALIFIER) +#error CUB requires a definition of CUB_NS_QUALIFIER when CUB_NS_PREFIX/POSTFIX are defined. +#endif + +/** + * \def THRUST_CUB_WRAPPED_NAMESPACE + * If defined, this value will be used as the name of a namespace that wraps the + * `thrust::` and `cub::` namespaces. + * This macro should not be used with any other CUB namespace macros. + */ +#ifdef THRUST_CUB_WRAPPED_NAMESPACE +#define CUB_WRAPPED_NAMESPACE THRUST_CUB_WRAPPED_NAMESPACE +#endif + +/** + * \def CUB_WRAPPED_NAMESPACE + * If defined, this value will be used as the name of a namespace that wraps the + * `cub::` namespace. + * If THRUST_CUB_WRAPPED_NAMESPACE is set, this will inherit that macro's value. + * This macro should not be used with any other CUB namespace macros. + */ +#ifdef CUB_WRAPPED_NAMESPACE +#define CUB_NS_PREFIX \ + namespace CUB_WRAPPED_NAMESPACE \ + { +#define CUB_NS_POSTFIX } + +#define CUB_NS_QUALIFIER ::CUB_WRAPPED_NAMESPACE::cub +#endif + +/** + * \def CUB_NS_PREFIX + * This macro is inserted prior to all `namespace cub { ... }` blocks. It is + * derived from CUB_WRAPPED_NAMESPACE, if set, and will be empty otherwise. + * It may be defined by users, in which case CUB_NS_PREFIX, + * CUB_NS_POSTFIX, and CUB_NS_QUALIFIER must all be set consistently. + */ #ifndef CUB_NS_PREFIX #define CUB_NS_PREFIX #endif +/** + * \def CUB_NS_POSTFIX + * This macro is inserted following the closing braces of all + * `namespace cub { ... }` block. It is defined appropriately when + * CUB_WRAPPED_NAMESPACE is set, and will be empty otherwise. It may be + * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and + * CUB_NS_QUALIFIER must all be set consistently. + */ #ifndef CUB_NS_POSTFIX #define CUB_NS_POSTFIX #endif +/** + * \def CUB_NS_QUALIFIER + * This macro is used to qualify members of cub:: when accessing them from + * outside of their namespace. By default, this is just `::cub`, and will be + * set appropriately when CUB_WRAPPED_NAMESPACE is defined. This macro may be + * defined by users, in which case CUB_NS_PREFIX, CUB_NS_POSTFIX, and + * CUB_NS_QUALIFIER must all be set consistently. + */ +#ifndef CUB_NS_QUALIFIER +#define CUB_NS_QUALIFIER ::cub +#endif + +/** + * \def CUB_NAMESPACE_BEGIN + * This macro is used to open a `cub::` namespace block, along with any + * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. + * This macro is defined by CUB and may not be overridden. 
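How the wrapped-namespace machinery documented above is meant to be consumed, sketched with an assumed wrapper name myproject (every CUB symbol then lives under myproject::cub and is reached through CUB_NS_QUALIFIER):

// Define the wrapper before any CUB header is included
// (or pass -DCUB_WRAPPED_NAMESPACE=myproject on the compile line).
#define CUB_WRAPPED_NAMESPACE myproject
#include <cub/cub.cuh>

// CUB_NS_PREFIX/CUB_NS_POSTFIX now open and close `namespace myproject { ... }`
// around every CUB namespace block, and CUB_NS_QUALIFIER is ::myproject::cub.
CUB_NS_QUALIFIER::DoubleBuffer<int> keys;   // i.e. ::myproject::cub::DoubleBuffer<int>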
+ */ +#define CUB_NAMESPACE_BEGIN \ + CUB_NS_PREFIX \ + namespace cub \ + { + +/** + * \def CUB_NAMESPACE_END + * This macro is used to close a `cub::` namespace block, along with any + * enclosing namespaces requested by CUB_WRAPPED_NAMESPACE, etc. + * This macro is defined by CUB and may not be overridden. + */ +#define CUB_NAMESPACE_END \ + } /* end namespace cub */ \ + CUB_NS_POSTFIX + // Declare these namespaces here for the purpose of Doxygenating them +CUB_NS_PREFIX /*! \namespace cub * \brief \p cub is the top-level namespace which contains all CUB @@ -55,5 +138,6 @@ */ namespace cub { - } + +CUB_NS_POSTFIX diff --git a/src/3rdparty/cub/util_ptx.cuh b/src/3rdparty/cub/util_ptx.cuh index 3f20c11..7b3ce7a 100644 --- a/src/3rdparty/cub/util_ptx.cuh +++ b/src/3rdparty/cub/util_ptx.cuh @@ -243,6 +243,15 @@ __device__ __forceinline__ int CTA_SYNC_AND(int p) } +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_OR(int p) +{ + return __syncthreads_or(p); +} + + /** * Warp barrier */ @@ -292,6 +301,7 @@ __device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_m #endif } + /** * Warp synchronous shfl_up */ @@ -340,6 +350,19 @@ unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned return word; } +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __shfl_sync(member_mask, word, src_lane); +#else + return __shfl(word, src_lane); +#endif +} + /** * Floating point multiply. (Mantissa LSB rounds towards zero.) */ @@ -713,22 +736,5 @@ inline __device__ unsigned int MatchAny(unsigned int label) } - - - - - - - - - - - - - - - - - } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/src/3rdparty/cub/util_type.cuh b/src/3rdparty/cub/util_type.cuh index 0ba41e1..c380510 100644 --- a/src/3rdparty/cub/util_type.cuh +++ b/src/3rdparty/cub/util_type.cuh @@ -37,9 +37,12 @@ #include #include -#if (__CUDACC_VER_MAJOR__ >= 9) +#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ #include #endif +#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ + #include +#endif #include "util_macro.cuh" #include "util_arch.cuh" @@ -62,7 +65,7 @@ namespace cub { /****************************************************************************** - * Type equality + * Conditional types ******************************************************************************/ /** @@ -88,7 +91,7 @@ struct If /****************************************************************************** - * Conditional types + * Type equality ******************************************************************************/ /** @@ -358,7 +361,7 @@ struct UnitWord { enum { UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, - IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (int(ALIGN_BYTES) % int(UNIT_ALIGN_BYTES) == 0) }; }; @@ -1063,7 +1066,7 @@ struct FpLimits }; -#if (__CUDACC_VER_MAJOR__ >= 9) +#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ template <> struct FpLimits<__half> { @@ -1079,6 +1082,21 @@ struct FpLimits<__half> }; #endif +#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ +template <> +struct FpLimits<__nv_bfloat16> +{ + static __host__ __device__ __forceinline__ __nv_bfloat16 Max() { + unsigned 
short max_word = 0x7F7F; + return reinterpret_cast<__nv_bfloat16&>(max_word); + } + + static __host__ __device__ __forceinline__ __nv_bfloat16 Lowest() { + unsigned short lowest_word = 0xFF7F; + return reinterpret_cast<__nv_bfloat16&>(lowest_word); + } +}; +#endif /** * Basic type traits (fp primitive specialization) @@ -1143,9 +1161,12 @@ template <> struct NumericTraits : BaseTraits struct NumericTraits : BaseTraits {}; template <> struct NumericTraits : BaseTraits {}; -#if (__CUDACC_VER_MAJOR__ >= 9) +#if (__CUDACC_VER_MAJOR__ >= 9 || CUDA_VERSION >= 9000) && !__NVCOMPILER_CUDA__ template <> struct NumericTraits<__half> : BaseTraits {}; #endif +#if (__CUDACC_VER_MAJOR__ >= 11 || CUDA_VERSION >= 11000) && !__NVCOMPILER_CUDA__ + template <> struct NumericTraits<__nv_bfloat16> : BaseTraits {}; +#endif template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; diff --git a/src/3rdparty/cub/version.cuh b/src/3rdparty/cub/version.cuh index 122fb9a..0919222 100644 --- a/src/3rdparty/cub/version.cuh +++ b/src/3rdparty/cub/version.cuh @@ -43,7 +43,7 @@ * CUB_VERSION / 100 % 1000 is the minor version. * CUB_VERSION / 100000 is the major version. */ -#define CUB_VERSION 101000 +#define CUB_VERSION 101500 /*! \def CUB_MAJOR_VERSION * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the