diff --git a/cub/cub/detail/launcher/cuda_runtime.cuh b/cub/cub/detail/launcher/cuda_runtime.cuh
index 650a7dea68f..ec00f1aea3d 100644
--- a/cub/cub/detail/launcher/cuda_runtime.cuh
+++ b/cub/cub/detail/launcher/cuda_runtime.cuh
@@ -21,10 +21,10 @@ namespace detail
 
 struct TripleChevronFactory
 {
-  CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron
+  CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron
   operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const
   {
-    return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream);
+    return THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid, block, shared_mem, stream);
   }
 
   CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version)
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 9e05f86dd43..cec5986b44f 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -221,7 +221,7 @@ struct DispatchAdjacentDifference
                 reinterpret_cast<long long>(stream));
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_block_size, 0, stream)
           .doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
                 d_input,
                 first_tile_previous,
@@ -251,7 +251,7 @@ struct DispatchAdjacentDifference
               reinterpret_cast<long long>(stream));
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
         .doit(DeviceAdjacentDifferenceDifferenceKernel<
                 typename PolicyHub::MaxPolicy,
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index df2b0963828..daeae156d2a 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -548,7 +548,7 @@ struct DispatchBatchMemcpy
 #endif
 
     // Invoke init_kernel to initialize buffer prefix sum-tile descriptors
-    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+    error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
               .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles);
 
     // Check for failure to launch
@@ -578,7 +578,7 @@ struct DispatchBatchMemcpy
     // Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked
     // up by next kernel
     error =
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         batch_memcpy_grid_size, ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS, 0, stream)
         .doit(batch_memcpy_non_blev_kernel,
               input_buffer_it,
@@ -615,7 +615,7 @@ struct DispatchBatchMemcpy
 #endif
 
     error =
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream)
         .doit(multi_block_memcpy_kernel,
               d_blev_src_buffers,
               d_blev_dst_buffers,
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 7ba478e3c00..4c309140c3a 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -110,7 +110,7 @@ struct dispatch_t
             static_cast<int>(items_per_thread));
 #endif
 
-    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+    error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
               static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
               .doit(detail::for_each::dynamic_kernel<max_policy_t, OffsetT, OpT>, num_items, op);
     error = CubDebug(error);
@@ -153,7 +153,7 @@ struct dispatch_t
             static_cast<int>(items_per_thread));
 #endif
 
-    error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+    error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
               static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
               .doit(detail::for_each::static_kernel<typename PolicyHubT::MaxPolicy, OffsetT, OpT>, num_items, op);
     error = CubDebug(error);
diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
index 6e346316d48..65412e54e8b 100644
--- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
+++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
@@ -125,7 +125,7 @@ public:
             items_per_thread);
 #  endif
     auto status =
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream)
         .doit(detail::for_each_in_extents::
                 static_kernel<max_policy_t, OpType, ExtentsType, decltype(sub_sizes_div_array), Ranks...>,
               _op,
@@ -162,7 +162,7 @@ public:
             _stream,
             items_per_thread);
 #  endif
-    status = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream)
+    status = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream)
                .doit(kernel, _op, _ext, sub_sizes_div_array, extents_div_array);
     _CUB_RETURN_IF_ERROR(status)
     _CUB_RETURN_IF_STREAM_ERROR(_stream)
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index d4bc8306eca..151fb08f3ab 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -442,7 +442,7 @@ struct dispatch_histogram
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke histogram_init_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         histogram_init_grid_dims, histogram_init_block_threads, 0, stream)
         .doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue);
 
@@ -466,7 +466,7 @@ struct dispatch_histogram
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke histogram_sweep_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
         .doit(histogram_sweep_kernel,
               d_samples,
               num_output_bins_wrapper,
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index fa467cdd5fe..e083fe36d00 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -220,7 +220,7 @@ struct dispatch_t
       const int partition_grid_size = static_cast<int>(::cuda::ceil_div(num_partitions, threads_per_partition_block));
 
       auto error = CubDebug(
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
           partition_grid_size, threads_per_partition_block, 0, stream)
           .doit(device_partition_merge_path_kernel<
                   max_policy_t,
@@ -255,7 +255,7 @@ struct dispatch_t
     {
       auto vshmem_ptr = vsmem_t{allocations[1]};
       auto error      = CubDebug(
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
           static_cast<int>(num_tiles), static_cast<int>(agent_t::policy::BLOCK_THREADS), 0, stream)
           .doit(
             device_merge_kernel<max_policy_t, KeyIt1, ValueIt1, KeyIt2, ValueIt2, KeyIt3, ValueIt3, Offset, CompareOp>,
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index 8a0e31e9bbd..028f4fd9df1 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -513,7 +513,7 @@ struct DispatchMergeSort
       auto items_buffer     = static_cast<ValueT*>(allocations[2]);
 
       // Invoke DeviceMergeSortBlockSortKernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         static_cast<int>(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true)
         .doit(
           DeviceMergeSortBlockSortKernel<
@@ -572,7 +572,7 @@ struct DispatchMergeSort
         const OffsetT target_merged_tiles_number = OffsetT(2) << pass;
 
         // Partition
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
           partition_grid_size, threads_per_partition_block, 0, stream, true)
           .doit(DeviceMergeSortPartitionKernel<KeyIteratorT, OffsetT, CompareOpT, KeyT>,
                 ping,
@@ -599,7 +599,7 @@ struct DispatchMergeSort
         }
 
         // Merge
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
           static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true)
           .doit(
             DeviceMergeSortMergeKernel<typename PolicyHub::MaxPolicy,
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index 6cf5261ab46..1ed25d21796 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1008,8 +1008,7 @@ struct DispatchRadixSort
 #endif
 
       // Invoke upsweep_kernel with same grid size as downsweep_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-        1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
         .doit(single_tile_kernel,
               d_keys.Current(),
               d_keys.Alternate(),
@@ -1082,7 +1081,7 @@ struct DispatchRadixSort
       int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits;
 
       // Invoke upsweep_kernel with same grid size as downsweep_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream)
         .doit(pass_config.upsweep_kernel,
               d_keys_in,
@@ -1117,7 +1116,7 @@ struct DispatchRadixSort
 #endif
 
       // Invoke scan_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream)
         .doit(pass_config.scan_kernel, d_spine, pass_spine_length);
 
       // Check for failure to launch
@@ -1145,7 +1144,7 @@ struct DispatchRadixSort
 #endif
 
       // Invoke downsweep_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream)
         .doit(pass_config.downsweep_kernel,
               d_keys_in,
@@ -1346,7 +1345,7 @@ struct DispatchRadixSort
               ActivePolicyT::HistogramPolicy::RADIX_BITS);
 #endif
 
-      error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
                 histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream)
                 .doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer);
       error = CubDebug(error);
@@ -1373,7 +1372,7 @@ struct DispatchRadixSort
               ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS);
 #endif
 
-      error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream)
+      error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream)
                 .doit(DeviceRadixSortExclusiveSumKernel<max_policy_t, OffsetT>, d_bins);
       error = CubDebug(error);
       if (cudaSuccess != error)
@@ -1437,7 +1436,7 @@ struct DispatchRadixSort
             DecomposerT>;
 
           error =
-            THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
+            THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
               .doit(onesweep_kernel,
                     d_lookback,
                     d_ctrs + portion * num_passes + pass,
@@ -2096,7 +2095,7 @@ struct DispatchSegmentedRadixSort
               pass_bits);
 #endif
 
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         num_segments, pass_config.segmented_config.block_threads, 0, stream)
         .doit(pass_config.segmented_kernel,
               d_keys_in,
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index d3e20785e57..170237d5646 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -969,7 +969,7 @@ struct DispatchSegmentedReduce
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke DeviceReduceKernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
         num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream)
         .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init);
 
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 76a39317212..5637f462c04 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -346,7 +346,7 @@ struct DispatchReduceByKey
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke init_kernel to initialize tile descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
         .doit(init_kernel, tile_state, num_tiles, d_num_runs_out);
 
       // Check for failure to launch
@@ -403,7 +403,7 @@ struct DispatchReduceByKey
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
         // Invoke reduce_by_key_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
           .doit(reduce_by_key_kernel,
                 d_keys_in,
                 d_unique_out,
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index 5d567d45778..f22417886b2 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -357,7 +357,7 @@ struct DeviceRleDispatch
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
         .doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out);
 
       // Check for failure to launch
@@ -418,7 +418,7 @@ struct DeviceRleDispatch
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke device_rle_sweep_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
         .doit(device_rle_sweep_kernel,
               d_in,
               d_offsets_out,
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index cba024791b7..5131c73ba22 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -430,7 +430,7 @@ struct DispatchScan
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke init_kernel to initialize tile descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
         .doit(init_kernel, tile_state, num_tiles);
 
       // Check for failure to launch
@@ -482,7 +482,7 @@ struct DispatchScan
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
         // Invoke scan_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
           .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items);
 
         // Check for failure to launch
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index d3f7e680a72..cfed41b44f7 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -442,7 +442,7 @@ struct DispatchScanByKey
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke init_kernel to initialize tile descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
         .doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, static_cast<OffsetT>(tile_size), num_tiles);
 
       // Check for failure to launch
@@ -483,7 +483,7 @@ struct DispatchScanByKey
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
         // Invoke scan_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
           .doit(scan_kernel,
                 d_keys_in,
                 d_keys_prev_in,
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index dc98438e669..dc63f68633b 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -553,8 +553,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont
             (long long) stream);
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
-    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
-      blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream)
+    THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream)
       .doit(large_kernel,
             large_and_medium_segments_indices,
             d_current_keys,
@@ -601,7 +600,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont
             (long long) stream);
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
-    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
+    THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
       small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream)
       .doit(small_kernel,
             small_segments,
@@ -1234,43 +1233,43 @@ private:
 
 #else // CUB_RDC_ENABLED
 
-#  define CUB_TEMP_DEVICE_CODE                                                 \
-    error =                                                                    \
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \
-        .doit(                                                                 \
-          DeviceSegmentedSortContinuationKernel<                               \
-            typename PolicyHub::MaxPolicy,                                     \
-            LargeKernelT,                                                      \
-            SmallKernelT,                                                      \
-            KeyT,                                                              \
-            ValueT,                                                            \
-            BeginOffsetIteratorT,                                              \
-            EndOffsetIteratorT>,                                               \
-          large_kernel,                                                        \
-          small_kernel,                                                        \
-          num_segments,                                                        \
-          d_keys.Current(),                                                    \
-          GetFinalOutput<KeyT>(LargeSegmentPolicyT::RADIX_BITS, d_keys),       \
-          d_keys_double_buffer,                                                \
-          d_values.Current(),                                                  \
-          GetFinalOutput<ValueT>(LargeSegmentPolicyT::RADIX_BITS, d_values),   \
-          d_values_double_buffer,                                              \
-          d_begin_offsets,                                                     \
-          d_end_offsets,                                                       \
-          group_sizes.get(),                                                   \
-          large_and_medium_segments_indices.get(),                             \
-          small_segments_indices.get());                                       \
-    error = CubDebug(error);                                                   \
-                                                                               \
-    if (cudaSuccess != error)                                                  \
-    {                                                                          \
-      return error;                                                            \
-    }                                                                          \
-                                                                               \
-    error = CubDebug(detail::DebugSyncStream(stream));                         \
-    if (cudaSuccess != error)                                                  \
-    {                                                                          \
-      return error;                                                            \
+#  define CUB_TEMP_DEVICE_CODE                                               \
+    error =                                                                  \
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, 1, 0, stream) \
+        .doit(                                                               \
+          DeviceSegmentedSortContinuationKernel<                             \
+            typename PolicyHub::MaxPolicy,                                   \
+            LargeKernelT,                                                    \
+            SmallKernelT,                                                    \
+            KeyT,                                                            \
+            ValueT,                                                          \
+            BeginOffsetIteratorT,                                            \
+            EndOffsetIteratorT>,                                             \
+          large_kernel,                                                      \
+          small_kernel,                                                      \
+          num_segments,                                                      \
+          d_keys.Current(),                                                  \
+          GetFinalOutput<KeyT>(LargeSegmentPolicyT::RADIX_BITS, d_keys),     \
+          d_keys_double_buffer,                                              \
+          d_values.Current(),                                                \
+          GetFinalOutput<ValueT>(LargeSegmentPolicyT::RADIX_BITS, d_values), \
+          d_values_double_buffer,                                            \
+          d_begin_offsets,                                                   \
+          d_end_offsets,                                                     \
+          group_sizes.get(),                                                 \
+          large_and_medium_segments_indices.get(),                           \
+          small_segments_indices.get());                                     \
+    error = CubDebug(error);                                                 \
+                                                                             \
+    if (cudaSuccess != error)                                                \
+    {                                                                        \
+      return error;                                                          \
+    }                                                                        \
+                                                                             \
+    error = CubDebug(detail::DebugSyncStream(stream));                       \
+    if (cudaSuccess != error)                                                \
+    {                                                                        \
+      return error;                                                          \
     }
 
 #endif // CUB_RDC_ENABLED
@@ -1348,7 +1347,7 @@ private:
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
     // Invoke fallback kernel
-    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
+    THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
       .doit(fallback_kernel,
             d_keys.Current(),
             GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys),
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 98093b01404..81a8fbe72ae 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -670,7 +670,7 @@ struct DispatchSelectIf
 #endif
 
         // Invoke scan_init_kernel to initialize tile descriptors
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
           .doit(scan_init_kernel, tile_status, current_num_tiles, d_num_selected_out);
 
         // Check for failure to launch
@@ -718,7 +718,7 @@ struct DispatchSelectIf
 #endif
 
         // Invoke select_if_kernel
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(current_num_tiles, block_threads, 0, stream)
           .doit(select_if_kernel,
                 d_in,
                 d_flags,
diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
index 90295f2c06f..519873eb7b7 100644
--- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
@@ -289,7 +289,7 @@ struct DispatchThreeWayPartitionIf
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke three_way_partition_init_kernel to initialize tile descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
         .doit(three_way_partition_init_kernel, tile_status, num_tiles, d_num_selected_out);
 
       // Check for failure to launch
@@ -346,7 +346,7 @@ struct DispatchThreeWayPartitionIf
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke select_if_kernel
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
         .doit(three_way_partition_kernel,
               d_in,
               d_first_part_out,
diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index ed2e104746e..24fdfbb5da7 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -663,7 +663,7 @@ struct dispatch_t<RequiresStableAddress,
   CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto configure_ublkcp_kernel()
     -> PoorExpected<
       ::cuda::std::
-        tuple<THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron, decltype(CUB_DETAIL_TRANSFORM_KERNEL_PTR), int>>
+        tuple<THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron, decltype(CUB_DETAIL_TRANSFORM_KERNEL_PTR), int>>
   {
     using policy_t          = typename ActivePolicy::algo_policy;
     constexpr int block_dim = policy_t::block_threads;
@@ -739,7 +739,7 @@ struct dispatch_t<RequiresStableAddress,
 
     const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{config->tile_size}));
     return ::cuda::std::make_tuple(
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, config->smem_size, stream),
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid_dim, block_dim, config->smem_size, stream),
       CUB_DETAIL_TRANSFORM_KERNEL_PTR,
       config->elem_per_thread);
   }
@@ -820,7 +820,7 @@ struct dispatch_t<RequiresStableAddress,
     const int tile_size = block_dim * items_per_thread_clamped;
     const auto grid_dim = static_cast<unsigned int>(::cuda::ceil_div(num_items, Offset{tile_size}));
     return CubDebug(
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid_dim, block_dim, 0, stream)
         .doit(
           CUB_DETAIL_TRANSFORM_KERNEL_PTR,
           num_items,
diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
index b0747b9b42a..5b78d4ae344 100644
--- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -430,7 +430,7 @@ struct DispatchUniqueByKey
 #endif // CUB_DETAIL_DEBUG_ENABLE_LOG
 
       // Invoke init_kernel to initialize tile descriptors
-      THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
+      THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
         .doit(init_kernel, tile_state, num_tiles, d_num_selected_out);
 
       // Check for failure to launch
@@ -494,7 +494,7 @@ struct DispatchUniqueByKey
 
       // Invoke select_if_kernel
       error =
-        THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
+        THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
           .doit(scan_kernel,
                 d_keys_in,
                 d_values_in,
diff --git a/cub/test/catch2_test_launch_wrapper.cu b/cub/test/catch2_test_launch_wrapper.cu
index 80be1b7f4ce..b942c84a8a2 100644
--- a/cub/test/catch2_test_launch_wrapper.cu
+++ b/cub/test/catch2_test_launch_wrapper.cu
@@ -95,7 +95,7 @@ struct cub_api_example_t
 
     const int blocks_in_grid = (num_items + threads_in_block - 1) / threads_in_block;
 
-    return thrust::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
+    return thrust::cuda_cub::detail::triple_chevron(blocks_in_grid, threads_in_block, 0, stream)
       .doit(kernel, d_in, d_out, num_items);
   }
 
diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu
index cf86389d68c..4301089cfa8 100644
--- a/cub/test/catch2_test_vsmem.cu
+++ b/cub/test/catch2_test_vsmem.cu
@@ -324,7 +324,7 @@ struct dispatch_dummy_algorithm_t
     launch_config_info->config_assumes_block_threads = static_cast<std::size_t>(block_threads);
     launch_config_info->config_vsmem_per_block       = vsmem_helper_t::vsmem_per_block;
 
-    THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_tiles, block_threads, 0, stream)
+    THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_tiles, block_threads, 0, stream)
       .doit(dummy_algorithm_kernel<typename PolicyHub::max_policy_t, InputIteratorT, OutputIteratorT, OffsetT>,
             d_in,
             d_out,
diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst
index 4cc639e27fb..9d0ccbd3b10 100644
--- a/docs/cub/developer_overview.rst
+++ b/docs/cub/developer_overview.rst
@@ -720,7 +720,7 @@ To eliminate the symbol visibility issues on our end, we follow the following ru
        it's important that kernels compiled for a given GPU architecture are always used by the host
        API compiled for that architecture.
 
-To satisfy (1), the visibility of ``thrust::cuda_cub::launcher::triple_chevron`` is hidden.
+To satisfy (1), the visibility of ``thrust::cuda_cub::detail::triple_chevron`` is hidden.
 
 To satisfy (2), instead of annotating kernels as ``__global__`` we annotate them as
 ``CUB_DETAIL_KERNEL_ATTRIBUTES``. Apart from annotating a kernel as global function, the macro also
diff --git a/libcudacxx/include/cuda/std/__functional/identity.h b/libcudacxx/include/cuda/std/__functional/identity.h
index f09f4038877..e22f9b5af3b 100644
--- a/libcudacxx/include/cuda/std/__functional/identity.h
+++ b/libcudacxx/include/cuda/std/__functional/identity.h
@@ -52,12 +52,8 @@ template <>
 struct __is_identity<reference_wrapper<const __identity>> : true_type
 {};
 
-#if _CCCL_STD_VER > 2011
-
 using identity = __identity;
 
-#endif // _CCCL_STD_VER > 2011
-
 _LIBCUDACXX_END_NAMESPACE_STD
 
 #endif // _LIBCUDACXX___FUNCTIONAL_IDENTITY_H
diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
index 218ee4ab771..a051cd99ac4 100644
--- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h
+++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h
@@ -279,7 +279,8 @@ struct AgentLauncher : Agent
   {
     assert(has_shmem && vshmem == nullptr);
     print_info(_kernel_agent<Agent, Args...>);
-    launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream).doit(_kernel_agent<Agent, Args...>, args...);
+    cuda_cub::detail::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+      .doit(_kernel_agent<Agent, Args...>, args...);
   }
 
   // If there is a risk of not having enough shared memory
@@ -295,7 +296,7 @@ struct AgentLauncher : Agent
   {
     assert((has_shmem && vshmem == nullptr) || (!has_shmem && vshmem != nullptr && shmem_size == 0));
     print_info(_kernel_agent_vshmem<Agent, Args...>);
-    launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+    cuda_cub::detail::triple_chevron(grid, plan.block_threads, shmem_size, stream)
       .doit(_kernel_agent_vshmem<Agent, Args...>, vshmem, args...);
   }
 
diff --git a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
index 80bbc2bc650..cde9219dc7c 100644
--- a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
+++ b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h
@@ -45,7 +45,7 @@ THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub
 {
-namespace launcher
+namespace detail
 {
 
 struct _CCCL_VISIBILITY_HIDDEN triple_chevron
@@ -165,7 +165,11 @@ struct _CCCL_VISIBILITY_HIDDEN triple_chevron
   }
 
 }; // struct triple_chevron
+} // namespace detail
 
+namespace launcher
+{
+using triple_chevron CCCL_DEPRECATED = detail::triple_chevron;
 } // namespace launcher
 } // namespace cuda_cub