diff --git a/cub/cub/detail/launcher/cuda_runtime.cuh b/cub/cub/detail/launcher/cuda_runtime.cuh index 650a7dea68f..ec00f1aea3d 100644 --- a/cub/cub/detail/launcher/cuda_runtime.cuh +++ b/cub/cub/detail/launcher/cuda_runtime.cuh @@ -21,10 +21,10 @@ namespace detail struct TripleChevronFactory { - CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron + CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const { - return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream); + return THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid, block, shared_mem, stream); } CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 9e05f86dd43..cec5986b44f 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -221,7 +221,7 @@ struct DispatchAdjacentDifference reinterpret_cast(stream)); #endif // CUB_DETAIL_DEBUG_ENABLE_LOG - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_block_size, 0, stream) .doit(DeviceAdjacentDifferenceInitKernel, d_input, first_tile_previous, @@ -251,7 +251,7 @@ struct DispatchAdjacentDifference reinterpret_cast(stream)); #endif // CUB_DETAIL_DEBUG_ENABLE_LOG - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream) .doit(DeviceAdjacentDifferenceDifferenceKernel< typename PolicyHub::MaxPolicy, diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index df2b0963828..daeae156d2a 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -548,7 +548,7 @@ struct DispatchBatchMemcpy #endif // Invoke init_kernel to initialize buffer prefix sum-tile descriptors - error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles); // Check for failure to launch @@ -578,7 +578,7 @@ struct DispatchBatchMemcpy // Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked // up by next kernel error = - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( batch_memcpy_grid_size, ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS, 0, stream) .doit(batch_memcpy_non_blev_kernel, input_buffer_it, @@ -615,7 +615,7 @@ struct DispatchBatchMemcpy #endif error = - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream) .doit(multi_block_memcpy_kernel, d_blev_src_buffers, d_blev_dst_buffers, diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index 7ba478e3c00..4c309140c3a 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -110,7 +110,7 @@ struct dispatch_t static_cast(items_per_thread)); #endif - error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( static_cast(num_tiles), static_cast(block_threads), 0, stream) .doit(detail::for_each::dynamic_kernel, num_items, op); error = CubDebug(error); @@ -153,7 +153,7 @@ struct dispatch_t static_cast(items_per_thread)); #endif - error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( static_cast(num_tiles), static_cast(block_threads), 0, stream) .doit(detail::for_each::static_kernel, num_items, op); error = CubDebug(error); diff --git a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh index 6e346316d48..65412e54e8b 100644 --- a/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh +++ b/cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh @@ -125,7 +125,7 @@ public: items_per_thread); # endif auto status = - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream) .doit(detail::for_each_in_extents:: static_kernel, _op, @@ -162,7 +162,7 @@ public: _stream, items_per_thread); # endif - status = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream) + status = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream) .doit(kernel, _op, _ext, sub_sizes_div_array, extents_div_array); _CUB_RETURN_IF_ERROR(status) _CUB_RETURN_IF_STREAM_ERROR(_stream) diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index d4bc8306eca..151fb08f3ab 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -442,7 +442,7 @@ struct dispatch_histogram #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke histogram_init_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( histogram_init_grid_dims, histogram_init_block_threads, 0, stream) .doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue); @@ -466,7 +466,7 @@ struct dispatch_histogram #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke histogram_sweep_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(sweep_grid_dims, block_threads, 0, stream) .doit(histogram_sweep_kernel, d_samples, num_output_bins_wrapper, diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index fa467cdd5fe..e083fe36d00 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -220,7 +220,7 @@ struct dispatch_t const int partition_grid_size = static_cast(::cuda::ceil_div(num_partitions, threads_per_partition_block)); auto error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( partition_grid_size, threads_per_partition_block, 0, stream) .doit(device_partition_merge_path_kernel< max_policy_t, @@ -255,7 +255,7 @@ struct dispatch_t { auto vshmem_ptr = vsmem_t{allocations[1]}; auto error = CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( static_cast(num_tiles), static_cast(agent_t::policy::BLOCK_THREADS), 0, stream) .doit( device_merge_kernel, diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 8a0e31e9bbd..028f4fd9df1 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -513,7 +513,7 @@ struct DispatchMergeSort auto items_buffer = static_cast(allocations[2]); // Invoke DeviceMergeSortBlockSortKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( static_cast(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true) .doit( DeviceMergeSortBlockSortKernel< @@ -572,7 +572,7 @@ struct DispatchMergeSort const OffsetT target_merged_tiles_number = OffsetT(2) << pass; // Partition - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( partition_grid_size, threads_per_partition_block, 0, stream, true) .doit(DeviceMergeSortPartitionKernel, ping, @@ -599,7 +599,7 @@ struct DispatchMergeSort } // Merge - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( static_cast(num_tiles), static_cast(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true) .doit( DeviceMergeSortMergeKernel, d_bins); error = CubDebug(error); if (cudaSuccess != error) @@ -1437,7 +1436,7 @@ struct DispatchRadixSort DecomposerT>; error = - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream) .doit(onesweep_kernel, d_lookback, d_ctrs + portion * num_passes + pass, @@ -2096,7 +2095,7 @@ struct DispatchSegmentedRadixSort pass_bits); #endif - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( num_segments, pass_config.segmented_config.block_threads, 0, stream) .doit(pass_config.segmented_kernel, d_keys_in, diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index d3e20785e57..170237d5646 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -969,7 +969,7 @@ struct DispatchSegmentedReduce #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke DeviceReduceKernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream) .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init); diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 76a39317212..5637f462c04 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -346,7 +346,7 @@ struct DispatchReduceByKey #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles, d_num_runs_out); // Check for failure to launch @@ -403,7 +403,7 @@ struct DispatchReduceByKey #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke reduce_by_key_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(reduce_by_key_kernel, d_keys_in, d_unique_out, diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 5d567d45778..f22417886b2 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -357,7 +357,7 @@ struct DeviceRleDispatch #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out); // Check for failure to launch @@ -418,7 +418,7 @@ struct DeviceRleDispatch #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke device_rle_sweep_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(device_rle_sweep_kernel, d_in, d_offsets_out, diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index cba024791b7..5131c73ba22 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -430,7 +430,7 @@ struct DispatchScan #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles); // Check for failure to launch @@ -482,7 +482,7 @@ struct DispatchScan #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) .doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items); // Check for failure to launch diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index d3f7e680a72..cfed41b44f7 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -442,7 +442,7 @@ struct DispatchScanByKey #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, static_cast(tile_size), num_tiles); // Check for failure to launch @@ -483,7 +483,7 @@ struct DispatchScanByKey #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke scan_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream) .doit(scan_kernel, d_keys_in, d_keys_prev_in, diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index dc98438e669..dc63f68633b 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -553,8 +553,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont (long long) stream); #endif // CUB_DETAIL_DEBUG_ENABLE_LOG - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(blocks_in_grid, LargeSegmentPolicyT::BLOCK_THREADS, 0, stream) .doit(large_kernel, large_and_medium_segments_indices, d_current_keys, @@ -601,7 +600,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN cudaError_t DeviceSegmentedSortCont (long long) stream); #endif // CUB_DETAIL_DEBUG_ENABLE_LOG - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron( small_and_medium_blocks_in_grid, SmallAndMediumPolicyT::BLOCK_THREADS, 0, stream) .doit(small_kernel, small_segments, @@ -1234,43 +1233,43 @@ private: #else // CUB_RDC_ENABLED -# define CUB_TEMP_DEVICE_CODE \ - error = \ - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, 1, 0, stream) \ - .doit( \ - DeviceSegmentedSortContinuationKernel< \ - typename PolicyHub::MaxPolicy, \ - LargeKernelT, \ - SmallKernelT, \ - KeyT, \ - ValueT, \ - BeginOffsetIteratorT, \ - EndOffsetIteratorT>, \ - large_kernel, \ - small_kernel, \ - num_segments, \ - d_keys.Current(), \ - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ - d_keys_double_buffer, \ - d_values.Current(), \ - GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ - d_values_double_buffer, \ - d_begin_offsets, \ - d_end_offsets, \ - group_sizes.get(), \ - large_and_medium_segments_indices.get(), \ - small_segments_indices.get()); \ - error = CubDebug(error); \ - \ - if (cudaSuccess != error) \ - { \ - return error; \ - } \ - \ - error = CubDebug(detail::DebugSyncStream(stream)); \ - if (cudaSuccess != error) \ - { \ - return error; \ +# define CUB_TEMP_DEVICE_CODE \ + error = \ + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, 1, 0, stream) \ + .doit( \ + DeviceSegmentedSortContinuationKernel< \ + typename PolicyHub::MaxPolicy, \ + LargeKernelT, \ + SmallKernelT, \ + KeyT, \ + ValueT, \ + BeginOffsetIteratorT, \ + EndOffsetIteratorT>, \ + large_kernel, \ + small_kernel, \ + num_segments, \ + d_keys.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), \ + d_keys_double_buffer, \ + d_values.Current(), \ + GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_values), \ + d_values_double_buffer, \ + d_begin_offsets, \ + d_end_offsets, \ + group_sizes.get(), \ + large_and_medium_segments_indices.get(), \ + small_segments_indices.get()); \ + error = CubDebug(error); \ + \ + if (cudaSuccess != error) \ + { \ + return error; \ + } \ + \ + error = CubDebug(detail::DebugSyncStream(stream)); \ + if (cudaSuccess != error) \ + { \ + return error; \ } #endif // CUB_RDC_ENABLED @@ -1348,7 +1347,7 @@ private: #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke fallback kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(fallback_kernel, d_keys.Current(), GetFinalOutput(LargeSegmentPolicyT::RADIX_BITS, d_keys), diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 98093b01404..81a8fbe72ae 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -670,7 +670,7 @@ struct DispatchSelectIf #endif // Invoke scan_init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(scan_init_kernel, tile_status, current_num_tiles, d_num_selected_out); // Check for failure to launch @@ -718,7 +718,7 @@ struct DispatchSelectIf #endif // Invoke select_if_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(current_num_tiles, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(current_num_tiles, block_threads, 0, stream) .doit(select_if_kernel, d_in, d_flags, diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 90295f2c06f..519873eb7b7 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -289,7 +289,7 @@ struct DispatchThreeWayPartitionIf #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke three_way_partition_init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(three_way_partition_init_kernel, tile_status, num_tiles, d_num_selected_out); // Check for failure to launch @@ -346,7 +346,7 @@ struct DispatchThreeWayPartitionIf #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke select_if_kernel - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(three_way_partition_kernel, d_in, d_first_part_out, diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index ed2e104746e..24fdfbb5da7 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -663,7 +663,7 @@ struct dispatch_t PoorExpected< ::cuda::std:: - tuple> + tuple> { using policy_t = typename ActivePolicy::algo_policy; constexpr int block_dim = policy_t::block_threads; @@ -739,7 +739,7 @@ struct dispatch_t(::cuda::ceil_div(num_items, Offset{config->tile_size})); return ::cuda::std::make_tuple( - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, config->smem_size, stream), + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid_dim, block_dim, config->smem_size, stream), CUB_DETAIL_TRANSFORM_KERNEL_PTR, config->elem_per_thread); } @@ -820,7 +820,7 @@ struct dispatch_t(::cuda::ceil_div(num_items, Offset{tile_size})); return CubDebug( - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid_dim, block_dim, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid_dim, block_dim, 0, stream) .doit( CUB_DETAIL_TRANSFORM_KERNEL_PTR, num_items, diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index b0747b9b42a..5b78d4ae344 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -430,7 +430,7 @@ struct DispatchUniqueByKey #endif // CUB_DETAIL_DEBUG_ENABLE_LOG // Invoke init_kernel to initialize tile descriptors - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream) .doit(init_kernel, tile_state, num_tiles, d_num_selected_out); // Check for failure to launch @@ -494,7 +494,7 @@ struct DispatchUniqueByKey // Invoke select_if_kernel error = - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream) .doit(scan_kernel, d_keys_in, d_values_in, diff --git a/cub/test/catch2_test_launch_wrapper.cu b/cub/test/catch2_test_launch_wrapper.cu index 80be1b7f4ce..b942c84a8a2 100644 --- a/cub/test/catch2_test_launch_wrapper.cu +++ b/cub/test/catch2_test_launch_wrapper.cu @@ -95,7 +95,7 @@ struct cub_api_example_t const int blocks_in_grid = (num_items + threads_in_block - 1) / threads_in_block; - return thrust::cuda_cub::launcher::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) + return thrust::cuda_cub::detail::triple_chevron(blocks_in_grid, threads_in_block, 0, stream) .doit(kernel, d_in, d_out, num_items); } diff --git a/cub/test/catch2_test_vsmem.cu b/cub/test/catch2_test_vsmem.cu index cf86389d68c..4301089cfa8 100644 --- a/cub/test/catch2_test_vsmem.cu +++ b/cub/test/catch2_test_vsmem.cu @@ -324,7 +324,7 @@ struct dispatch_dummy_algorithm_t launch_config_info->config_assumes_block_threads = static_cast(block_threads); launch_config_info->config_vsmem_per_block = vsmem_helper_t::vsmem_per_block; - THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_tiles, block_threads, 0, stream) + THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_tiles, block_threads, 0, stream) .doit(dummy_algorithm_kernel, d_in, d_out, diff --git a/docs/cub/developer_overview.rst b/docs/cub/developer_overview.rst index 4cc639e27fb..9d0ccbd3b10 100644 --- a/docs/cub/developer_overview.rst +++ b/docs/cub/developer_overview.rst @@ -720,7 +720,7 @@ To eliminate the symbol visibility issues on our end, we follow the following ru it's important that kernels compiled for a given GPU architecture are always used by the host API compiled for that architecture. -To satisfy (1), the visibility of ``thrust::cuda_cub::launcher::triple_chevron`` is hidden. +To satisfy (1), the visibility of ``thrust::cuda_cub::detail::triple_chevron`` is hidden. To satisfy (2), instead of annotating kernels as ``__global__`` we annotate them as ``CUB_DETAIL_KERNEL_ATTRIBUTES``. Apart from annotating a kernel as global function, the macro also diff --git a/libcudacxx/include/cuda/std/__functional/identity.h b/libcudacxx/include/cuda/std/__functional/identity.h index f09f4038877..e22f9b5af3b 100644 --- a/libcudacxx/include/cuda/std/__functional/identity.h +++ b/libcudacxx/include/cuda/std/__functional/identity.h @@ -52,12 +52,8 @@ template <> struct __is_identity> : true_type {}; -#if _CCCL_STD_VER > 2011 - using identity = __identity; -#endif // _CCCL_STD_VER > 2011 - _LIBCUDACXX_END_NAMESPACE_STD #endif // _LIBCUDACXX___FUNCTIONAL_IDENTITY_H diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index 218ee4ab771..a051cd99ac4 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -279,7 +279,8 @@ struct AgentLauncher : Agent { assert(has_shmem && vshmem == nullptr); print_info(_kernel_agent); - launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream).doit(_kernel_agent, args...); + cuda_cub::detail::triple_chevron(grid, plan.block_threads, shmem_size, stream) + .doit(_kernel_agent, args...); } // If there is a risk of not having enough shared memory @@ -295,7 +296,7 @@ struct AgentLauncher : Agent { assert((has_shmem && vshmem == nullptr) || (!has_shmem && vshmem != nullptr && shmem_size == 0)); print_info(_kernel_agent_vshmem); - launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream) + cuda_cub::detail::triple_chevron(grid, plan.block_threads, shmem_size, stream) .doit(_kernel_agent_vshmem, vshmem, args...); } diff --git a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h index 80bbc2bc650..cde9219dc7c 100644 --- a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h +++ b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h @@ -45,7 +45,7 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { -namespace launcher +namespace detail { struct _CCCL_VISIBILITY_HIDDEN triple_chevron @@ -165,7 +165,11 @@ struct _CCCL_VISIBILITY_HIDDEN triple_chevron } }; // struct triple_chevron +} // namespace detail +namespace launcher +{ +using triple_chevron CCCL_DEPRECATED = detail::triple_chevron; } // namespace launcher } // namespace cuda_cub