Skip to content

Commit

Permalink
Backport to 2.8: Internalize triple_chevron (#3648) (#3650)
Browse files Browse the repository at this point in the history
Also make cuda::std::identity available in C++11
  • Loading branch information
bernhardmgruber authored Feb 3, 2025
1 parent 5b38c4a commit 80e50c9
Show file tree
Hide file tree
Showing 25 changed files with 95 additions and 96 deletions.
4 changes: 2 additions & 2 deletions cub/cub/detail/launcher/cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ namespace detail

struct TripleChevronFactory
{
CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron
CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron
operator()(dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream) const
{
return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream);
return THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid, block, shared_mem, stream);
}

CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ struct DispatchAdjacentDifference
reinterpret_cast<long long>(stream));
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_block_size, 0, stream)
.doit(DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
d_input,
first_tile_previous,
Expand Down Expand Up @@ -251,7 +251,7 @@ struct DispatchAdjacentDifference
reinterpret_cast<long long>(stream));
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
.doit(DeviceAdjacentDifferenceDifferenceKernel<
typename PolicyHub::MaxPolicy,
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ struct DispatchBatchMemcpy
#endif

// Invoke init_kernel to initialize buffer prefix sum-tile descriptors
error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles);

// Check for failure to launch
Expand Down Expand Up @@ -578,7 +578,7 @@ struct DispatchBatchMemcpy
// Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked
// up by next kernel
error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
batch_memcpy_grid_size, ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS, 0, stream)
.doit(batch_memcpy_non_blev_kernel,
input_buffer_it,
Expand Down Expand Up @@ -615,7 +615,7 @@ struct DispatchBatchMemcpy
#endif

error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream)
.doit(multi_block_memcpy_kernel,
d_blev_src_buffers,
d_blev_dst_buffers,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ struct dispatch_t
static_cast<int>(items_per_thread));
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
.doit(detail::for_each::dynamic_kernel<max_policy_t, OffsetT, OpT>, num_items, op);
error = CubDebug(error);
Expand Down Expand Up @@ -153,7 +153,7 @@ struct dispatch_t
static_cast<int>(items_per_thread));
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
.doit(detail::for_each::static_kernel<typename PolicyHubT::MaxPolicy, OffsetT, OpT>, num_items, op);
error = CubDebug(error);
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ public:
items_per_thread);
# endif
auto status =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream)
.doit(detail::for_each_in_extents::
static_kernel<max_policy_t, OpType, ExtentsType, decltype(sub_sizes_div_array), Ranks...>,
_op,
Expand Down Expand Up @@ -162,7 +162,7 @@ public:
_stream,
items_per_thread);
# endif
status = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream)
status = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream)
.doit(kernel, _op, _ext, sub_sizes_div_array, extents_div_array);
_CUB_RETURN_IF_ERROR(status)
_CUB_RETURN_IF_STREAM_ERROR(_stream)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ struct dispatch_histogram
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke histogram_init_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
histogram_init_grid_dims, histogram_init_block_threads, 0, stream)
.doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue);

Expand All @@ -466,7 +466,7 @@ struct dispatch_histogram
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke histogram_sweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
.doit(histogram_sweep_kernel,
d_samples,
num_output_bins_wrapper,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ struct dispatch_t
const int partition_grid_size = static_cast<int>(::cuda::ceil_div(num_partitions, threads_per_partition_block));

auto error = CubDebug(
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
partition_grid_size, threads_per_partition_block, 0, stream)
.doit(device_partition_merge_path_kernel<
max_policy_t,
Expand Down Expand Up @@ -255,7 +255,7 @@ struct dispatch_t
{
auto vshmem_ptr = vsmem_t{allocations[1]};
auto error = CubDebug(
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<int>(num_tiles), static_cast<int>(agent_t::policy::BLOCK_THREADS), 0, stream)
.doit(
device_merge_kernel<max_policy_t, KeyIt1, ValueIt1, KeyIt2, ValueIt2, KeyIt3, ValueIt3, Offset, CompareOp>,
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,7 @@ struct DispatchMergeSort
auto items_buffer = static_cast<ValueT*>(allocations[2]);

// Invoke DeviceMergeSortBlockSortKernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<int>(num_tiles), merge_sort_helper_t::policy_t::BLOCK_THREADS, 0, stream, true)
.doit(
DeviceMergeSortBlockSortKernel<
Expand Down Expand Up @@ -572,7 +572,7 @@ struct DispatchMergeSort
const OffsetT target_merged_tiles_number = OffsetT(2) << pass;

// Partition
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
partition_grid_size, threads_per_partition_block, 0, stream, true)
.doit(DeviceMergeSortPartitionKernel<KeyIteratorT, OffsetT, CompareOpT, KeyT>,
ping,
Expand All @@ -599,7 +599,7 @@ struct DispatchMergeSort
}

// Merge
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<int>(num_tiles), static_cast<int>(merge_sort_helper_t::policy_t::BLOCK_THREADS), 0, stream, true)
.doit(
DeviceMergeSortMergeKernel<typename PolicyHub::MaxPolicy,
Expand Down
17 changes: 8 additions & 9 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1008,8 +1008,7 @@ struct DispatchRadixSort
#endif

// Invoke upsweep_kernel with same grid size as downsweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
.doit(single_tile_kernel,
d_keys.Current(),
d_keys.Alternate(),
Expand Down Expand Up @@ -1082,7 +1081,7 @@ struct DispatchRadixSort
int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits;

// Invoke upsweep_kernel with same grid size as downsweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream)
.doit(pass_config.upsweep_kernel,
d_keys_in,
Expand Down Expand Up @@ -1117,7 +1116,7 @@ struct DispatchRadixSort
#endif

// Invoke scan_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream)
.doit(pass_config.scan_kernel, d_spine, pass_spine_length);

// Check for failure to launch
Expand Down Expand Up @@ -1145,7 +1144,7 @@ struct DispatchRadixSort
#endif

// Invoke downsweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream)
.doit(pass_config.downsweep_kernel,
d_keys_in,
Expand Down Expand Up @@ -1346,7 +1345,7 @@ struct DispatchRadixSort
ActivePolicyT::HistogramPolicy::RADIX_BITS);
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream)
.doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer);
error = CubDebug(error);
Expand All @@ -1373,7 +1372,7 @@ struct DispatchRadixSort
ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS);
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream)
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream)
.doit(DeviceRadixSortExclusiveSumKernel<max_policy_t, OffsetT>, d_bins);
error = CubDebug(error);
if (cudaSuccess != error)
Expand Down Expand Up @@ -1437,7 +1436,7 @@ struct DispatchRadixSort
DecomposerT>;

error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
.doit(onesweep_kernel,
d_lookback,
d_ctrs + portion * num_passes + pass,
Expand Down Expand Up @@ -2096,7 +2095,7 @@ struct DispatchSegmentedRadixSort
pass_bits);
#endif

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
num_segments, pass_config.segmented_config.block_threads, 0, stream)
.doit(pass_config.segmented_kernel,
d_keys_in,
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,7 @@ struct DispatchSegmentedReduce
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke DeviceReduceKernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream)
.doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init);

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ struct DispatchReduceByKey
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_kernel, tile_state, num_tiles, d_num_runs_out);

// Check for failure to launch
Expand Down Expand Up @@ -403,7 +403,7 @@ struct DispatchReduceByKey
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke reduce_by_key_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
.doit(reduce_by_key_kernel,
d_keys_in,
d_unique_out,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ struct DeviceRleDispatch
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out);

// Check for failure to launch
Expand Down Expand Up @@ -418,7 +418,7 @@ struct DeviceRleDispatch
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke device_rle_sweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
.doit(device_rle_sweep_kernel,
d_in,
d_offsets_out,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ struct DispatchScan
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_kernel, tile_state, num_tiles);

// Check for failure to launch
Expand Down Expand Up @@ -482,7 +482,7 @@ struct DispatchScan
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke scan_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
.doit(scan_kernel, d_in, d_out, tile_state, start_tile, scan_op, init_value, num_items);

// Check for failure to launch
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ struct DispatchScanByKey
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, static_cast<OffsetT>(tile_size), num_tiles);

// Check for failure to launch
Expand Down Expand Up @@ -483,7 +483,7 @@ struct DispatchScanByKey
#endif // CUB_DETAIL_DEBUG_ENABLE_LOG

// Invoke scan_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
.doit(scan_kernel,
d_keys_in,
d_keys_prev_in,
Expand Down
Loading

0 comments on commit 80e50c9

Please sign in to comment.