Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Priority Queue #105

Open
wants to merge 56 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
5ab856e
Initial priority queue commit
Sep 9, 2021
1f2092c
Add priority queue benchmark
Sep 9, 2021
6a9dc99
Class comment
Sep 9, 2021
6b263e3
Improve comments and switch to cuco style
Sep 9, 2021
0eaaedf
Iterators
Sep 17, 2021
249165c
Test for iterators with thrust device_vector
Sep 17, 2021
c28a5ad
Add allocator template parameter
Oct 19, 2021
e8a9c1e
Allocator
andrewbriand Oct 20, 2021
012ebde
Accept arbitrary comparison
andrewbriand Oct 20, 2021
8cf681a
Accept arbitrary types instead of just pairs
andrewbriand Oct 24, 2021
8485bec
Remove pq_pair.h
andrewbriand Nov 2, 2021
da608cc
Start porting priority queue benchmark to gbenchmark
andrewbriand Nov 2, 2021
8a11b7f
Finish porting priority queue benchmark to gbenchmark
andrewbriand Nov 3, 2021
d1392b9
Add multiple node sizes to benchmark
andrewbriand Dec 18, 2021
9ee6c8b
Start porting tests to Catch2
andrewbriand Dec 18, 2021
e223598
Prevent block size from being larger than node size
andrewbriand Dec 18, 2021
dd8c6b7
Continue porting tests to Catch2
andrewbriand Dec 19, 2021
d031519
Make generate_element for KVPair generic
andrewbriand Dec 19, 2021
ba3a6fd
Finish Catch2 tests
andrewbriand Dec 26, 2021
16db085
Hide kernel launch details
andrewbriand Dec 26, 2021
052cec0
Clean up partial deletion code
andrewbriand Dec 27, 2021
a11bea5
Correct test comparisons
andrewbriand Dec 27, 2021
e3c4a27
Commenting and cleanup
andrewbriand Dec 27, 2021
f6fa484
Commenting for Compare
andrewbriand Dec 27, 2021
599067f
Cleanup, arbitrary number of elements for device API functions
andrewbriand Dec 27, 2021
44db340
Formatting
andrewbriand Dec 27, 2021
acfdf7e
Add missing syncs
andrewbriand Apr 12, 2022
d870e29
Merge NVIDIA:dev into andrewbriand:dev
andrewbriand Apr 14, 2022
71775b6
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Apr 14, 2022
9838569
Add copyright to priority_queue_bench.cu
andrewbriand May 31, 2022
aab4ba0
Add copyright to priority queue files
andrewbriand May 31, 2022
0196bde
Order headers from near to far in priority queue files
andrewbriand May 31, 2022
4af61ca
Bug fix in priority queue test code
andrewbriand May 31, 2022
a1d074a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] May 31, 2022
bf930dd
Remove unnecessary allocator
andrewbriand May 31, 2022
2d9bda9
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] May 31, 2022
54dc9f3
Add missing member docs in priority_queue.cuh
andrewbriand Jun 11, 2022
a5c169d
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 11, 2022
4269e9c
Add stream parameter to priority queue ctor
andrewbriand Jun 11, 2022
30cbf83
Snake case in priority queue files
andrewbriand Jun 12, 2022
bec63f3
Put priority queue kernels in detail namespace
andrewbriand Jun 12, 2022
aa12404
generate_keys_uniform -> generate_kv_pairs_uniform
andrewbriand Jun 13, 2022
55cf2e6
Remove FavorInsertionPerformance template parameter
andrewbriand Jun 13, 2022
f4814db
Default node size 64 -> 1024
andrewbriand Jun 15, 2022
89eea18
Avoid c-style expressions in priority queue files
andrewbriand Jun 15, 2022
7d47200
Remove FavorInsertionPerformance in priority queue benchmark
andrewbriand Jun 15, 2022
007316a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 15, 2022
192e263
Snake case in priority_queue_test.cu
andrewbriand Jun 17, 2022
66dd359
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 17, 2022
9da822f
kPBufferIdx -> p_buffer_idx and kRootIdx -> root_idx
andrewbriand Jun 17, 2022
0cfdd94
Use const and constexpr wherever possible in priority queue files
andrewbriand Jun 19, 2022
828b00b
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 19, 2022
1932418
Add missing const in priority queue
andrewbriand Jun 19, 2022
7c4b1f6
Add docs for stream parameter to priority queue ctor
andrewbriand Jun 19, 2022
838e4ea
Add value_type to priority_queue::device_mutable_view
andrewbriand Jun 19, 2022
d58dd9f
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,7 @@ ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}")
###################################################################################################
set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")

###################################################################################################
# priority_queue benchmark: measures bulk push/pop throughput of cuco::priority_queue
set(PRIORITY_QUEUE_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_bench.cu")
ConfigureBench(PRIORITY_QUEUE_BENCH "${PRIORITY_QUEUE_BENCH_SRC}")
100 changes: 100 additions & 0 deletions benchmarks/priority_queue/priority_queue_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#include <vector>
#include <cstdint>
andrewbriand marked this conversation as resolved.
Show resolved Hide resolved
#include <random>

#include <benchmark/benchmark.h>

#include <cuco/priority_queue.cuh>
#include <cuco/detail/pair.cuh>

#include <thrust/device_vector.h>

using namespace cuco;

/**
 * @brief Comparator ordering two pair-like values by their `first` member only.
 *
 * @tparam T Pair-like type exposing a `first` member
 */
template <typename T>
struct pair_less {
  __host__ __device__ bool operator()(const T& lhs, const T& rhs) const
  {
    return lhs.first < rhs.first;
  }
};

/**
 * @brief Fills [output_begin, output_end) with random {Key, Value} pairs.
 *
 * Each element is assigned from two successive draws of a Mersenne-Twister
 * engine seeded from std::random_device, so output is non-deterministic
 * between runs.
 *
 * @tparam Key      Key type of the generated pairs
 * @tparam Value    Value type of the generated pairs
 * @tparam OutputIt Random-access iterator whose value type is brace-assignable
 *                  from {Key, Value}
 */
template<typename Key, typename Value, typename OutputIt>
static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) {
  std::random_device rd;
  std::mt19937 gen{rd()};

  auto num_keys = std::distance(output_begin, output_end);

  // Use the iterator's difference type for the index: the original `auto i = 0`
  // made `i` an int, which mixes signedness with the difference type and would
  // overflow for ranges longer than INT_MAX elements.
  for (decltype(num_keys) i = 0; i < num_keys; ++i) {
    output_begin[i] = {static_cast<Key>(gen()), static_cast<Value>(gen())};
  }
}

/**
 * @brief Benchmarks bulk insertion of NumKeys random pairs into a priority_queue.
 *
 * Queue construction and input generation run with timing paused, so each
 * iteration measures only the push and the synchronization that follows it.
 */
template <typename Key, typename Value, int NumKeys,
          bool FavorInsertionPerformance>
static void BM_insert(::benchmark::State& state)
{
  using pair_type = pair<Key, Value>;

  for (auto _ : state) {
    state.PauseTiming();

    priority_queue<pair_type, pair_less<pair_type>, FavorInsertionPerformance>
      pq(NumKeys);

    std::vector<pair_type> h_pairs(NumKeys);
    generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end());
    thrust::device_vector<pair_type> d_pairs(h_pairs);

    state.ResumeTiming();
    pq.push(d_pairs.begin(), d_pairs.end());
    cudaDeviceSynchronize();
  }
}

/**
 * @brief Benchmarks bulk removal of NumKeys pairs from a priority_queue.
 *
 * The queue is constructed and fully populated with timing paused, so each
 * iteration measures only the pop and the synchronization that follows it.
 */
template <typename Key, typename Value, int NumKeys,
          bool FavorInsertionPerformance>
static void BM_delete(::benchmark::State& state)
{
  using pair_type = pair<Key, Value>;

  for (auto _ : state) {
    state.PauseTiming();

    priority_queue<pair_type, pair_less<pair_type>, FavorInsertionPerformance>
      pq(NumKeys);

    std::vector<pair_type> h_pairs(NumKeys);
    generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end());
    thrust::device_vector<pair_type> d_pairs(h_pairs);

    pq.push(d_pairs.begin(), d_pairs.end());
    cudaDeviceSynchronize();

    state.ResumeTiming();
    pq.pop(d_pairs.begin(), d_pairs.end());
    cudaDeviceSynchronize();
  }
}

// Register insert/delete benchmarks for 128M and 256M int/int pairs, with and
// without FavorInsertionPerformance. Results are reported in milliseconds.
BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true)
->Unit(benchmark::kMillisecond);
175 changes: 175 additions & 0 deletions include/cuco/detail/priority_queue.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#pragma once

#include <cuco/detail/priority_queue_kernels.cuh>
#include <cuco/detail/error.hpp>

#include <algorithm>
#include <cmath>
#include <cstddef>

namespace cuco {

/**
 * @brief Constructs a priority queue able to hold at least initial_capacity
 * elements, rounded up to a whole number of nodes.
 *
 * @param initial_capacity Minimum number of elements the queue must hold
 * @param allocator Allocator used for all device-side storage
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
priority_queue<T, Compare, FavorInsertionPerformance,
               Allocator>::priority_queue
                           (size_t initial_capacity,
                            Allocator const& allocator) :
                           allocator_{allocator},
                           int_allocator_{allocator},
                           t_allocator_{allocator},
                           size_t_allocator_{allocator} {

  node_size_ = NodeSize;

  // Round up to the nearest multiple of node size, but reserve at least one
  // node: the original code computed log2(0) (undefined) and made zero-sized
  // allocations when initial_capacity == 0.
  const int nodes = std::max<int>(
    1, static_cast<int>((initial_capacity + node_size_ - 1) / node_size_));

  node_capacity_ = nodes;

  // First node index of the deepest heap level: the largest power of two
  // <= nodes. Computed with integer arithmetic rather than
  // `1 << (int)log2(nodes)` to avoid floating-point rounding.
  int level_start = 1;
  while (level_start * 2 <= nodes) {
    level_start *= 2;
  }
  lowest_level_start_ = level_start;

  // Allocate and zero-initialize device state.

  d_size_ = std::allocator_traits<int_allocator_type>::allocate(int_allocator_,
                                                                1);
  CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int)));

  d_p_buffer_size_ = std::allocator_traits<size_t_allocator_type>
                       ::allocate(size_t_allocator_, 1);
  CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t)));

  // One extra node's worth of storage beyond node_capacity_ — presumably
  // backing for the partial buffer (matches the size freed in the dtor).
  d_heap_ = std::allocator_traits<t_allocator_type>
              ::allocate(t_allocator_,
                         node_capacity_ * node_size_ + node_size_);

  // One lock per node plus one extra.
  d_locks_ = std::allocator_traits<int_allocator_type>
               ::allocate(int_allocator_, node_capacity_ + 1);
  CUCO_CUDA_TRY(cudaMemset(d_locks_, 0,
                           sizeof(int) * (node_capacity_ + 1)));
}

/**
 * @brief Releases all device-side storage acquired by the constructor.
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
priority_queue<T, Compare, FavorInsertionPerformance,
               Allocator>::~priority_queue() {
  using int_traits    = std::allocator_traits<int_allocator_type>;
  using size_t_traits = std::allocator_traits<size_t_allocator_type>;
  using t_traits      = std::allocator_traits<t_allocator_type>;

  int_traits::deallocate(int_allocator_, d_size_, 1);
  size_t_traits::deallocate(size_t_allocator_, d_p_buffer_size_, 1);
  // Sizes mirror the constructor's allocations exactly.
  t_traits::deallocate(t_allocator_, d_heap_,
                       node_capacity_ * node_size_ + node_size_);
  int_traits::deallocate(int_allocator_, d_locks_, node_capacity_ + 1);
}


/**
 * @brief Asynchronously inserts the elements in [first, last) on the given
 * stream.
 *
 * @tparam InputIt Device-accessible input iterator
 * @param first Beginning of the elements to insert
 * @param last End of the elements to insert
 * @param stream CUDA stream on which the kernel is launched
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename InputIt>
void priority_queue<T, Compare, FavorInsertionPerformance,
                    Allocator>::push(InputIt first,
                                     InputIt last,
                                     cudaStream_t stream) {

  // Nothing to insert: skip the kernel launch entirely.
  if (first == last) { return; }

  // Block size never exceeds the node size so a block operates on one node;
  // grid size is capped to stay within launch limits.
  const int kBlockSize = std::min(256, static_cast<int>(node_size_));
  const int kNumBlocks = std::min(
    64000, std::max(1, static_cast<int>((last - first) / node_size_)));

  PushKernel<<<kNumBlocks, kBlockSize,
               get_shmem_size(kBlockSize), stream>>>
            (first, last - first, d_heap_, d_size_,
             node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_,
             compare_);

  // Surface launch-configuration errors immediately; execution errors appear
  // at the next synchronizing call.
  CUCO_CUDA_TRY(cudaGetLastError());
}

/**
 * @brief Asynchronously removes the (last - first) highest-priority elements
 * into [first, last) on the given stream.
 *
 * @tparam OutputIt Device-accessible output iterator
 * @param first Beginning of the output range
 * @param last End of the output range
 * @param stream CUDA stream on which the kernel is launched
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename OutputIt>
void priority_queue<T, Compare, FavorInsertionPerformance,
                    Allocator>::pop(OutputIt first,
                                    OutputIt last,
                                    cudaStream_t stream) {

  // Nothing to remove: skip the kernel launch entirely.
  if (first == last) { return; }

  const int pop_size = static_cast<int>(last - first);
  // Elements beyond the last whole node are handled as a partial node.
  const int partial = pop_size % node_size_;

  // Block size never exceeds the node size; one block per full node, capped.
  const int kBlockSize = std::min(256, static_cast<int>(node_size_));
  const int kNumBlocks = std::min(
    64000, std::max(1, (pop_size - partial) / static_cast<int>(node_size_)));

  PopKernel<<<kNumBlocks, kBlockSize,
              get_shmem_size(kBlockSize), stream>>>
           (first, pop_size, d_heap_, d_size_,
            node_size_, d_locks_, d_p_buffer_size_,
            lowest_level_start_, node_capacity_, compare_);

  // Surface launch-configuration errors immediately; execution errors appear
  // at the next synchronizing call.
  CUCO_CUDA_TRY(cudaGetLastError());
}

/**
 * @brief Device-side cooperative insertion of [first, last).
 *
 * All threads of the cooperative group g must call this together with the
 * same arguments. temp_storage is caller-provided scratch space laid out via
 * GetSharedMemoryLayout.
 *
 * @tparam CG Cooperative group type
 * @tparam InputIt Device input iterator
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename CG, typename InputIt>
__device__ void priority_queue<T, Compare,
                               FavorInsertionPerformance, Allocator>
                ::device_mutable_view::push(
        CG const& g,
        InputIt first,
        InputIt last,
        void *temp_storage) {

  SharedMemoryLayout<T> shmem =
      GetSharedMemoryLayout<T>(static_cast<int*>(temp_storage),
                               g.size(), node_size_);

  const auto push_size = last - first;

  // Insert as many completely full nodes as possible...
  for (size_t i = 0; i < push_size / node_size_; i++) {
    PushSingleNode(g, first + i * node_size_, d_heap_, d_size_, node_size_,
                   d_locks_, lowest_level_start_, shmem, compare_);
  }

  // ...then insert the leftover elements (fewer than one node) via the
  // partial-node path.
  if (push_size % node_size_ != 0) {
    PushPartialNode(g, first + (push_size / node_size_) * node_size_,
                    push_size % node_size_, d_heap_,
                    d_size_, node_size_, d_locks_,
                    d_p_buffer_size_, lowest_level_start_, shmem,
                    compare_);
  }
}

/**
 * @brief Device-side cooperative removal of (last - first) highest-priority
 * elements into [first, last).
 *
 * All threads of the cooperative group g must call this together with the
 * same arguments. temp_storage is caller-provided scratch space laid out via
 * GetSharedMemoryLayout.
 *
 * @tparam CG Cooperative group type
 * @tparam OutputIt Device output iterator
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename CG, typename OutputIt>
__device__ void priority_queue<T, Compare,
                               FavorInsertionPerformance, Allocator>
                ::device_mutable_view::pop(
        CG const& g,
        OutputIt first,
        OutputIt last,
        void *temp_storage) {
  SharedMemoryLayout<T> shmem =
      GetSharedMemoryLayout<T>(static_cast<int*>(temp_storage),
                               g.size(), node_size_);

  const auto pop_size = last - first;

  // Remove full nodes first.
  for (size_t i = 0; i < pop_size / node_size_; i++) {
    PopSingleNode(g, first + i * node_size_,
                  d_heap_, d_size_, node_size_, d_locks_,
                  d_p_buffer_size_, lowest_level_start_,
                  node_capacity_, shmem, compare_);
  }

  // Bug fix: the partial pop must receive only the count of leftover elements
  // (pop_size % node_size_), mirroring PushPartialNode in push(); the
  // original passed the full range size (last - first), over-counting the
  // elements to remove in the partial path.
  if (pop_size % node_size_ != 0) {
    PopPartialNode(g, first + (pop_size / node_size_) * node_size_,
                   pop_size % node_size_, d_heap_, d_size_, node_size_,
                   d_locks_, d_p_buffer_size_, lowest_level_start_,
                   node_capacity_, shmem, compare_);
  }
}

}
Loading