Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Priority Queue #105

Open
wants to merge 56 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
5ab856e
Initial priority queue commit
Sep 9, 2021
1f2092c
Add priority queue benchmark
Sep 9, 2021
6a9dc99
Class comment
Sep 9, 2021
6b263e3
Improve comments and switch to cuco style
Sep 9, 2021
0eaaedf
Iterators
Sep 17, 2021
249165c
Test for iterators with thrust device_vector
Sep 17, 2021
c28a5ad
Add allocator template parameter
Oct 19, 2021
e8a9c1e
Allocator
andrewbriand Oct 20, 2021
012ebde
Accept arbitrary comparison
andrewbriand Oct 20, 2021
8cf681a
Accept arbitrary types instead of just pairs
andrewbriand Oct 24, 2021
8485bec
Remove pq_pair.h
andrewbriand Nov 2, 2021
da608cc
Start porting priority queue benchmark to gbenchmark
andrewbriand Nov 2, 2021
8a11b7f
Finish porting priority queue benchmark to gbenchmark
andrewbriand Nov 3, 2021
d1392b9
Add multiple node sizes to benchmark
andrewbriand Dec 18, 2021
9ee6c8b
Start porting tests to Catch2
andrewbriand Dec 18, 2021
e223598
Prevent block size from being larger than node size
andrewbriand Dec 18, 2021
dd8c6b7
Continue porting tests to Catch2
andrewbriand Dec 19, 2021
d031519
Make generate_element for KVPair generic
andrewbriand Dec 19, 2021
ba3a6fd
Finish Catch2 tests
andrewbriand Dec 26, 2021
16db085
Hide kernel launch details
andrewbriand Dec 26, 2021
052cec0
Clean up partial deletion code
andrewbriand Dec 27, 2021
a11bea5
Correct test comparisons
andrewbriand Dec 27, 2021
e3c4a27
Commenting and cleanup
andrewbriand Dec 27, 2021
f6fa484
Commenting for Compare
andrewbriand Dec 27, 2021
599067f
Cleanup, arbitrary number of elements for device API functions
andrewbriand Dec 27, 2021
44db340
Formatting
andrewbriand Dec 27, 2021
acfdf7e
Add missing syncs
andrewbriand Apr 12, 2022
d870e29
Merge NVIDIA:dev into andrewbriand:dev
andrewbriand Apr 14, 2022
71775b6
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Apr 14, 2022
9838569
Add copyright to priority_queue_bench.cu
andrewbriand May 31, 2022
aab4ba0
Add copyright to priority queue files
andrewbriand May 31, 2022
0196bde
Order headers from near to far in priority queue files
andrewbriand May 31, 2022
4af61ca
Bug fix in priority queue test code
andrewbriand May 31, 2022
a1d074a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] May 31, 2022
bf930dd
Remove unnecessary allocator
andrewbriand May 31, 2022
2d9bda9
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] May 31, 2022
54dc9f3
Add missing member docs in priority_queue.cuh
andrewbriand Jun 11, 2022
a5c169d
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 11, 2022
4269e9c
Add stream parameter to priority queue ctor
andrewbriand Jun 11, 2022
30cbf83
Snake case in priority queue files
andrewbriand Jun 12, 2022
bec63f3
Put priority queue kernels in detail namespace
andrewbriand Jun 12, 2022
aa12404
generate_keys_uniform -> generate_kv_pairs_uniform
andrewbriand Jun 13, 2022
55cf2e6
Remove FavorInsertionPerformance template parameter
andrewbriand Jun 13, 2022
f4814db
Default node size 64 -> 1024
andrewbriand Jun 15, 2022
89eea18
Avoid c-style expressions in priority queue files
andrewbriand Jun 15, 2022
7d47200
Remove FavorInsertionPerformance in priority queue benchmark
andrewbriand Jun 15, 2022
007316a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 15, 2022
192e263
Snake case in priority_queue_test.cu
andrewbriand Jun 17, 2022
66dd359
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 17, 2022
9da822f
kPBufferIdx -> p_buffer_idx and kRootIdx -> root_idx
andrewbriand Jun 17, 2022
0cfdd94
Use const and constexpr wherever possible in priority queue files
andrewbriand Jun 19, 2022
828b00b
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 19, 2022
1932418
Add missing const in priority queue
andrewbriand Jun 19, 2022
7c4b1f6
Add docs for stream parameter to priority queue ctor
andrewbriand Jun 19, 2022
838e4ea
Add value_type to priority_queue::device_mutable_view
andrewbriand Jun 19, 2022
d58dd9f
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Jun 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,7 @@ ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}")
###################################################################################################
set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu")
ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}")

###################################################################################################
# priority_queue benchmark: measures bulk push/pop throughput of cuco::priority_queue
set(PRIORITY_QUEUE_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_bench.cu")
ConfigureBench(PRIORITY_QUEUE_BENCH "${PRIORITY_QUEUE_BENCH_SRC}")
100 changes: 100 additions & 0 deletions benchmarks/priority_queue/priority_queue_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#include <vector>
#include <cstdint>
andrewbriand marked this conversation as resolved.
Show resolved Hide resolved
#include <random>

#include <benchmark/benchmark.h>

#include <cuco/priority_queue.cuh>
#include <cuco/detail/pair.cuh>

#include <thrust/device_vector.h>

using namespace cuco;

/**
 * @brief Comparator ordering two pair-like values by their `first` member only.
 *
 * @tparam T Pair-like type exposing a `first` member
 */
template <typename T>
struct pair_less {
  __host__ __device__ bool operator()(const T& lhs, const T& rhs) const
  {
    return lhs.first < rhs.first;
  }
};

/**
 * @brief Fills [output_begin, output_end) with random {Key, Value} pairs.
 *
 * Each element is assigned from two successive draws of a Mersenne-Twister
 * engine seeded from std::random_device, so output is non-deterministic
 * between runs.
 *
 * @tparam Key      Key type of the generated pairs
 * @tparam Value    Value type of the generated pairs
 * @tparam OutputIt Random-access iterator whose value type is brace-assignable
 *                  from {Key, Value}
 */
template<typename Key, typename Value, typename OutputIt>
static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) {
  std::random_device rd;
  std::mt19937 gen{rd()};

  auto num_keys = std::distance(output_begin, output_end);

  // Use the iterator's difference type for the index: the original `auto i = 0`
  // made `i` an int, which mixes signedness with the difference type and would
  // overflow for ranges longer than INT_MAX elements.
  for (decltype(num_keys) i = 0; i < num_keys; ++i) {
    output_begin[i] = {static_cast<Key>(gen()), static_cast<Value>(gen())};
  }
}

/**
 * @brief Benchmarks bulk insertion of NumKeys random pairs into a priority_queue.
 *
 * Queue construction and input generation run with timing paused, so each
 * iteration measures only the push and the synchronization that follows it.
 */
template <typename Key, typename Value, int NumKeys,
          bool FavorInsertionPerformance>
static void BM_insert(::benchmark::State& state)
{
  using pair_type = pair<Key, Value>;

  for (auto _ : state) {
    state.PauseTiming();

    priority_queue<pair_type, pair_less<pair_type>, FavorInsertionPerformance>
      pq(NumKeys);

    std::vector<pair_type> h_pairs(NumKeys);
    generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end());
    thrust::device_vector<pair_type> d_pairs(h_pairs);

    state.ResumeTiming();
    pq.push(d_pairs.begin(), d_pairs.end());
    cudaDeviceSynchronize();
  }
}

/**
 * @brief Benchmarks bulk removal of NumKeys pairs from a priority_queue.
 *
 * The queue is constructed and fully populated with timing paused, so each
 * iteration measures only the pop and the synchronization that follows it.
 */
template <typename Key, typename Value, int NumKeys,
          bool FavorInsertionPerformance>
static void BM_delete(::benchmark::State& state)
{
  using pair_type = pair<Key, Value>;

  for (auto _ : state) {
    state.PauseTiming();

    priority_queue<pair_type, pair_less<pair_type>, FavorInsertionPerformance>
      pq(NumKeys);

    std::vector<pair_type> h_pairs(NumKeys);
    generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end());
    thrust::device_vector<pair_type> d_pairs(h_pairs);

    pq.push(d_pairs.begin(), d_pairs.end());
    cudaDeviceSynchronize();

    state.ResumeTiming();
    pq.pop(d_pairs.begin(), d_pairs.end());
    cudaDeviceSynchronize();
  }
}

// Register insert/delete benchmarks for 128M and 256M int/int pairs, with and
// without FavorInsertionPerformance. Results are reported in milliseconds.
BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true)
->Unit(benchmark::kMillisecond);

BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true)
->Unit(benchmark::kMillisecond);
175 changes: 175 additions & 0 deletions include/cuco/detail/priority_queue.inl
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#pragma once

#include <cuco/detail/priority_queue_kernels.cuh>
#include <cuco/detail/error.hpp>

#include <algorithm>
#include <cmath>
#include <cstddef>

namespace cuco {

/**
 * @brief Constructs a priority queue able to hold at least initial_capacity
 * elements, rounded up to a whole number of nodes.
 *
 * @param initial_capacity Minimum number of elements the queue must hold
 * @param allocator Allocator used for all device-side storage
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
priority_queue<T, Compare, FavorInsertionPerformance,
               Allocator>::priority_queue
                           (size_t initial_capacity,
                            Allocator const& allocator) :
                           allocator_{allocator},
                           int_allocator_{allocator},
                           t_allocator_{allocator},
                           size_t_allocator_{allocator} {

  node_size_ = NodeSize;

  // Round up to the nearest multiple of node size, but reserve at least one
  // node: the original code computed log2(0) (undefined) and made zero-sized
  // allocations when initial_capacity == 0.
  const int nodes = std::max<int>(
    1, static_cast<int>((initial_capacity + node_size_ - 1) / node_size_));

  node_capacity_ = nodes;

  // First node index of the deepest heap level: the largest power of two
  // <= nodes. Computed with integer arithmetic rather than
  // `1 << (int)log2(nodes)` to avoid floating-point rounding.
  int level_start = 1;
  while (level_start * 2 <= nodes) {
    level_start *= 2;
  }
  lowest_level_start_ = level_start;

  // Allocate and zero-initialize device state.

  d_size_ = std::allocator_traits<int_allocator_type>::allocate(int_allocator_,
                                                                1);
  CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int)));

  d_p_buffer_size_ = std::allocator_traits<size_t_allocator_type>
                       ::allocate(size_t_allocator_, 1);
  CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t)));

  // One extra node's worth of storage beyond node_capacity_ — presumably
  // backing for the partial buffer (matches the size freed in the dtor).
  d_heap_ = std::allocator_traits<t_allocator_type>
              ::allocate(t_allocator_,
                         node_capacity_ * node_size_ + node_size_);

  // One lock per node plus one extra.
  d_locks_ = std::allocator_traits<int_allocator_type>
               ::allocate(int_allocator_, node_capacity_ + 1);
  CUCO_CUDA_TRY(cudaMemset(d_locks_, 0,
                           sizeof(int) * (node_capacity_ + 1)));
}

/**
 * @brief Releases all device-side storage acquired by the constructor.
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
priority_queue<T, Compare, FavorInsertionPerformance,
               Allocator>::~priority_queue() {
  using int_traits    = std::allocator_traits<int_allocator_type>;
  using size_t_traits = std::allocator_traits<size_t_allocator_type>;
  using t_traits      = std::allocator_traits<t_allocator_type>;

  int_traits::deallocate(int_allocator_, d_size_, 1);
  size_t_traits::deallocate(size_t_allocator_, d_p_buffer_size_, 1);
  // Sizes mirror the constructor's allocations exactly.
  t_traits::deallocate(t_allocator_, d_heap_,
                       node_capacity_ * node_size_ + node_size_);
  int_traits::deallocate(int_allocator_, d_locks_, node_capacity_ + 1);
}


/**
 * @brief Asynchronously inserts the elements in [first, last) on the given
 * stream.
 *
 * @tparam InputIt Device-accessible input iterator
 * @param first Beginning of the elements to insert
 * @param last End of the elements to insert
 * @param stream CUDA stream on which the kernel is launched
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename InputIt>
void priority_queue<T, Compare, FavorInsertionPerformance,
                    Allocator>::push(InputIt first,
                                     InputIt last,
                                     cudaStream_t stream) {

  // Nothing to insert: skip the kernel launch entirely.
  if (first == last) { return; }

  // Block size never exceeds the node size so a block operates on one node;
  // grid size is capped to stay within launch limits.
  const int kBlockSize = std::min(256, static_cast<int>(node_size_));
  const int kNumBlocks = std::min(
    64000, std::max(1, static_cast<int>((last - first) / node_size_)));

  PushKernel<<<kNumBlocks, kBlockSize,
               get_shmem_size(kBlockSize), stream>>>
            (first, last - first, d_heap_, d_size_,
             node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_,
             compare_);

  // Surface launch-configuration errors immediately; execution errors appear
  // at the next synchronizing call.
  CUCO_CUDA_TRY(cudaGetLastError());
}

/**
 * @brief Asynchronously removes the (last - first) highest-priority elements
 * into [first, last) on the given stream.
 *
 * @tparam OutputIt Device-accessible output iterator
 * @param first Beginning of the output range
 * @param last End of the output range
 * @param stream CUDA stream on which the kernel is launched
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename OutputIt>
void priority_queue<T, Compare, FavorInsertionPerformance,
                    Allocator>::pop(OutputIt first,
                                    OutputIt last,
                                    cudaStream_t stream) {

  // Nothing to remove: skip the kernel launch entirely.
  if (first == last) { return; }

  const int pop_size = static_cast<int>(last - first);
  // Elements beyond the last whole node are handled as a partial node.
  const int partial = pop_size % node_size_;

  // Block size never exceeds the node size; one block per full node, capped.
  const int kBlockSize = std::min(256, static_cast<int>(node_size_));
  const int kNumBlocks = std::min(
    64000, std::max(1, (pop_size - partial) / static_cast<int>(node_size_)));

  PopKernel<<<kNumBlocks, kBlockSize,
              get_shmem_size(kBlockSize), stream>>>
           (first, pop_size, d_heap_, d_size_,
            node_size_, d_locks_, d_p_buffer_size_,
            lowest_level_start_, node_capacity_, compare_);

  // Surface launch-configuration errors immediately; execution errors appear
  // at the next synchronizing call.
  CUCO_CUDA_TRY(cudaGetLastError());
}

/**
 * @brief Device-side cooperative insertion of [first, last).
 *
 * All threads of the cooperative group g must call this together with the
 * same arguments. temp_storage is caller-provided scratch space laid out via
 * GetSharedMemoryLayout.
 *
 * @tparam CG Cooperative group type
 * @tparam InputIt Device input iterator
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename CG, typename InputIt>
__device__ void priority_queue<T, Compare,
                               FavorInsertionPerformance, Allocator>
                ::device_mutable_view::push(
        CG const& g,
        InputIt first,
        InputIt last,
        void *temp_storage) {

  SharedMemoryLayout<T> shmem =
      GetSharedMemoryLayout<T>(static_cast<int*>(temp_storage),
                               g.size(), node_size_);

  const auto push_size = last - first;

  // Insert as many completely full nodes as possible...
  for (size_t i = 0; i < push_size / node_size_; i++) {
    PushSingleNode(g, first + i * node_size_, d_heap_, d_size_, node_size_,
                   d_locks_, lowest_level_start_, shmem, compare_);
  }

  // ...then insert the leftover elements (fewer than one node) via the
  // partial-node path.
  if (push_size % node_size_ != 0) {
    PushPartialNode(g, first + (push_size / node_size_) * node_size_,
                    push_size % node_size_, d_heap_,
                    d_size_, node_size_, d_locks_,
                    d_p_buffer_size_, lowest_level_start_, shmem,
                    compare_);
  }
}

/**
 * @brief Device-side cooperative removal of (last - first) highest-priority
 * elements into [first, last).
 *
 * All threads of the cooperative group g must call this together with the
 * same arguments. temp_storage is caller-provided scratch space laid out via
 * GetSharedMemoryLayout.
 *
 * @tparam CG Cooperative group type
 * @tparam OutputIt Device output iterator
 */
template <typename T, typename Compare, bool FavorInsertionPerformance,
          typename Allocator>
template <typename CG, typename OutputIt>
__device__ void priority_queue<T, Compare,
                               FavorInsertionPerformance, Allocator>
                ::device_mutable_view::pop(
        CG const& g,
        OutputIt first,
        OutputIt last,
        void *temp_storage) {
  SharedMemoryLayout<T> shmem =
      GetSharedMemoryLayout<T>(static_cast<int*>(temp_storage),
                               g.size(), node_size_);

  const auto pop_size = last - first;

  // Remove full nodes first.
  for (size_t i = 0; i < pop_size / node_size_; i++) {
    PopSingleNode(g, first + i * node_size_,
                  d_heap_, d_size_, node_size_, d_locks_,
                  d_p_buffer_size_, lowest_level_start_,
                  node_capacity_, shmem, compare_);
  }

  // Bug fix: the partial pop must receive only the count of leftover elements
  // (pop_size % node_size_), mirroring PushPartialNode in push(); the
  // original passed the full range size (last - first), over-counting the
  // elements to remove in the partial path.
  if (pop_size % node_size_ != 0) {
    PopPartialNode(g, first + (pop_size / node_size_) * node_size_,
                   pop_size % node_size_, d_heap_, d_size_, node_size_,
                   d_locks_, d_p_buffer_size_, lowest_level_start_,
                   node_capacity_, shmem, compare_);
  }
}

}
Loading