From 89c815a3dcf4d122de243f13072ac2243004609b Mon Sep 17 00:00:00 2001
From: Ray Douglass
Date: Mon, 24 Jul 2023 17:08:53 -0400
Subject: [PATCH 01/10] v23.10 Updates [skip ci]

From 58ba5a482d484f4eda3a5c82466431113416c587 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Thu, 17 Aug 2023 20:57:04 +0800
Subject: [PATCH 02/10] PR: Use top-k from RAFT (#53)

Closes #5

- Fix bugs in `cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu`
  and `cpp/tests/graph_ops/csr_add_self_loop_utils.cu`.
- Use `raft::warp_sort` (`select_k`) to implement weighted sampling without replacement; when
  `sample_count > 256`, `cub::DeviceSegmentedSort` is used instead.
- Remove `block_radix_topk.cuh` and replace `block_topk` in `embedding_cache_func.cuh` with
  `raft::warp_sort`.

Authors:
  - Chuang Zhu (https://github.com/chuangz0)

Approvers:
  - https://github.com/dongxuy04
  - Brad Rees (https://github.com/BradReesWork)

URL: https://github.com/rapidsai/wholegraph/pull/53
---
 cpp/src/wholegraph_ops/block_radix_topk.cuh   | 371 -----------
 ...ighted_sample_without_replacement_func.cuh | 622 ++++++++++--------
 .../functions/embedding_cache_func.cuh        | 200 +++---
 .../graph_ops/csr_add_self_loop_utils.cu      |   2 +-
 .../graph_sampling_test_utils.cu              |  54 +-
 ...ighted_sample_without_replacement_tests.cu |  16 +-
 ...aph_weighted_sample_without_replacement.py |  39 +-
 7 files changed, 482 insertions(+), 822 deletions(-)
 delete mode 100644 cpp/src/wholegraph_ops/block_radix_topk.cuh

diff --git a/cpp/src/wholegraph_ops/block_radix_topk.cuh b/cpp/src/wholegraph_ops/block_radix_topk.cuh
deleted file mode 100644
index 624c07510..000000000
--- a/cpp/src/wholegraph_ops/block_radix_topk.cuh
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
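(Editorial note, not part of the patch.) The sampling scheme the new kernels build on is the A-Res reservoir-sampling algorithm linked in the kernel comments further below: each neighbor is keyed by a weight-biased random draw and the k largest keys form the sample; the patch performs that selection on the GPU with `raft::warp_sort` / `select_k` per input node. The host-side sketch below only illustrates the keying idea under those assumptions; `a_res_sample` is a hypothetical helper, not code from this PR.

```cpp
// Illustrative sketch of A-Res weighted sampling without replacement
// (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Res).
// Assumes positive weights and k <= weights.size(); not part of this patch.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <vector>

std::vector<int> a_res_sample(const std::vector<float>& weights, int k, unsigned seed)
{
  std::mt19937 rng(seed);
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);

  // Each item i draws key u_i^(1 / w_i); larger weights tend to get larger keys.
  std::vector<float> keys(weights.size());
  for (std::size_t i = 0; i < weights.size(); ++i) {
    keys[i] = std::pow(uniform(rng), 1.0f / weights[i]);
  }

  // Keep the indices of the k largest keys (the GPU kernels do this step
  // with a per-block warp-sort top-k instead of a host partial sort).
  std::vector<int> idx(weights.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return keys[a] > keys[b]; });
  idx.resize(k);
  return idx;
}
```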
- */ -#pragma once - -#include -#include -#include -#include - -namespace wholegraph_ops { - -template -class BlockRadixTopKGlobalMemory { - static_assert(cub::PowerOfTwo::VALUE && (RADIX_BITS <= (sizeof(KeyT) * 8)), - "RADIX_BITS should be power of 2, and <= (sizeof(KeyT) * 8)"); - static_assert(cub::PowerOfTwo::VALUE, "BLOCK_SIZE should be power of 2"); - using KeyTraits = cub::Traits; - using UnsignedBits = typename KeyTraits::UnsignedBits; - using BlockScanT = cub::BlockScan; - static constexpr int RADIX_SIZE = (1 << RADIX_BITS); - static constexpr int SCAN_ITEMS_PER_THREAD = (RADIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; - using BinBlockLoad = cub::BlockLoad; - using BinBlockStore = cub::BlockStore; - struct _TempStorage { - typename BlockScanT::TempStorage scan_storage; - union { - typename BinBlockLoad::TempStorage load_storage; - typename BinBlockStore::TempStorage store_storage; - } load_store; - union { - int shared_bins[RADIX_SIZE]; - }; - int share_target_k; - int share_bucket_id; - }; - - public: - struct TempStorage : cub::Uninitialized<_TempStorage> {}; - __device__ __forceinline__ BlockRadixTopKGlobalMemory(TempStorage& temp_storage) - : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){}; - __device__ __forceinline__ void radixTopKGetThreshold( - const KeyT* data, int k, int size, KeyT& topK, bool& topk_is_unique) - { - assert(k < size && k > 0); - int target_k = k; - UnsignedBits key_pattern = 0; - int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; - for (; digit_pos >= 0; digit_pos -= RADIX_BITS) { - UpdateSharedBins(data, size, digit_pos, key_pattern); - InclusiveScanBins(); - UpdateTopK(digit_pos, target_k, key_pattern); - if (target_k == 0) break; - } - if (target_k == 0) { - key_pattern -= 1; - topk_is_unique = true; - } else { - topk_is_unique = false; - } - if (GREATER) key_pattern = ~key_pattern; - UnsignedBits topK_unsigned = KeyTraits::TwiddleOut(key_pattern); - topK = reinterpret_cast(topK_unsigned); - } - - private: - __device__ __forceinline__ void UpdateSharedBins(const KeyT* key, - int size, - int digit_pos, - UnsignedBits key_pattern) - { - for (int id = tid_; id < RADIX_SIZE; id += BLOCK_SIZE) { - temp_storage_.shared_bins[id] = 0; - } - cub::CTA_SYNC(); - UnsignedBits key_mask = ((UnsignedBits)(-1)) << ((UnsignedBits)(digit_pos + RADIX_BITS)); -#pragma unroll - for (int idx = tid_; idx < size; idx += BLOCK_SIZE) { - KeyT key_data = key[idx]; - UnsignedBits twiddled_data = KeyTraits::TwiddleIn(reinterpret_cast(key_data)); - if (GREATER) twiddled_data = ~twiddled_data; - UnsignedBits digit_in_radix = cub::BFE(twiddled_data, digit_pos, RADIX_BITS); - if ((twiddled_data & key_mask) == (key_pattern & key_mask)) { - atomicAdd(&temp_storage_.shared_bins[digit_in_radix], 1); - } - } - cub::CTA_SYNC(); - } - __device__ __forceinline__ void InclusiveScanBins() - { - int items[SCAN_ITEMS_PER_THREAD]; - BinBlockLoad(temp_storage_.load_store.load_storage) - .Load(temp_storage_.shared_bins, items, RADIX_SIZE, 0); - cub::CTA_SYNC(); - BlockScanT(temp_storage_.scan_storage).InclusiveSum(items, items); - cub::CTA_SYNC(); - BinBlockStore(temp_storage_.load_store.store_storage) - .Store(temp_storage_.shared_bins, items, RADIX_SIZE); - cub::CTA_SYNC(); - } - __device__ __forceinline__ void UpdateTopK(int digit_pos, - int& target_k, - UnsignedBits& target_pattern) - { - for (int idx = tid_; (idx < RADIX_SIZE); idx += BLOCK_SIZE) { - int prev_count = (idx == 0) ? 
0 : temp_storage_.shared_bins[idx - 1]; - int cur_count = temp_storage_.shared_bins[idx]; - if (prev_count <= target_k && cur_count > target_k) { - temp_storage_.share_target_k = target_k - prev_count; - temp_storage_.share_bucket_id = idx; - } - } - cub::CTA_SYNC(); - target_k = temp_storage_.share_target_k; - int target_bucket_id = temp_storage_.share_bucket_id; - UnsignedBits key_segment = ((UnsignedBits)target_bucket_id) << ((UnsignedBits)digit_pos); - target_pattern |= key_segment; - } - _TempStorage& temp_storage_; - int tid_; -}; - -template -class BlockRadixTopKRegister { - static_assert(cub::PowerOfTwo::VALUE && (RADIX_BITS <= (sizeof(KeyT) * 8)), - "RADIX_BITS should be power of 2, and <= (sizeof(KeyT) * 8)"); - static_assert(cub::PowerOfTwo::VALUE, "BLOCK_SIZE should be power of 2"); - using KeyTraits = cub::Traits; - using UnsignedBits = typename KeyTraits::UnsignedBits; - using BlockScanT = cub::BlockScan; - static constexpr int RADIX_SIZE = (1 << RADIX_BITS); - static constexpr bool KEYS_ONLY = std::is_same::value; - static constexpr int SCAN_ITEMS_PER_THREAD = (RADIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE; - using BinBlockLoad = cub::BlockLoad; - using BinBlockStore = cub::BlockStore; - using BlockExchangeKey = cub::BlockExchange; - using BlockExchangeValue = cub::BlockExchange; - - using _ExchangeKeyTempStorage = typename BlockExchangeKey::TempStorage; - using _ExchangeValueTempStorage = typename BlockExchangeValue::TempStorage; - typedef union ExchangeKeyTempStorageType { - _ExchangeKeyTempStorage key_storage; - } ExchKeyTempStorageType; - typedef union ExchangeKeyValueTempStorageType { - _ExchangeKeyTempStorage key_storage; - _ExchangeValueTempStorage value_storage; - } ExchKeyValueTempStorageType; - using _ExchangeType = - typename std::conditional::type; - - struct _TempStorage { - typename BlockScanT::TempStorage scan_storage; - union { - typename BinBlockLoad::TempStorage load_storage; - typename BinBlockStore::TempStorage store_storage; - } load_store; - union { - int shared_bins[RADIX_SIZE]; - _ExchangeType exchange_storage; - }; - int share_target_k; - int share_bucket_id; - int share_prev_count; - }; - - public: - struct TempStorage : cub::Uninitialized<_TempStorage> {}; - __device__ __forceinline__ BlockRadixTopKRegister(TempStorage& temp_storage) - : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){}; - __device__ __forceinline__ void radixTopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD], - const int k, - const int valid_count) - { - if (k == valid_count) return; - TopKGenRank(keys, k, valid_count); - int is_valid[ITEMS_PER_THREAD]; - GenValidArray(is_valid, k); - BlockExchangeKey{temp_storage_.exchange_storage.key_storage}.ScatterToStripedFlagged( - keys, keys, ranks_, is_valid); - cub::CTA_SYNC(); - } - __device__ __forceinline__ void radixTopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD], - ValueT (&values)[ITEMS_PER_THREAD], - const int k, - const int valid_count) - { - if (k == valid_count) return; - TopKGenRank(keys, k, valid_count); - int is_valid[ITEMS_PER_THREAD]; - GenValidArray(is_valid, k); - BlockExchangeKey{temp_storage_.exchange_storage.key_storage}.ScatterToStripedFlagged( - keys, keys, ranks_, is_valid); - cub::CTA_SYNC(); - BlockExchangeValue{temp_storage_.exchange_storage.value_storage}.ScatterToStripedFlagged( - values, values, ranks_, is_valid); - cub::CTA_SYNC(); - } - - private: - __device__ __forceinline__ void TopKGenRank(KeyT (&keys)[ITEMS_PER_THREAD], - const int k, - const int valid_count) - { - assert(k <= BLOCK_SIZE * ITEMS_PER_THREAD); - 
assert(k <= valid_count); - UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - search_mask_ = 0; - top_k_mask_ = 0; - -#pragma unroll - for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - int idx = KEY * BLOCK_SIZE + tid_; - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY]; - if (idx < valid_count) search_mask_ |= (1U << KEY); - } - - int target_k = k; - int prefix_k = 0; - - for (int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) { - UpdateSharedBins(unsigned_keys, digit_pos, prefix_k); - InclusiveScanBins(); - UpdateTopK(unsigned_keys, digit_pos, target_k, prefix_k, digit_pos == 0); - if (target_k == 0) break; - } - -#pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY]; - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - __device__ __forceinline__ void GenValidArray(int (&is_valid)[ITEMS_PER_THREAD], int k) - { -#pragma unroll - for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - if ((top_k_mask_ & (1U << KEY)) && ranks_[KEY] < k) { - is_valid[KEY] = 1; - } else { - is_valid[KEY] = 0; - } - } - } - __device__ __forceinline__ void UpdateSharedBins(UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], - int digit_pos, - int prefix_k) - { - for (int id = tid_; id < RADIX_SIZE; id += BLOCK_SIZE) { - temp_storage_.shared_bins[id] = 0; - } - cub::CTA_SYNC(); -// #define USE_MATCH -#ifdef USE_MATCH - int lane_mask = cub::LaneMaskLt(); -#pragma unroll - for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - bool is_search = search_mask_ & (1U << KEY); - int bucket_idx = -1; - if (is_search) { - UnsignedBits digit_in_radix = - cub::BFE(unsigned_keys[KEY], digit_pos, RADIX_BITS); - bucket_idx = (int)digit_in_radix; - } - int warp_match_mask = __match_any_sync(0xffffffff, bucket_idx); - int same_count = __popc(warp_match_mask); - int idx_in_same_bucket = __popc(warp_match_mask & lane_mask); - int same_bucket_root_lane = __ffs(warp_match_mask) - 1; - int same_bucket_start_idx; - if (idx_in_same_bucket == 0 && is_search) { - same_bucket_start_idx = atomicAdd(&temp_storage_.shared_bins[bucket_idx], same_count); - } - same_bucket_start_idx = - __shfl_sync(0xffffffff, same_bucket_start_idx, same_bucket_root_lane, 32); - if (is_search) { ranks_[KEY] = same_bucket_start_idx + idx_in_same_bucket + prefix_k; } - } -#else -#pragma unroll - for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - bool is_search = search_mask_ & (1U << KEY); - int bucket_idx = -1; - if (is_search) { - UnsignedBits digit_in_radix = - cub::BFE(unsigned_keys[KEY], digit_pos, RADIX_BITS); - bucket_idx = (int)digit_in_radix; - ranks_[KEY] = atomicAdd(&temp_storage_.shared_bins[bucket_idx], 1) + prefix_k; - } - } -#endif - cub::CTA_SYNC(); - } - __device__ __forceinline__ void InclusiveScanBins() - { - int items[SCAN_ITEMS_PER_THREAD]; - BinBlockLoad(temp_storage_.load_store.load_storage) - .Load(temp_storage_.shared_bins, items, RADIX_SIZE, 0); - cub::CTA_SYNC(); - BlockScanT(temp_storage_.scan_storage).InclusiveSum(items, items); - cub::CTA_SYNC(); - BinBlockStore(temp_storage_.load_store.store_storage) - .Store(temp_storage_.shared_bins, items, RADIX_SIZE); - cub::CTA_SYNC(); - } - __device__ __forceinline__ void UpdateTopK(UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], - int digit_pos, - int& target_k, - int& prefix_k, - bool mark_equal) - { - for (int idx = tid_; (idx < 
RADIX_SIZE); idx += BLOCK_SIZE) { - int prev_count = (idx == 0) ? 0 : temp_storage_.shared_bins[idx - 1]; - int cur_count = temp_storage_.shared_bins[idx]; - if (prev_count <= target_k && cur_count > target_k) { - temp_storage_.share_target_k = target_k - prev_count; - temp_storage_.share_bucket_id = idx; - temp_storage_.share_prev_count = prev_count; - } - } - cub::CTA_SYNC(); - target_k = temp_storage_.share_target_k; - prefix_k += temp_storage_.share_prev_count; - int target_bucket_id = temp_storage_.share_bucket_id; -#pragma unroll - for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) { - if (search_mask_ & (1U << KEY)) { - UnsignedBits digit_in_radix = - cub::BFE(unsigned_keys[KEY], digit_pos, RADIX_BITS); - if (digit_in_radix < target_bucket_id) { - top_k_mask_ |= (1U << KEY); - search_mask_ &= ~(1U << KEY); - } else if (digit_in_radix > target_bucket_id) { - search_mask_ &= ~(1U << KEY); - } else { - if (mark_equal) top_k_mask_ |= (1U << KEY); - } - if (digit_in_radix <= target_bucket_id) { - int prev_count = - (digit_in_radix == 0) ? 0 : temp_storage_.shared_bins[digit_in_radix - 1]; - ranks_[KEY] += prev_count; - } - } - } - cub::CTA_SYNC(); - } - - _TempStorage& temp_storage_; - int tid_; - int ranks_[ITEMS_PER_THREAD]; - unsigned int search_mask_; - unsigned int top_k_mask_; -}; - -} // namespace wholegraph_ops diff --git a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh index 22a97fd19..a2915cd00 100644 --- a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh +++ b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh @@ -14,22 +14,26 @@ * limitations under the License. */ #pragma once +#include #include +#include +#include +#include #include #include +#include "raft/matrix/detail/select_warpsort.cuh" +#include "raft/util/cuda_dev_essentials.cuh" +#include "wholememory_ops/output_memory_handle.hpp" +#include "wholememory_ops/raft_random.cuh" +#include "wholememory_ops/temp_memory_handle.hpp" +#include "wholememory_ops/thrust_allocator.hpp" #include #include #include #include #include -#include "wholememory_ops/output_memory_handle.hpp" -#include "wholememory_ops/raft_random.cuh" -#include "wholememory_ops/temp_memory_handle.hpp" -#include "wholememory_ops/thrust_allocator.hpp" - -#include "block_radix_topk.cuh" #include "cuda_macros.hpp" #include "error.hpp" #include "sample_comm.cuh" @@ -54,16 +58,14 @@ __device__ __forceinline__ float gen_key_from_weight(const WeightType weight, PC } template -__launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacement_large_kernel( + unsigned int BLOCK_SIZE> +__launch_bounds__(BLOCK_SIZE) __global__ void generate_weighted_keys_and_idxs_kernel( wholememory_gref_t wm_csr_row_ptr, wholememory_array_description_t wm_csr_row_ptr_desc, wholememory_gref_t wm_csr_col_ptr, @@ -74,18 +76,14 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen const int input_node_count, const int max_sample_count, unsigned long long random_seed, - const int* sample_offset, - wholememory_array_description_t sample_offset_desc, const int* target_neighbor_offset, - WMIdType* output, - int* src_lid, - int64_t* out_edge_gid, - WeightKeyType* weight_keys_buff) + WeightKeyType* output_weighted_keys, + NeighborIdxType* output_idxs, + bool need_random = true) { int input_idx = blockIdx.x; if (input_idx >= input_node_count) return; int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE; - 
wholememory::device_reference csr_row_ptr_gen(wm_csr_row_ptr); wholememory::device_reference csr_col_ptr_gen(wm_csr_col_ptr); wholememory::device_reference csr_weight_ptr_gen(wm_csr_weight_ptr); @@ -93,13 +91,57 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen int64_t start = csr_row_ptr_gen[nid]; int64_t end = csr_row_ptr_gen[nid + 1]; int neighbor_count = (int)(end - start); + if (neighbor_count <= max_sample_count) { need_random = false; } + + PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0); + int output_offset = target_neighbor_offset[input_idx]; + output_weighted_keys += output_offset; + output_idxs += output_offset; + for (int id = threadIdx.x; id < neighbor_count; id += BLOCK_SIZE) { + WeightType thread_weight = csr_weight_ptr_gen[start + id]; + output_weighted_keys[id] = + need_random ? static_cast(gen_key_from_weight(thread_weight, rng)) + : (static_cast(thread_weight)); + output_idxs[id] = static_cast(id); + } +} + +template +__launch_bounds__(BLOCK_SIZE) __global__ + void weighted_sample_select_k_kernel(wholememory_gref_t wm_csr_row_ptr, + wholememory_array_description_t wm_csr_row_ptr_desc, + wholememory_gref_t wm_csr_col_ptr, + wholememory_array_description_t wm_csr_col_ptr_desc, + const IdType* input_nodes, + const int input_node_count, + const int max_sample_count, + const int* sample_offset, + wholememory_array_description_t sample_offset_desc, + const NeighborIdxType* sorted_idxs, + const int* target_neighbor_offset, + WMIdType* output, + LocalIdType* src_lid, + int64_t* out_edge_gid) +{ + int input_idx = blockIdx.x; + if (input_idx >= input_node_count) return; + wholememory::device_reference csr_row_ptr_gen(wm_csr_row_ptr); + wholememory::device_reference csr_col_ptr_gen(wm_csr_col_ptr); + IdType nid = input_nodes[input_idx]; + int64_t start = csr_row_ptr_gen[nid]; + int64_t end = csr_row_ptr_gen[nid + 1]; + int neighbor_count = (int)(end - start); + + int offset = sample_offset[input_idx]; - WeightKeyType* weight_keys_local_buff = weight_keys_buff + target_neighbor_offset[input_idx]; - int offset = sample_offset[input_idx]; if (neighbor_count <= max_sample_count) { for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += BLOCK_SIZE) { - int neighbor_idx = sample_id; - int original_neighbor_idx = neighbor_idx; + int original_neighbor_idx = sample_id; IdType gid = csr_col_ptr_gen[start + original_neighbor_idx]; output[offset + sample_id] = gid; if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx; @@ -108,83 +150,14 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen } return; } - - PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0); - for (int id = threadIdx.x; id < neighbor_count; id += BLOCK_SIZE) { - WeightType thread_weight = csr_weight_ptr_gen[start + id]; - weight_keys_local_buff[id] = - NeedRandom ? 
static_cast(gen_key_from_weight(thread_weight, rng)) - : (static_cast(thread_weight)); - } - - __syncthreads(); - - WeightKeyType topk_val; - bool topk_is_unique; - - using BlockRadixSelectT = - std::conditional_t, - BlockRadixTopKGlobalMemory>; - __shared__ typename BlockRadixSelectT::TempStorage share_storage; - - BlockRadixSelectT{share_storage}.radixTopKGetThreshold( - weight_keys_local_buff, max_sample_count, neighbor_count, topk_val, topk_is_unique); - __shared__ int cnt; - - if (threadIdx.x == 0) { cnt = 0; } - __syncthreads(); - - for (int i = threadIdx.x; i < max_sample_count; i += BLOCK_SIZE) { - if (src_lid) src_lid[offset + i] = (LocalIdType)input_idx; - } - - // We use atomicAdd 1 operations instead of binaryScan to calculate the write - // index, since we do not need to keep the relative positions of element. - - if (topk_is_unique) { - for (int neighbor_idx = threadIdx.x; neighbor_idx < neighbor_count; - neighbor_idx += BLOCK_SIZE) { - WeightKeyType key = weight_keys_local_buff[neighbor_idx]; - bool has_topk = Ascending ? (key <= topk_val) : (key >= topk_val); - - if (has_topk) { - int write_index = atomicAdd(&cnt, 1); - LocalIdType local_original_idx = neighbor_idx; - output[offset + write_index] = csr_col_ptr_gen[start + local_original_idx]; - if (out_edge_gid) - out_edge_gid[offset + write_index] = static_cast(start + local_original_idx); - } - } - } else { - for (int neighbor_idx = threadIdx.x; neighbor_idx < neighbor_count; - neighbor_idx += BLOCK_SIZE) { - WeightKeyType key = weight_keys_local_buff[neighbor_idx]; - bool has_topk = Ascending ? (key < topk_val) : (key > topk_val); - - if (has_topk) { - int write_index = atomicAdd(&cnt, 1); - LocalIdType local_original_idx = neighbor_idx; - output[offset + write_index] = csr_col_ptr_gen[start + local_original_idx]; - if (out_edge_gid) - out_edge_gid[offset + write_index] = static_cast(start + local_original_idx); - } - } - __syncthreads(); - for (int neighbor_idx = threadIdx.x; neighbor_idx < neighbor_count; - neighbor_idx += BLOCK_SIZE) { - WeightKeyType key = weight_keys_local_buff[neighbor_idx]; - bool has_topk = (key == topk_val); - - if (has_topk) { - int write_index = atomicAdd(&cnt, 1); - if (write_index >= max_sample_count) break; - LocalIdType local_original_idx = neighbor_idx; - output[offset + write_index] = csr_col_ptr_gen[start + local_original_idx]; - if (out_edge_gid) - out_edge_gid[offset + write_index] = static_cast(start + local_original_idx); - } - } + int neighbor_offset = target_neighbor_offset[input_idx]; + for (int sample_id = threadIdx.x; sample_id < max_sample_count; sample_id += BLOCK_SIZE) { + int original_neighbor_idx = sorted_idxs[neighbor_offset + sample_id]; + IdType gid = csr_col_ptr_gen[start + original_neighbor_idx]; + output[offset + sample_id] = gid; + if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx; + if (out_edge_gid) + out_edge_gid[offset + sample_id] = static_cast(start + original_neighbor_idx); } } @@ -216,21 +189,30 @@ __global__ void get_sample_count_and_neighbor_count_without_replacement_kernel( } } +// to avoid queue.store() store keys or values in output. +struct null_store_t {}; +struct null_store_op { + template + constexpr auto operator()(const Type& in, UnusedArgs...) const + { + return null_store_t{}; + } +}; + // A-RES algorithmn // https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Res -// max_sample_count should <=(BLOCK_SIZE*ITEMS_PER_THREAD*/4) otherwise,need to -// change the template parameters of BlockRadixTopK. 
-template class WarpSortClass, + int Capacity, + typename IdType, typename LocalIdType, typename WeightType, + typename NeighborIdxType, typename WMIdType, typename WMOffsetType, typename WMWeightType, - unsigned int ITEMS_PER_THREAD, - unsigned int BLOCK_SIZE, - bool NeedRandom = true, - bool Ascending = false> -__launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacement_kernel( + bool NEED_RANDOM = true, + bool ASCENDING = false> +__launch_bounds__(256) __global__ void weighted_sample_without_replacement_raft_kernel( wholememory_gref_t wm_csr_row_ptr, wholememory_array_description_t wm_csr_row_ptr_desc, wholememory_gref_t wm_csr_col_ptr, @@ -244,13 +226,12 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen const int* sample_offset, wholememory_array_description_t sample_offset_desc, WMIdType* output, - int* src_lid, + LocalIdType* src_lid, int64_t* out_edge_gid) { int input_idx = blockIdx.x; if (input_idx >= input_node_count) return; - int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE; - + int gidx = threadIdx.x + blockIdx.x * blockDim.x; wholememory::device_reference csr_row_ptr_gen(wm_csr_row_ptr); wholememory::device_reference csr_col_ptr_gen(wm_csr_col_ptr); wholememory::device_reference csr_weight_ptr_gen(wm_csr_weight_ptr); @@ -258,86 +239,153 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen IdType nid = input_nodes[input_idx]; int64_t start = csr_row_ptr_gen[nid]; int64_t end = csr_row_ptr_gen[nid + 1]; - int neighbor_count = (int)(end - start); + int neighbor_count = static_cast(end - start); int offset = sample_offset[input_idx]; if (neighbor_count <= max_sample_count) { - for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += BLOCK_SIZE) { + for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += blockDim.x) { int neighbor_idx = sample_id; int original_neighbor_idx = neighbor_idx; IdType gid = csr_col_ptr_gen[start + original_neighbor_idx]; output[offset + sample_id] = gid; - if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx; + if (src_lid) src_lid[offset + sample_id] = input_idx; if (out_edge_gid) out_edge_gid[offset + sample_id] = static_cast(start + original_neighbor_idx); } return; } else { - PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0); - - float weight_keys[ITEMS_PER_THREAD]; - int neighbor_idxs[ITEMS_PER_THREAD]; - - using BlockRadixTopKT = - std::conditional_t, - BlockRadixTopKRegister>; - - __shared__ typename BlockRadixTopKT::TempStorage sort_tmp_storage; - - const int tx = threadIdx.x; -#pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int idx = BLOCK_SIZE * i + tx; + extern __shared__ __align__(256) uint8_t smem_buf_bytes[]; + using bq_t = raft::matrix::detail::select::warpsort:: + block_sort; + + uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr; + bq_t queue(max_sample_count, warp_smem); + PCGenerator rng(random_seed, static_cast(gidx), static_cast(0)); + const int per_thread_lim = neighbor_count + raft::laneId(); + for (int idx = threadIdx.x; idx < per_thread_lim; idx += blockDim.x) { + WeightType weight_key = + WarpSortClass::kDummy; if (idx < neighbor_count) { WeightType thread_weight = csr_weight_ptr_gen[start + idx]; - weight_keys[i] = - NeedRandom ? gen_key_from_weight(thread_weight, rng) : (float)thread_weight; - neighbor_idxs[i] = idx; + weight_key = NEED_RANDOM ? 
gen_key_from_weight(thread_weight, rng) : thread_weight; } + queue.add(weight_key, idx); } - const int valid_count = (neighbor_count < (BLOCK_SIZE * ITEMS_PER_THREAD)) - ? neighbor_count - : (BLOCK_SIZE * ITEMS_PER_THREAD); - BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped( - weight_keys, neighbor_idxs, max_sample_count, valid_count); + queue.done(smem_buf_bytes); + __syncthreads(); - const int stride = BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count; - - for (int idx_offset = ITEMS_PER_THREAD * BLOCK_SIZE; idx_offset < neighbor_count; - idx_offset += stride) { -#pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int local_idx = BLOCK_SIZE * i + tx - max_sample_count; - // [0,BLOCK_SIZE*ITEMS_PER_THREAD-max_sample_count) - int target_idx = idx_offset + local_idx; - if (local_idx >= 0 && target_idx < neighbor_count) { - WeightType thread_weight = csr_weight_ptr_gen[start + target_idx]; - weight_keys[i] = - NeedRandom ? gen_key_from_weight(thread_weight, rng) : (float)thread_weight; - neighbor_idxs[i] = target_idx; - } + NeighborIdxType* smem_topk_idx = reinterpret_cast(smem_buf_bytes); + queue.store(static_cast(nullptr), smem_topk_idx, null_store_op{}); + __syncthreads(); + for (int idx = threadIdx.x; idx < max_sample_count; idx += blockDim.x) { + NeighborIdxType local_original_idx = static_cast(smem_topk_idx[idx]); + if (src_lid) { src_lid[offset + idx] = static_cast(input_idx); } + output[offset + idx] = csr_col_ptr_gen[start + local_original_idx]; + if (out_edge_gid) { + out_edge_gid[offset + idx] = static_cast(start + local_original_idx); } - const int iter_valid_count = ((neighbor_count - idx_offset) >= stride) - ? (BLOCK_SIZE * ITEMS_PER_THREAD) - : (max_sample_count + neighbor_count - idx_offset); - BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped( - weight_keys, neighbor_idxs, max_sample_count, iter_valid_count); - __syncthreads(); } -#pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; i++) { - int idx = i * BLOCK_SIZE + tx; - if (idx < max_sample_count) { - if (src_lid) src_lid[offset + idx] = (LocalIdType)input_idx; - LocalIdType local_original_idx = neighbor_idxs[i]; - output[offset + idx] = csr_col_ptr_gen[start + local_original_idx]; - if (out_edge_gid) - out_edge_gid[offset + idx] = static_cast(start + local_original_idx); - } + }; +} + +template