diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 160780bf4d..53bb840309 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -589,10 +589,3 @@ endif()
 if(BUILD_PRIMS_BENCH)
   add_subdirectory(bench/prims/)
 endif()
-
-# ##################################################################################################
-# * build ann benchmark executable -----------------------------------------------
-
-if(BUILD_ANN_BENCH)
-  add_subdirectory(bench/ann/)
-endif()
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 52c63ad73b..cf03a36612 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -74,49 +74,9 @@ function(ConfigureBench)
 endfunction()
 
 if(BUILD_PRIMS_BENCH)
-  ConfigureBench(
-    NAME
-    CORE_BENCH
-    PATH
-    core/bitset.cu
-    core/copy.cu
-    main.cpp
-  )
+  ConfigureBench(NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp)
 
-  ConfigureBench(
-    NAME
-    UTIL_BENCH
-    PATH
-    util/popc.cu
-    main.cpp
-  )
-
-  ConfigureBench(
-    NAME CLUSTER_BENCH PATH cluster/kmeans_balanced.cu cluster/kmeans.cu
-    main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
-  )
-
-  ConfigureBench(
-    NAME TUNE_DISTANCE PATH distance/tune_pairwise/kernel.cu
-    distance/tune_pairwise/bench.cu main.cpp
-  )
-
-  ConfigureBench(
-    NAME
-    DISTANCE_BENCH
-    PATH
-    distance/distance_cosine.cu
-    distance/distance_exp_l2.cu
-    distance/distance_l1.cu
-    distance/distance_unexp_l2.cu
-    distance/fused_l2_nn.cu
-    distance/masked_nn.cu
-    distance/kernels.cu
-    main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
-  )
+  ConfigureBench(NAME UTIL_BENCH PATH util/popc.cu main.cpp)
 
   ConfigureBench(
     NAME
@@ -137,54 +97,18 @@ if(BUILD_PRIMS_BENCH)
   )
 
   ConfigureBench(
-    NAME MATRIX_BENCH PATH matrix/argmin.cu matrix/gather.cu
-    matrix/select_k.cu main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME MATRIX_BENCH PATH matrix/argmin.cu matrix/gather.cu matrix/select_k.cu main.cpp OPTIONAL
+    LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureBench(
-    NAME RANDOM_BENCH PATH random/make_blobs.cu random/permute.cu
-   random/rng.cu random/subsample.cu main.cpp
-  )
-
-  ConfigureBench(
-    NAME
-    SPARSE_BENCH
-    PATH
-    sparse/bitmap_to_csr.cu
-    sparse/convert_csr.cu
-    sparse/select_k_csr.cu
+    NAME RANDOM_BENCH PATH random/make_blobs.cu random/permute.cu random/rng.cu random/subsample.cu
     main.cpp
   )
 
   ConfigureBench(
-    NAME
-    NEIGHBORS_BENCH
-    PATH
-    neighbors/knn/brute_force_float_int64_t.cu
-    neighbors/knn/brute_force_float_uint32_t.cu
-    neighbors/knn/cagra_float_uint32_t.cu
-    neighbors/knn/ivf_flat_filter_float_int64_t.cu
-    neighbors/knn/ivf_flat_float_int64_t.cu
-    neighbors/knn/ivf_flat_int8_t_int64_t.cu
-    neighbors/knn/ivf_flat_uint8_t_int64_t.cu
-    neighbors/knn/ivf_pq_float_int64_t.cu
-    neighbors/knn/ivf_pq_filter_float_int64_t.cu
-    neighbors/knn/ivf_pq_int8_t_int64_t.cu
-    neighbors/knn/ivf_pq_uint8_t_int64_t.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
-    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
-    neighbors/refine_float_int64_t.cu
-    neighbors/refine_uint8_t_int64_t.cu
+    NAME SPARSE_BENCH PATH sparse/bitmap_to_csr.cu sparse/convert_csr.cu sparse/select_k_csr.cu
     main.cpp
-    OPTIONAL
-    LIB
-    EXPLICIT_INSTANTIATE_ONLY
   )
 
 endif()
diff --git a/cpp/bench/prims/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu
deleted file mode 100644
index 6387211135..0000000000
--- a/cpp/bench/prims/cluster/kmeans.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/benchmark.hpp>
-
-#include <raft/cluster/kmeans.cuh>
-#include <raft/cluster/kmeans_types.hpp>
-
-namespace raft::bench::cluster {
-
-struct KMeansBenchParams {
-  DatasetParams data;
-  BlobsParams blobs;
-  raft::cluster::KMeansParams kmeans;
-};
-
-inline auto operator<<(std::ostream& os, const KMeansBenchParams& p) -> std::ostream&
-{
-  os << p.data.rows << "#" << p.data.cols << "#" << p.kmeans.n_clusters;
-  return os;
-}
-
-template <typename T, typename IndexT = int>
-struct KMeans : public BlobsFixture<T, IndexT> {
-  KMeans(const KMeansBenchParams& p)
-    : BlobsFixture<T, IndexT>(p.data, p.blobs),
-      params(p),
-      centroids(this->handle),
-      labels(this->handle)
-  {
-  }
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    std::ostringstream label_stream;
-    label_stream << params;
-    state.SetLabel(label_stream.str());
-
-    raft::device_matrix_view<const T, IndexT> X_view                          = this->X.view();
-    std::optional<raft::device_vector_view<const T, IndexT>> opt_weights_view = std::nullopt;
-    std::optional<raft::device_matrix_view<T, IndexT>> centroids_view =
-      std::make_optional<raft::device_matrix_view<T, IndexT>>(centroids.view());
-    raft::device_vector_view<IndexT, IndexT> labels_view = labels.view();
-    raft::host_scalar_view<T> inertia_view               = raft::make_host_scalar_view<T>(&inertia);
-    raft::host_scalar_view<IndexT> n_iter_view = raft::make_host_scalar_view<IndexT>(&n_iter);
-
-    this->loop_on_state(state, [&]() {
-      raft::cluster::kmeans_fit_predict<T, IndexT>(this->handle,
-                                                   params.kmeans,
-                                                   X_view,
-                                                   opt_weights_view,
-                                                   centroids_view,
-                                                   labels_view,
-                                                   inertia_view,
-                                                   n_iter_view);
-    });
-  }
-
-  void allocate_temp_buffers(const ::benchmark::State& state) override
-  {
-    centroids =
-      raft::make_device_matrix<T, IndexT>(this->handle, params.kmeans.n_clusters, params.data.cols);
-    labels = raft::make_device_vector<IndexT, IndexT>(this->handle, params.data.rows);
-  }
-
- private:
-  KMeansBenchParams params;
-  raft::device_matrix<T, IndexT> centroids;
-  raft::device_vector<IndexT, IndexT> labels;
-  T inertia;
-  IndexT n_iter;
-};  // struct KMeans
-
-std::vector<KMeansBenchParams> getKMeansInputs()
-{
-  std::vector<KMeansBenchParams> out;
-  KMeansBenchParams p;
-  p.data.row_major                                  = true;
-  p.blobs.cluster_std                               = 1.0;
-  p.blobs.shuffle                                   = false;
-  p.blobs.center_box_min                            = -10.0;
-  p.blobs.center_box_max                            = 10.0;
-  p.blobs.seed                                      = 12345ULL;
-  p.kmeans.init                                     = raft::cluster::KMeansParams::KMeansPlusPlus;
-  p.kmeans.max_iter                                 = 300;
-  p.kmeans.tol                                      = 1e-4;
-  p.kmeans.verbosity                                = RAFT_LEVEL_INFO;
-  p.kmeans.metric                                   = raft::distance::DistanceType::L2Expanded;
-  p.kmeans.inertia_check                            = true;
-  std::vector<std::tuple<int, int, int>> row_cols_k = {
-    {1000000, 20, 1000},
-    {3000000, 50, 20},
-    {10000000, 50, 5},
-  };
-  for (auto& rck : row_cols_k) {
-    p.data.rows         = std::get<0>(rck);
-    p.data.cols         = std::get<1>(rck);
-    p.blobs.n_clusters  = std::get<2>(rck);
-    p.kmeans.n_clusters = std::get<2>(rck);
-    out.push_back(p);
-  }
-  return out;
-}
-
-// note(lsugy): commenting out int64_t because the templates are not compiled in the distance
-// library, resulting in long compilation times.
-RAFT_BENCH_REGISTER((KMeans<float, int>), "", getKMeansInputs());
-RAFT_BENCH_REGISTER((KMeans<double, int>), "", getKMeansInputs());
-// RAFT_BENCH_REGISTER((KMeans<float, int64_t>), "", getKMeansInputs());
-// RAFT_BENCH_REGISTER((KMeans<double, int64_t>), "", getKMeansInputs());
-
-}  // namespace raft::bench::cluster
diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu
deleted file mode 100644
index dc05783989..0000000000
--- a/cpp/bench/prims/cluster/kmeans_balanced.cu
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/benchmark.hpp>
-
-#include <raft/cluster/kmeans_balanced.cuh>
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/random/rng.cuh>
-
-namespace raft::bench::cluster {
-
-struct KMeansBalancedBenchParams {
-  DatasetParams data;
-  uint32_t n_lists;
-  raft::cluster::kmeans_balanced_params kb_params;
-};
-
-template <typename T, typename IndexT = int>
-struct KMeansBalanced : public fixture {
-  KMeansBalanced(const KMeansBalancedBenchParams& p) : params(p), X(handle), centroids(handle) {}
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    this->loop_on_state(state, [this]() {
-      raft::device_matrix_view<const T, IndexT> X_view   = this->X.view();
-      raft::device_matrix_view<T, IndexT> centroids_view = this->centroids.view();
-      raft::cluster::kmeans_balanced::fit(
-        this->handle, this->params.kb_params, X_view, centroids_view);
-    });
-  }
-
-  void allocate_data(const ::benchmark::State& state) override
-  {
-    X = raft::make_device_matrix<T, IndexT>(handle, params.data.rows, params.data.cols);
-
-    raft::random::RngState rng{1234};
-    constexpr T kRangeMax = std::is_integral_v<T> ? std::numeric_limits<T>::max() : T(1);
-    constexpr T kRangeMin = std::is_integral_v<T> ? std::numeric_limits<T>::min() : T(-1);
-    if constexpr (std::is_integral_v<T>) {
-      raft::random::uniformInt(
-        handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax);
-    } else {
-      raft::random::uniform(
-        handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax);
-    }
-    resource::sync_stream(handle, stream);
-  }
-
-  void allocate_temp_buffers(const ::benchmark::State& state) override
-  {
-    centroids =
-      raft::make_device_matrix<float, IndexT>(this->handle, params.n_lists, params.data.cols);
-  }
-
- private:
-  KMeansBalancedBenchParams params;
-  raft::device_matrix<T, IndexT> X;
-  raft::device_matrix<float, IndexT> centroids;
-};  // struct KMeansBalanced
-
-std::vector<KMeansBalancedBenchParams> getKMeansBalancedInputs()
-{
-  std::vector<KMeansBalancedBenchParams> out;
-  KMeansBalancedBenchParams p;
-  p.data.row_major                          = true;
-  p.kb_params.n_iters                       = 20;
-  p.kb_params.metric                        = raft::distance::DistanceType::L2Expanded;
-  std::vector<std::pair<int, int>> row_cols = {
-    {100000, 128}, {1000000, 128}, {10000000, 128},
-    // The following dataset sizes are too large for most GPUs.
-    // {100000000, 128},
-  };
-  for (auto& rc : row_cols) {
-    p.data.rows = rc.first;
-    p.data.cols = rc.second;
-    for (auto n_lists : std::vector<int>({1000, 10000, 100000})) {
-      p.n_lists = n_lists;
-      out.push_back(p);
-    }
-  }
-  return out;
-}
-
-// Note: the datasets sizes are too large for 32-bit index types.
-RAFT_BENCH_REGISTER((KMeansBalanced<float, int64_t>), "", getKMeansBalancedInputs());
-
-}  // namespace raft::bench::cluster
diff --git a/cpp/bench/prims/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh
deleted file mode 100644
index 8368062168..0000000000
--- a/cpp/bench/prims/distance/distance_common.cuh
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/benchmark.hpp>
-
-#include <raft/distance/distance.cuh>
-#include <raft/util/cudart_utils.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-namespace raft::bench::distance {
-
-struct distance_params {
-  int m, n, k;
-  bool isRowMajor;
-};  // struct distance_params
-
-template <typename T, raft::distance::DistanceType DType>
-struct distance : public fixture {
-  distance(const distance_params& p)
-    : params(p),
-      x(p.m * p.k, stream),
-      y(p.n * p.k, stream),
-      out(p.m * p.n, stream),
-      workspace(0, stream)
-  {
-    RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream));
-    worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(
-      x.data(), y.data(), params.m, params.n, params.k);
-    workspace.resize(worksize, stream);
-  }
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    loop_on_state(state, [this]() {
-      raft::distance::distance<DType, T, T, T>(handle,
-                                               x.data(),
-                                               y.data(),
-                                               out.data(),
-                                               params.m,
-                                               params.n,
-                                               params.k,
-                                               (void*)workspace.data(),
-                                               worksize,
-                                               params.isRowMajor);
-    });
-  }
-
- private:
-  distance_params params;
-  rmm::device_uvector<T> x, y, out;
-  rmm::device_uvector<char> workspace;
-  size_t worksize;
-};  // struct Distance
-
-const std::vector<distance_params> dist_input_vecs{
-  {32, 16384, 16384, true},    {64, 16384, 16384, true},    {128, 16384, 16384, true},
-  {256, 16384, 16384, true},   {512, 16384, 16384, true},   {1024, 16384, 16384, true},
-  {16384, 32, 16384, true},    {16384, 64, 16384, true},    {16384, 128, 16384, true},
-  {16384, 256, 16384, true},   {16384, 512, 16384, true},   {16384, 1024, 16384, true},
-  {16384, 16384, 32, true},    {16384, 16384, 64, true},    {16384, 16384, 128, true},
-  {16384, 16384, 256, true},   {16384, 16384, 512, true},   {16384, 16384, 1024, true},
-  {16384, 16384, 16384, true}, {32, 16384, 16384, false},   {64, 16384, 16384, false},
-  {128, 16384, 16384, false},  {256, 16384, 16384, false},  {512, 16384, 16384, false},
-  {1024, 16384, 16384, false}, {16384, 32, 16384, false},   {16384, 64, 16384, false},
-  {16384, 128, 16384, false},  {16384, 256, 16384, false},  {16384, 512, 16384, false},
-  {16384, 1024, 16384, false}, {16384, 16384, 32, false},   {16384, 16384, 64, false},
-  {16384, 16384, 128, false},  {16384, 16384, 256, false},  {16384, 16384, 512, false},
-  {16384, 16384, 1024, false}, {16384, 16384, 16384, false}
-
-};
-
-#define DIST_BENCH_REGISTER(Name, Metric)            \
-  using Name##F = distance<float, Metric>;           \
-  RAFT_BENCH_REGISTER(Name##F, "", dist_input_vecs); \
-  using Name##D = distance<double, Metric>;          \
-  RAFT_BENCH_REGISTER(Name##D, "", dist_input_vecs);
-
-}  // namespace raft::bench::distance
diff --git a/cpp/bench/prims/distance/distance_cosine.cu b/cpp/bench/prims/distance/distance_cosine.cu
deleted file mode 100644
index c8ac8067c8..0000000000
--- a/cpp/bench/prims/distance/distance_cosine.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace raft::bench::distance {
-
-DIST_BENCH_REGISTER(DistanceCosine, raft::distance::DistanceType::CosineExpanded);
-
-}  // namespace raft::bench::distance
diff --git a/cpp/bench/prims/distance/distance_exp_l2.cu b/cpp/bench/prims/distance/distance_exp_l2.cu
deleted file mode 100644
index 52b7fff05c..0000000000
--- a/cpp/bench/prims/distance/distance_exp_l2.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace raft::bench::distance {
-
-DIST_BENCH_REGISTER(DistanceL2Sq, raft::distance::DistanceType::L2Expanded);
-DIST_BENCH_REGISTER(DistanceL2Sqrt, raft::distance::DistanceType::L2SqrtExpanded);
-
-}  // namespace raft::bench::distance
diff --git a/cpp/bench/prims/distance/distance_l1.cu b/cpp/bench/prims/distance/distance_l1.cu
deleted file mode 100644
index e80db48ef0..0000000000
--- a/cpp/bench/prims/distance/distance_l1.cu
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace raft::bench::distance {
-
-DIST_BENCH_REGISTER(DistanceL1, raft::distance::DistanceType::L1);
-
-}  // namespace raft::bench::distance
diff --git a/cpp/bench/prims/distance/distance_unexp_l2.cu b/cpp/bench/prims/distance/distance_unexp_l2.cu
deleted file mode 100644
index 7ac1a8a4b5..0000000000
--- a/cpp/bench/prims/distance/distance_unexp_l2.cu
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace raft::bench::distance {
-
-DIST_BENCH_REGISTER(DistanceUnexpL2Sq, raft::distance::DistanceType::L2Unexpanded);
-DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt, raft::distance::DistanceType::L2SqrtUnexpanded);
-
-}  // namespace raft::bench::distance
diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu
deleted file mode 100644
index a263bef6ba..0000000000
--- a/cpp/bench/prims/distance/fused_l2_nn.cu
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/benchmark.hpp>
-
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/distance/fused_l2_nn.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/util/cudart_utils.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-namespace raft::bench::distance {
-
-struct fusedl2nn_inputs {
-  int64_t m, n, k;
-};  // struct fusedl2nn_inputs
-
-inline auto operator<<(std::ostream& os, const fusedl2nn_inputs& p) -> std::ostream&
-{
-  os << p.m << "#" << p.n << "#" << p.k;
-  return os;
-}
-
-template <typename DataT, typename IdxT, typename OutT>
-struct fusedl2nn : public fixture {
-  fusedl2nn(const fusedl2nn_inputs& p)
-    : params(p),
-      workspace(this->handle),
-      x(this->handle),
-      y(this->handle),
-      x_norm(this->handle),
-      y_norm(this->handle),
-      out(this->handle)
-  {
-  }
-
-  void allocate_data(const ::benchmark::State& state) override
-  {
-    x      = raft::make_device_matrix<DataT, IdxT>(handle, params.m, params.k);
-    y      = raft::make_device_matrix<DataT, IdxT>(handle, params.n, params.k);
-    x_norm = raft::make_device_vector<DataT, IdxT>(handle, params.m);
-    y_norm = raft::make_device_vector<DataT, IdxT>(handle, params.n);
-    out    = raft::make_device_vector<OutT, IdxT>(handle, params.m);
-
-    raft::random::RngState rng{1234};
-    raft::random::uniform(
-      handle, rng, x.data_handle(), params.m * params.k, (DataT)-1.0, (DataT)1.0);
-    raft::random::uniform(
-      handle, rng, y.data_handle(), params.n * params.k, (DataT)-1.0, (DataT)1.0);
-
-    // Pre-compute norms
-    raft::linalg::rowNorm(x_norm.data_handle(),
-                          x.data_handle(),
-                          params.k,
-                          params.m,
-                          raft::linalg::L2Norm,
-                          true,
-                          stream);
-    raft::linalg::rowNorm(y_norm.data_handle(),
-                          y.data_handle(),
-                          params.k,
-                          params.n,
-                          raft::linalg::L2Norm,
-                          true,
-                          stream);
-    resource::sync_stream(handle, stream);
-  }
-
-  void allocate_temp_buffers(const ::benchmark::State& state) override
-  {
-    workspace = raft::make_device_vector<char, IdxT>(handle, params.m * sizeof(IdxT));
-  }
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    std::ostringstream label_stream;
-    label_stream << params;
-    state.SetLabel(label_stream.str());
-
-    loop_on_state(state, [this]() {
-      raft::distance::fusedL2NNMinReduce<DataT, OutT, IdxT>(out.data_handle(),
-                                                            x.data_handle(),
-                                                            y.data_handle(),
-                                                            x_norm.data_handle(),
-                                                            y_norm.data_handle(),
-                                                            static_cast<IdxT>(params.m),
-                                                            static_cast<IdxT>(params.n),
-                                                            static_cast<IdxT>(params.k),
-                                                            (void*)workspace.data_handle(),
-                                                            false,
-                                                            true,
-                                                            stream);
-    });
-
-    int64_t num_flops = 2 * params.m * params.n * params.k;
-
-    int64_t read_elts  = params.n * params.k + params.m * params.k;
-    int64_t write_elts = params.m;
-
-    state.counters["FLOP/s"] = benchmark::Counter(
-      num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000);
-
-    state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT),
-                                                 benchmark::Counter::kIsIterationInvariantRate,
-                                                 benchmark::Counter::OneK::kIs1000);
-    state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT),
-                                                 benchmark::Counter::kIsIterationInvariantRate,
-                                                 benchmark::Counter::OneK::kIs1000);
-  }
-
- private:
-  fusedl2nn_inputs params;
-  raft::device_matrix<DataT, IdxT> x, y;
-  raft::device_vector<DataT, IdxT> x_norm, y_norm;
-  raft::device_vector<OutT, IdxT> out;
-  raft::device_vector<char, IdxT> workspace;
-};  // struct fusedl2nn
-
-template <typename IdxT>
-std::vector<fusedl2nn_inputs> getFusedL2NNInputs()
-{
-  std::vector<fusedl2nn_inputs> inputs;
-  std::vector<int64_t> m_list = {100000, 1000000};
-  if constexpr (sizeof(IdxT) == 8) { m_list.push_back(10000000); }
-  std::vector<int64_t> n_list = {100, 1000, 10000};
-  std::vector<int64_t> k_list = {64, 128, 256};
-  for (auto m : m_list) {
-    for (auto n : n_list) {
-      for (auto k : k_list) {
-        inputs.push_back({m, n, k});
-      }
-    }
-  }
-  return inputs;
-}
-
-#define FUSEDL2NN_BENCH(DataT, IdxT, OutT) \
-  RAFT_BENCH_REGISTER((fusedl2nn<DataT, IdxT, RAFT_DEPAREN(OutT)>), "", getFusedL2NNInputs<IdxT>())
-
-FUSEDL2NN_BENCH(float, int, float);
-FUSEDL2NN_BENCH(double, int, double);
-FUSEDL2NN_BENCH(float, int, (raft::KeyValuePair<int, float>));
-FUSEDL2NN_BENCH(double, int, (raft::KeyValuePair<int, double>));
-FUSEDL2NN_BENCH(float, int64_t, float);
-FUSEDL2NN_BENCH(double, int64_t, double);
-FUSEDL2NN_BENCH(float, int64_t, (raft::KeyValuePair<int64_t, float>));
-FUSEDL2NN_BENCH(double, int64_t, (raft::KeyValuePair<int64_t, double>));
-
-}  // namespace raft::bench::distance
diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu
deleted file mode 100644
index eb86330637..0000000000
--- a/cpp/bench/prims/distance/kernels.cu
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <common/benchmark.hpp>
-
-#include <raft/core/device_resources.hpp>
-#include <raft/core/resource/cublas_handle.hpp>
-#include <raft/distance/distance_types.hpp>
-#include <raft/distance/kernels.cuh>
-#include <raft/random/rng.cuh>
-
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace raft::bench::distance::kernels {
-
-using namespace raft::distance::kernels;
-struct GramTestParams {
-  int m;  // m parameter of the GEMM
-  int k;  // k parameter of the GEMM
-  int n;  // n parameter of the GEMM
-  KernelParams kernel_params;
-  bool is_row_major;
-};  // struct GramTestParams
-
-template <typename T>
-struct GramMatrix : public fixture {
-  GramMatrix(const GramTestParams& p)
-    : params(p), handle(stream), A(0, stream), B(0, stream), C(0, stream)
-  {
-    kernel = std::unique_ptr<GramMatrixBase<T>>(
-      KernelFactory<T>::create(p.kernel_params, resource::get_cublas_handle(handle)));
-
-    A.resize(params.m * params.k, stream);
-    B.resize(params.k * params.n, stream);
-    C.resize(params.m * params.n, stream);
-    raft::random::RngState rng(123456ULL);
-    raft::random::uniform(handle, rng, A.data(), params.m * params.k, T(-1.0), T(1.0));
-    raft::random::uniform(handle, rng, B.data(), params.k * params.n, T(-1.0), T(1.0));
-  }
-
-  ~GramMatrix()
-  {
-    A.release();
-    B.release();
-    C.release();
-  }
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); }
-    loop_on_state(state, [this]() {
-      (*this->kernel)(A.data(),
-                      this->params.m,
-                      this->params.k,
-                      B.data(),
-                      this->params.n,
-                      C.data(),
-                      this->params.is_row_major,
-                      this->stream);
-    });
-  }
-
- private:
-  const raft::device_resources handle;
-  std::unique_ptr<GramMatrixBase<T>> kernel;
-  GramTestParams params;
-
-  rmm::device_uvector<T> A;  // input matrix A, size [m * k]
-  rmm::device_uvector<T> B;  // input matrix B, size [n * k]
-  rmm::device_uvector<T> C;  // output matrix C, size [m*n]
-};
-
-static std::vector<GramTestParams> getInputs()
-{
-  std::vector<GramTestParams> param_vec;
-  std::vector<KernelParams> kernel_params{KernelParams{LINEAR, 3, 1, 0},
-                                          KernelParams{POLYNOMIAL, 2, 1.3, 1},
-                                          KernelParams{TANH, 2, 0.5, 2.4},
-                                          KernelParams{RBF, 2, 0.5, 0}};
-  struct TestSize {
-    int m;
-    int k;
-    int n;
-  };
-  std::vector<TestSize> data_size{{4096, 10, 1024},
-                                  {4096, 100, 1024},
-                                  {4096, 1000, 1024},
-                                  {4096, 10000, 1024},
-                                  {100000, 10, 1024},
-                                  {100000, 100, 1024},
-                                  {100000, 1000, 1024}};
-
-  param_vec.reserve(kernel_params.size() * data_size.size());
-  for (TestSize s : data_size) {
-    for (auto kernel : kernel_params) {
-      for (bool row_major : {false, true}) {
-        param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major});
-      }
-    }
-  }
-  return param_vec;
-}
-
-RAFT_BENCH_REGISTER(GramMatrix<float>, "", getInputs());
-RAFT_BENCH_REGISTER(GramMatrix<double>, "", getInputs());
-
-}  // namespace raft::bench::distance::kernels
diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu
deleted file mode 100644
index 979d438b67..0000000000
--- a/cpp/bench/prims/distance/masked_nn.cu
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/benchmark.hpp>
-
-#include <raft/core/device_mdarray.hpp>
-#include <raft/core/device_mdspan.hpp>
-#include <raft/core/handle.hpp>
-#include <raft/distance/masked_nn.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/random/rng.cuh>
-#include <raft/util/cudart_utils.hpp>
-
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <sstream>
-#include <string>
-
-namespace raft::bench::distance::masked_nn {
-
-// Introduce various sparsity patterns
-enum AdjacencyPattern {
-  checkerboard    = 0,
-  checkerboard_4  = 1,
-  checkerboard_64 = 2,
-  all_true        = 3,
-  all_false       = 4
-};
-
-struct Params {
-  int m, n, k, num_groups;
-  AdjacencyPattern pattern;
-};  // struct Params
-
-RAFT_KERNEL init_adj(AdjacencyPattern pattern,
-                     int n,
-                     raft::device_matrix_view<bool, int, raft::layout_c_contiguous> adj,
-                     raft::device_vector_view<int, int, raft::layout_c_contiguous> group_idxs)
-{
-  int m          = adj.extent(0);
-  int num_groups = adj.extent(1);
-
-  for (int idx_m = blockIdx.y * blockDim.y + threadIdx.y; idx_m < m;
-       idx_m += blockDim.y * gridDim.y) {
-    for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups;
-         idx_g += blockDim.x * gridDim.x) {
-      switch (pattern) {
-        case checkerboard: adj(idx_m, idx_g) = (idx_m + idx_g) % 2; break;
-        case checkerboard_4: adj(idx_m, idx_g) = (idx_m / 4 + idx_g) % 2; break;
-        case checkerboard_64: adj(idx_m, idx_g) = (idx_m / 64 + idx_g) % 2; break;
-        case all_true: adj(idx_m, idx_g) = true; break;
-        case all_false: adj(idx_m, idx_g) = false; break;
-        default: assert(false && "unknown pattern");
-      }
-    }
-  }
-  // Each group is of size n / num_groups.
-  //
-  // - group_idxs[j] indicates the start of group j + 1 (i.e. is the inclusive
-  // scan of the group lengths)
-  //
-  // - The first group always starts at index zero, so we do not store it.
-  //
-  // - The group_idxs[num_groups - 1] should always equal n.
-
-  if (blockIdx.y == 0 && threadIdx.y == 0) {
-    const int g_stride = blockDim.x * gridDim.x;
-    for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; idx_g += g_stride) {
-      group_idxs(idx_g) = (idx_g + 1) * (n / num_groups);
-    }
-    group_idxs(num_groups - 1) = n;
-  }
-}
-
-template <typename T>
-struct masked_l2_nn : public fixture {
-  using DataT      = T;
-  using IdxT       = int;
-  using OutT       = raft::KeyValuePair<IdxT, DataT>;
-  using RedOpT     = raft::distance::MinAndDistanceReduceOp<int, DataT>;
-  using PairRedOpT = raft::distance::KVPMinReduce<int, DataT>;
-  using ParamT     = raft::distance::masked_l2_nn_params<RedOpT, PairRedOpT>;
-
-  // Parameters
-  Params params;
-  // Data
-  raft::device_vector<OutT, IdxT> out;
-  raft::device_matrix<T, IdxT> x, y;
-  raft::device_vector<DataT, IdxT> xn, yn;
-  raft::device_matrix<bool, IdxT> adj;
-  raft::device_vector<IdxT, IdxT> group_idxs;
-
-  masked_l2_nn(const Params& p)
-    : params(p),
-      out{raft::make_device_vector<OutT, IdxT>(handle, p.m)},
-      x{raft::make_device_matrix<DataT, IdxT>(handle, p.m, p.k)},
-      y{raft::make_device_matrix<DataT, IdxT>(handle, p.n, p.k)},
-      xn{raft::make_device_vector<DataT, IdxT>(handle, p.m)},
-      yn{raft::make_device_vector<DataT, IdxT>(handle, p.n)},
-      adj{raft::make_device_matrix<bool, IdxT>(handle, p.m, p.num_groups)},
-      group_idxs{raft::make_device_vector<IdxT, IdxT>(handle, p.num_groups)}
-  {
-    raft::random::RngState r(123456ULL);
-
-    uniform(handle, r, x.data_handle(), p.m * p.k, T(-1.0), T(1.0));
-    uniform(handle, r, y.data_handle(), p.n * p.k, T(-1.0), T(1.0));
-    raft::linalg::rowNorm(
-      xn.data_handle(), x.data_handle(), p.k, p.m, raft::linalg::L2Norm, true, stream);
-    raft::linalg::rowNorm(
-      yn.data_handle(), y.data_handle(), p.k, p.n, raft::linalg::L2Norm, true, stream);
-    raft::distance::initialize<T, raft::KeyValuePair<int, T>, int>(
-      handle, out.data_handle(), p.m, std::numeric_limits<T>::max(), RedOpT{});
-
-    dim3 block(32, 32);
-    dim3 grid(10, 10);
-    init_adj<<<grid, block, 0, stream>>>(p.pattern, p.n, adj.view(), group_idxs.view());
-    RAFT_CUDA_TRY(cudaGetLastError());
-  }
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    bool init_out = true;
-    bool sqrt     = false;
-    ParamT masked_l2_params{RedOpT{}, PairRedOpT{}, sqrt, init_out};
-
-    loop_on_state(state, [this, masked_l2_params]() {
-      // It is sufficient to only benchmark the L2-squared metric
-      raft::distance::masked_l2_nn<DataT, OutT, IdxT>(handle,
-                                                      masked_l2_params,
-                                                      x.view(),
-                                                      y.view(),
-                                                      xn.view(),
-                                                      yn.view(),
-                                                      adj.view(),
-                                                      group_idxs.view(),
-                                                      out.view());
-    });
-
-    // Virtual flop count if no skipping had occurred.
-    size_t virtual_flops = size_t(2) * size_t(params.m) * size_t(params.n) * size_t(params.k);
-
-    int64_t read_elts  = params.n * params.k + params.m * params.k;
-    int64_t write_elts = params.m;
-
-    // Virtual min flops is the number of flops that would have been executed if
-    // the algorithm had actually skipped each computation that it could have
-    // skipped.
-    size_t virtual_min_flops = 0;
-    switch (params.pattern) {
-      case checkerboard:
-      case checkerboard_4:
-      case checkerboard_64: virtual_min_flops = virtual_flops / 2; break;
-      case all_true: virtual_min_flops = virtual_flops; break;
-      case all_false: virtual_min_flops = 0; break;
-      default: assert(false && "unknown pattern");
-    }
-
-    // VFLOP/s is the "virtual" flop count that would have executed if there was
-    // no adjacency pattern. This is useful for comparing to fusedL2NN
-    state.counters["VFLOP/s"] = benchmark::Counter(virtual_flops,
-                                                   benchmark::Counter::kIsIterationInvariantRate,
-                                                   benchmark::Counter::OneK::kIs1000);
-    // Virtual min flops is the number of flops that would have been executed if
-    // the algorithm had actually skipped each computation that it could have
-    // skipped.
-    state.counters["VminFLOP/s"] = benchmark::Counter(virtual_min_flops,
-                                                      benchmark::Counter::kIsIterationInvariantRate,
-                                                      benchmark::Counter::OneK::kIs1000);
-
-    state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT),
-                                                 benchmark::Counter::kIsIterationInvariantRate,
-                                                 benchmark::Counter::OneK::kIs1000);
-    state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT),
-                                                 benchmark::Counter::kIsIterationInvariantRate,
-                                                 benchmark::Counter::OneK::kIs1000);
-
-    state.counters["m"]          = benchmark::Counter(params.m);
-    state.counters["n"]          = benchmark::Counter(params.n);
-    state.counters["k"]          = benchmark::Counter(params.k);
-    state.counters["num_groups"] = benchmark::Counter(params.num_groups);
-    state.counters["group size"] = benchmark::Counter(params.n / params.num_groups);
-    state.counters["Pat"]        = benchmark::Counter(static_cast<int>(params.pattern));
-
-    state.counters["SM count"] = raft::getMultiProcessorCount();
-  }
-};
-
-const std::vector<Params> masked_l2_nn_input_vecs = {
-  // Very fat matrices...
-  {32, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-  {64, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-  {128, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-  {256, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-  {512, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-  {1024, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-  {16384, 32, 16384, 32, AdjacencyPattern::checkerboard},
-  {16384, 64, 16384, 32, AdjacencyPattern::checkerboard},
-  {16384, 128, 16384, 32, AdjacencyPattern::checkerboard},
-  {16384, 256, 16384, 32, AdjacencyPattern::checkerboard},
-  {16384, 512, 16384, 32, AdjacencyPattern::checkerboard},
-  {16384, 1024, 16384, 32, AdjacencyPattern::checkerboard},
-
-  // Representative matrices...
-  {16384, 16384, 32, 32, AdjacencyPattern::checkerboard},
-  {16384, 16384, 64, 32, AdjacencyPattern::checkerboard},
-  {16384, 16384, 128, 32, AdjacencyPattern::checkerboard},
-  {16384, 16384, 256, 32, AdjacencyPattern::checkerboard},
-  {16384, 16384, 512, 32, AdjacencyPattern::checkerboard},
-  {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard},
-  {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard},
-
-  {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_4},
-  {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_4},
-  {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_4},
-  {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_4},
-  {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_4},
-  {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_4},
-  {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_4},
-
-  {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_64},
-  {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_64},
-  {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_64},
-  {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_64},
-  {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_64},
-  {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_64},
-  {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_64},
-
-  {16384, 16384, 32, 32, AdjacencyPattern::all_true},
-  {16384, 16384, 64, 32, AdjacencyPattern::all_true},
-  {16384, 16384, 128, 32, AdjacencyPattern::all_true},
-  {16384, 16384, 256, 32, AdjacencyPattern::all_true},
-  {16384, 16384, 512, 32, AdjacencyPattern::all_true},
-  {16384, 16384, 1024, 32, AdjacencyPattern::all_true},
-  {16384, 16384, 16384, 32, AdjacencyPattern::all_true},
-
-  {16384, 16384, 32, 32, AdjacencyPattern::all_false},
-  {16384, 16384, 64, 32, AdjacencyPattern::all_false},
-  {16384, 16384, 128, 32, AdjacencyPattern::all_false},
-  {16384, 16384, 256, 32, AdjacencyPattern::all_false},
-  {16384, 16384, 512, 32, AdjacencyPattern::all_false},
-  {16384, 16384, 1024, 32, AdjacencyPattern::all_false},
-  {16384, 16384, 16384, 32, AdjacencyPattern::all_false},
-};
-
-RAFT_BENCH_REGISTER(masked_l2_nn<float>, "", masked_l2_nn_input_vecs);
-// We don't benchmark double to keep compile times in check when not using the
-// distance library.
-
-}  // namespace raft::bench::distance::masked_nn
diff --git a/cpp/bench/prims/distance/tune_pairwise/bench.cu b/cpp/bench/prims/distance/tune_pairwise/bench.cu
deleted file mode 100644
index 81105cdefe..0000000000
--- a/cpp/bench/prims/distance/tune_pairwise/bench.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Tuning benchmarks.
-//
-// Goals:
-//
-// 1. Fast compile times to maintain iteration speed.
-// 2. Create benchmarks that can inform the design of the kernels.
-//
-// Non-goals:
-//
-// 1. Measure every distance operation. Instead measures just one distance
-//    operation at the same time.
-// 2. Be useful for finding performance regressions. This is handled by the
-//    normal benchmarks.
-//
-// So far, both goals are partly achieved.
-//
-// RE (1), COMPILE TIMES: kernel.cu is fast to compile. This file is not.
-// When the internals of a pairwise distance kernel is changed, this file is not
-// recompiled.
-//
-// RE 2, benchmarks with intent: this file contains a benchmark to check the
-// maximal throughput of a kernel. Measuring other things, like performance on
-// skinny or wide matrices is not yet implemented.
-
-#include "kernel.cuh"  // launch_kernel
-
-#include <common/benchmark.hpp>  // RAFT_BENCH_REGISTER
-
-#include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
-
-#include <rmm/device_uvector.hpp>  // rmm::device_uvector
-
-#include <algorithm>  // std::min
-#include <vector>     // std::vector
-
-namespace raft::bench::distance::tune {
-
-// Max throughput benchmark.
-//
-// Goal: Measure the maximum distances/sec that can be computed.
-//
-// To achieve this, we make sure that:
-//
-// - Input data size is a multiple of the block tile size.
-//
-// - Perfect distribution of work between SMs, i.e. the number of block tiles is
-//   a large multiple (num_waves) of the number of blocks (#SMs * occupancy).
-//
-// - Multiple iterations over Kblk are executed (num_k_iters).
-struct throughput_param {
-  int num_waves;
-  int occupancy;
-  int num_k_iters;
-};
-
-const std::vector<throughput_param> throughput_params{
-  // 32 waves, requested occupancy of 4, and 32 k iterations typically achieves
-  // maximum throughput. No need to pick higher values.
-  {32, 4, 32},
-};
-
-struct throughput_bench : public fixture {
-  const throughput_param p;
-
-  throughput_bench(const throughput_param& p_) : p(p_) {}
-
-  void run_benchmark(::benchmark::State& state) override
-  {
-    // Get block size:
-    int block_m, block_n, block_k;
-    get_block_size(block_m, block_n, block_k);
-
-    // Determine number of blocks that will be launched. This informs the size
-    // of the inputs as well as the grid size.
-    const int num_sms       = raft::getMultiProcessorCount();
-    const int max_occupancy = get_max_occupancy();
-    const int occupancy     = std::min(p.occupancy, max_occupancy);
-    const int num_blocks    = occupancy * num_sms;
-    dim3 grid(num_blocks);
-
-    // Create input sizes that are a multiple of the block tile size.
-    size_t m = block_m;
-    size_t n = block_n * p.num_waves * num_blocks;
-    size_t k = block_k * p.num_k_iters;
-
-    // DataT, OutT, IdxT, etc, are defined in tuned_kernel.cuh
-    rmm::device_uvector<DataT> x_vec(m * k, stream);
-    rmm::device_uvector<DataT> y_vec(n * k, stream);
-    rmm::device_uvector<DataT> x_norm_vec(m, stream);
-    rmm::device_uvector<DataT> y_norm_vec(n, stream);
-    rmm::device_uvector<OutT> out_vec(m * n, stream);
-
-    auto x      = x_vec.data();
-    auto y      = y_vec.data();
-    auto x_norm = x_norm_vec.data();
-    auto y_norm = y_norm_vec.data();
-    auto out    = out_vec.data();
-    FinOpT fin_op{};
-
-    // Create kernel parameter struct. Flip x and y if column major.
-    IdxT ldx    = row_major ? k : m;
-    IdxT ldy    = row_major ? k : n;
-    IdxT ld_out = row_major ? n : m;
-
-    // Template parameters of pairwise_matrix_params are defined in kernel.cuh
-    pairwise_matrix_params kparams{
-      IdxT(m), IdxT(n), IdxT(k), ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, row_major};
-
-    // Run benchmark
-    loop_on_state(state, [&]() { launch_kernel(kparams, grid, stream); });
-
-    // Report metrics. We don't report flop/s because we do not know for each
-    // distance operation how many flops it costs. For L2_unexp and l1, we can
-    // double this number to get the flop/s. For l2 expanded, core_ops/s should
-    // equal flop/s (modulo the sqrt and subtracting from the norm).
-    size_t num_core_ops = m * n * k;
-    size_t read_elts    = n * k + m * k;
-    size_t write_elts   = m * n;
-
-    state.counters["m"]         = benchmark::Counter(m);
-    state.counters["n"]         = benchmark::Counter(n);
-    state.counters["k"]         = benchmark::Counter(k);
-    state.counters["occupancy"] = benchmark::Counter(occupancy);
-    state.counters["# waves"]   = benchmark::Counter(p.num_waves);
-    state.counters["# k iters"] = benchmark::Counter(p.num_k_iters);
-
-    state.counters["core_ops/s"] = benchmark::Counter(num_core_ops,
-                                                      benchmark::Counter::kIsIterationInvariantRate,
-                                                      benchmark::Counter::OneK::kIs1000);
-
-    state.counters["BW"] = benchmark::Counter(write_elts * sizeof(OutT) + read_elts * sizeof(DataT),
-                                              benchmark::Counter::kIsIterationInvariantRate,
-                                              benchmark::Counter::OneK::kIs1000);
-  }
-};
-
-RAFT_BENCH_REGISTER(throughput_bench, "", throughput_params);
-
-}  // namespace raft::bench::distance::tune
diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cu b/cpp/bench/prims/distance/tune_pairwise/kernel.cu
deleted file mode 100644
index 42173c51f5..0000000000
--- a/cpp/bench/prims/distance/tune_pairwise/kernel.cu
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kernel.cuh"
-
-#include <raft/distance/detail/pairwise_matrix/kernel_sm60.cuh>  // pairwise_matrix_sm60_wrapper
-#include <raft/linalg/contractions.cuh>                          // raft::linalg::Policy4x4
-#include <raft/util/arch.cuh>  // raft::util::arch::SM_compute_arch
-
-namespace raft::bench::distance::tune {
-
-// Distance op
-using OpT                  = raft::distance::detail::ops::lp_unexp_distance_op<DataT, AccT, IdxT>;
-constexpr float metric_arg = 2.0;
-OpT distance_op{metric_arg};
-
-// Kernel policy
-constexpr int vec_len = 1;
-using Policy          = typename raft::linalg::Policy4x4<DataT, vec_len>::Policy;
-
-// Architecture
-namespace arch                 = raft::util::arch;
-constexpr auto sm_compat_range = arch::SM_range(arch::SM_min(), arch::SM_future());
-
-void launch_kernel(pairwise_matrix_params params, dim3 grid, cudaStream_t stream)
-{
-  dim3 block(Policy::Nthreads);
-  int smem_size = OpT::shared_mem_size<Policy>();
-
-  // Obtain function pointer to kernel
-  auto kernel = raft::distance::detail::pairwise_matrix_kernel<Policy,
-                                                               row_major,
-                                                               decltype(sm_compat_range),
-                                                               OpT,
-                                                               IdxT,
-                                                               DataT,
-                                                               OutT,
-                                                               FinOpT>;
-
-  kernel<<<grid, block, smem_size, stream>>>(distance_op, params);
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-void get_block_size(int& m, int& n, int& k)
-{
-  m = Policy::Mblk;
-  n = Policy::Nblk;
-  k = Policy::Kblk;
-}
-
-void* get_kernel_ptr()
-{
-  auto kernel = raft::distance::detail::pairwise_matrix_kernel<Policy,
-                                                               row_major,
-                                                               decltype(sm_compat_range),
-                                                               OpT,
-                                                               IdxT,
-                                                               DataT,
-                                                               OutT,
-                                                               FinOpT>;
-  return reinterpret_cast<void*>(kernel);
-}
-
-int get_max_occupancy()
-{
-  void* kernel_ptr = get_kernel_ptr();
-  int max_occupancy;
-  int smem_size = OpT::shared_mem_size<Policy>();
-
-  RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &max_occupancy, kernel_ptr, Policy::Nthreads, smem_size));
-
-  return max_occupancy;
-}
-
-}  // namespace raft::bench::distance::tune
diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cuh b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh
deleted file mode 100644
index 5da54a343c..0000000000
--- a/cpp/bench/prims/distance/tune_pairwise/kernel.cuh
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/detail/distance_ops/all_ops.cuh>    // lp_unexp_distance_op
-#include <raft/distance/detail/pairwise_matrix/params.cuh>  // pairwise_matrix_params
-
-namespace raft::bench::distance::tune {
-
-// Launch one specific kernel with the following template parameters
-constexpr bool row_major = true;
-using DataT              = float;
-using AccT               = float;
-using OutT               = DataT;
-using IdxT               = int;
-
-using FinOpT = raft::identity_op;
-
-using pairwise_matrix_params =
-  raft::distance::detail::pairwise_matrix_params<IdxT, DataT, OutT, FinOpT>;
-
-// Launches kernel
-void launch_kernel(pairwise_matrix_params, dim3, cudaStream_t);
-
-// Describes the block size that is decided by the policy
-void get_block_size(int& m, int& n, int& k);
-
-int get_max_occupancy();
-
-}  // namespace raft::bench::distance::tune