diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 160780bf4d..53bb840309 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -589,10 +589,3 @@ endif() if(BUILD_PRIMS_BENCH) add_subdirectory(bench/prims/) endif() - -# ################################################################################################## -# * build ann benchmark executable ----------------------------------------------- - -if(BUILD_ANN_BENCH) - add_subdirectory(bench/ann/) -endif() diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index 52c63ad73b..cf03a36612 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -74,49 +74,9 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) - ConfigureBench( - NAME - CORE_BENCH - PATH - core/bitset.cu - core/copy.cu - main.cpp - ) + ConfigureBench(NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp) - ConfigureBench( - NAME - UTIL_BENCH - PATH - util/popc.cu - main.cpp - ) - - ConfigureBench( - NAME CLUSTER_BENCH PATH cluster/kmeans_balanced.cu cluster/kmeans.cu - main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY - ) - - ConfigureBench( - NAME TUNE_DISTANCE PATH distance/tune_pairwise/kernel.cu - distance/tune_pairwise/bench.cu main.cpp - ) - - ConfigureBench( - NAME - DISTANCE_BENCH - PATH - distance/distance_cosine.cu - distance/distance_exp_l2.cu - distance/distance_l1.cu - distance/distance_unexp_l2.cu - distance/fused_l2_nn.cu - distance/masked_nn.cu - distance/kernels.cu - main.cpp - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY - ) + ConfigureBench(NAME UTIL_BENCH PATH util/popc.cu main.cpp) ConfigureBench( NAME @@ -137,54 +97,18 @@ if(BUILD_PRIMS_BENCH) ) ConfigureBench( - NAME MATRIX_BENCH PATH matrix/argmin.cu matrix/gather.cu - matrix/select_k.cu main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY + NAME MATRIX_BENCH PATH matrix/argmin.cu matrix/gather.cu matrix/select_k.cu main.cpp OPTIONAL + LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureBench( - NAME RANDOM_BENCH PATH random/make_blobs.cu random/permute.cu - random/rng.cu random/subsample.cu main.cpp - ) - - ConfigureBench( - NAME - SPARSE_BENCH - PATH - sparse/bitmap_to_csr.cu - sparse/convert_csr.cu - sparse/select_k_csr.cu + NAME RANDOM_BENCH PATH random/make_blobs.cu random/permute.cu random/rng.cu random/subsample.cu main.cpp ) ConfigureBench( - NAME - NEIGHBORS_BENCH - PATH - neighbors/knn/brute_force_float_int64_t.cu - neighbors/knn/brute_force_float_uint32_t.cu - neighbors/knn/cagra_float_uint32_t.cu - neighbors/knn/ivf_flat_filter_float_int64_t.cu - neighbors/knn/ivf_flat_float_int64_t.cu - neighbors/knn/ivf_flat_int8_t_int64_t.cu - neighbors/knn/ivf_flat_uint8_t_int64_t.cu - neighbors/knn/ivf_pq_float_int64_t.cu - neighbors/knn/ivf_pq_filter_float_int64_t.cu - neighbors/knn/ivf_pq_int8_t_int64_t.cu - neighbors/knn/ivf_pq_uint8_t_int64_t.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu - ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu - neighbors/refine_float_int64_t.cu - neighbors/refine_uint8_t_int64_t.cu + NAME SPARSE_BENCH PATH sparse/bitmap_to_csr.cu sparse/convert_csr.cu sparse/select_k_csr.cu main.cpp - OPTIONAL - LIB - EXPLICIT_INSTANTIATE_ONLY ) endif() diff --git a/cpp/bench/prims/cluster/kmeans.cu b/cpp/bench/prims/cluster/kmeans.cu deleted file mode 100644 index 6387211135..0000000000 --- a/cpp/bench/prims/cluster/kmeans.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -namespace raft::bench::cluster { - -struct KMeansBenchParams { - DatasetParams data; - BlobsParams blobs; - raft::cluster::KMeansParams kmeans; -}; - -inline auto operator<<(std::ostream& os, const KMeansBenchParams& p) -> std::ostream& -{ - os << p.data.rows << "#" << p.data.cols << "#" << p.kmeans.n_clusters; - return os; -} - -template -struct KMeans : public BlobsFixture { - KMeans(const KMeansBenchParams& p) - : BlobsFixture(p.data, p.blobs), - params(p), - centroids(this->handle), - labels(this->handle) - { - } - - void run_benchmark(::benchmark::State& state) override - { - std::ostringstream label_stream; - label_stream << params; - state.SetLabel(label_stream.str()); - - raft::device_matrix_view X_view = this->X.view(); - std::optional> opt_weights_view = std::nullopt; - std::optional> centroids_view = - std::make_optional>(centroids.view()); - raft::device_vector_view labels_view = labels.view(); - raft::host_scalar_view inertia_view = raft::make_host_scalar_view(&inertia); - raft::host_scalar_view n_iter_view = raft::make_host_scalar_view(&n_iter); - - this->loop_on_state(state, [&]() { - raft::cluster::kmeans_fit_predict(this->handle, - params.kmeans, - X_view, - opt_weights_view, - centroids_view, - labels_view, - inertia_view, - n_iter_view); - }); - } - - void allocate_temp_buffers(const ::benchmark::State& state) override - { - centroids = - raft::make_device_matrix(this->handle, params.kmeans.n_clusters, params.data.cols); - labels = raft::make_device_vector(this->handle, params.data.rows); - } - - private: - KMeansBenchParams params; - raft::device_matrix centroids; - raft::device_vector labels; - T inertia; - IndexT n_iter; -}; // struct KMeans - -std::vector getKMeansInputs() -{ - std::vector out; - KMeansBenchParams p; - p.data.row_major = true; - p.blobs.cluster_std = 1.0; - p.blobs.shuffle = false; - p.blobs.center_box_min = -10.0; - p.blobs.center_box_max = 10.0; - p.blobs.seed = 12345ULL; - p.kmeans.init = raft::cluster::KMeansParams::KMeansPlusPlus; - p.kmeans.max_iter = 300; - p.kmeans.tol = 1e-4; - p.kmeans.verbosity = RAFT_LEVEL_INFO; - p.kmeans.metric = raft::distance::DistanceType::L2Expanded; - p.kmeans.inertia_check = true; - std::vector> row_cols_k = { - {1000000, 20, 1000}, - {3000000, 50, 20}, - {10000000, 50, 5}, - }; - for (auto& rck : row_cols_k) { - p.data.rows = std::get<0>(rck); - p.data.cols = std::get<1>(rck); - p.blobs.n_clusters = std::get<2>(rck); - p.kmeans.n_clusters = std::get<2>(rck); - out.push_back(p); - } - return out; -} - -// note(lsugy): commenting out int64_t because the templates are not compiled in the distance -// library, resulting in long compilation times. -RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); -RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); -// RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); -// RAFT_BENCH_REGISTER((KMeans), "", getKMeansInputs()); - -} // namespace raft::bench::cluster diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu deleted file mode 100644 index dc05783989..0000000000 --- a/cpp/bench/prims/cluster/kmeans_balanced.cu +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include - -namespace raft::bench::cluster { - -struct KMeansBalancedBenchParams { - DatasetParams data; - uint32_t n_lists; - raft::cluster::kmeans_balanced_params kb_params; -}; - -template -struct KMeansBalanced : public fixture { - KMeansBalanced(const KMeansBalancedBenchParams& p) : params(p), X(handle), centroids(handle) {} - - void run_benchmark(::benchmark::State& state) override - { - this->loop_on_state(state, [this]() { - raft::device_matrix_view X_view = this->X.view(); - raft::device_matrix_view centroids_view = this->centroids.view(); - raft::cluster::kmeans_balanced::fit( - this->handle, this->params.kb_params, X_view, centroids_view); - }); - } - - void allocate_data(const ::benchmark::State& state) override - { - X = raft::make_device_matrix(handle, params.data.rows, params.data.cols); - - raft::random::RngState rng{1234}; - constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); - constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); - if constexpr (std::is_integral_v) { - raft::random::uniformInt( - handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); - } else { - raft::random::uniform( - handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); - } - resource::sync_stream(handle, stream); - } - - void allocate_temp_buffers(const ::benchmark::State& state) override - { - centroids = - raft::make_device_matrix(this->handle, params.n_lists, params.data.cols); - } - - private: - KMeansBalancedBenchParams params; - raft::device_matrix X; - raft::device_matrix centroids; -}; // struct KMeansBalanced - -std::vector getKMeansBalancedInputs() -{ - std::vector out; - KMeansBalancedBenchParams p; - p.data.row_major = true; - p.kb_params.n_iters = 20; - p.kb_params.metric = raft::distance::DistanceType::L2Expanded; - std::vector> row_cols = { - {100000, 128}, {1000000, 128}, {10000000, 128}, - // The following dataset sizes are too large for most GPUs. - // {100000000, 128}, - }; - for (auto& rc : row_cols) { - p.data.rows = rc.first; - p.data.cols = rc.second; - for (auto n_lists : std::vector({1000, 10000, 100000})) { - p.n_lists = n_lists; - out.push_back(p); - } - } - return out; -} - -// Note: the datasets sizes are too large for 32-bit index types. -RAFT_BENCH_REGISTER((KMeansBalanced), "", getKMeansBalancedInputs()); - -} // namespace raft::bench::cluster diff --git a/cpp/bench/prims/distance/distance_common.cuh b/cpp/bench/prims/distance/distance_common.cuh deleted file mode 100644 index 8368062168..0000000000 --- a/cpp/bench/prims/distance/distance_common.cuh +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include - -namespace raft::bench::distance { - -struct distance_params { - int m, n, k; - bool isRowMajor; -}; // struct distance_params - -template -struct distance : public fixture { - distance(const distance_params& p) - : params(p), - x(p.m * p.k, stream), - y(p.n * p.k, stream), - out(p.m * p.n, stream), - workspace(0, stream) - { - RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream)); - worksize = raft::distance::getWorkspaceSize( - x.data(), y.data(), params.m, params.n, params.k); - workspace.resize(worksize, stream); - } - - void run_benchmark(::benchmark::State& state) override - { - loop_on_state(state, [this]() { - raft::distance::distance(handle, - x.data(), - y.data(), - out.data(), - params.m, - params.n, - params.k, - (void*)workspace.data(), - worksize, - params.isRowMajor); - }); - } - - private: - distance_params params; - rmm::device_uvector x, y, out; - rmm::device_uvector workspace; - size_t worksize; -}; // struct Distance - -const std::vector dist_input_vecs{ - {32, 16384, 16384, true}, {64, 16384, 16384, true}, {128, 16384, 16384, true}, - {256, 16384, 16384, true}, {512, 16384, 16384, true}, {1024, 16384, 16384, true}, - {16384, 32, 16384, true}, {16384, 64, 16384, true}, {16384, 128, 16384, true}, - {16384, 256, 16384, true}, {16384, 512, 16384, true}, {16384, 1024, 16384, true}, - {16384, 16384, 32, true}, {16384, 16384, 64, true}, {16384, 16384, 128, true}, - {16384, 16384, 256, true}, {16384, 16384, 512, true}, {16384, 16384, 1024, true}, - {16384, 16384, 16384, true}, {32, 16384, 16384, false}, {64, 16384, 16384, false}, - {128, 16384, 16384, false}, {256, 16384, 16384, false}, {512, 16384, 16384, false}, - {1024, 16384, 16384, false}, {16384, 32, 16384, false}, {16384, 64, 16384, false}, - {16384, 128, 16384, false}, {16384, 256, 16384, false}, {16384, 512, 16384, false}, - {16384, 1024, 16384, false}, {16384, 16384, 32, false}, {16384, 16384, 64, false}, - {16384, 16384, 128, false}, {16384, 16384, 256, false}, {16384, 16384, 512, false}, - {16384, 16384, 1024, false}, {16384, 16384, 16384, false} - -}; - -#define DIST_BENCH_REGISTER(Name, Metric) \ - using Name##F = distance; \ - RAFT_BENCH_REGISTER(Name##F, "", dist_input_vecs); \ - using Name##D = distance; \ - RAFT_BENCH_REGISTER(Name##D, "", dist_input_vecs); - -} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_cosine.cu b/cpp/bench/prims/distance/distance_cosine.cu deleted file mode 100644 index c8ac8067c8..0000000000 --- a/cpp/bench/prims/distance/distance_cosine.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distance_common.cuh" - -namespace raft::bench::distance { - -DIST_BENCH_REGISTER(DistanceCosine, raft::distance::DistanceType::CosineExpanded); - -} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_exp_l2.cu b/cpp/bench/prims/distance/distance_exp_l2.cu deleted file mode 100644 index 52b7fff05c..0000000000 --- a/cpp/bench/prims/distance/distance_exp_l2.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distance_common.cuh" - -namespace raft::bench::distance { - -DIST_BENCH_REGISTER(DistanceL2Sq, raft::distance::DistanceType::L2Expanded); -DIST_BENCH_REGISTER(DistanceL2Sqrt, raft::distance::DistanceType::L2SqrtExpanded); - -} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_l1.cu b/cpp/bench/prims/distance/distance_l1.cu deleted file mode 100644 index e80db48ef0..0000000000 --- a/cpp/bench/prims/distance/distance_l1.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distance_common.cuh" - -namespace raft::bench::distance { - -DIST_BENCH_REGISTER(DistanceL1, raft::distance::DistanceType::L1); - -} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/distance_unexp_l2.cu b/cpp/bench/prims/distance/distance_unexp_l2.cu deleted file mode 100644 index 7ac1a8a4b5..0000000000 --- a/cpp/bench/prims/distance/distance_unexp_l2.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "distance_common.cuh" - -namespace raft::bench::distance { - -DIST_BENCH_REGISTER(DistanceUnexpL2Sq, raft::distance::DistanceType::L2Unexpanded); -DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt, raft::distance::DistanceType::L2SqrtUnexpanded); - -} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/fused_l2_nn.cu b/cpp/bench/prims/distance/fused_l2_nn.cu deleted file mode 100644 index a263bef6ba..0000000000 --- a/cpp/bench/prims/distance/fused_l2_nn.cu +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -namespace raft::bench::distance { - -struct fusedl2nn_inputs { - int64_t m, n, k; -}; // struct fusedl2nn_inputs - -inline auto operator<<(std::ostream& os, const fusedl2nn_inputs& p) -> std::ostream& -{ - os << p.m << "#" << p.n << "#" << p.k; - return os; -} - -template -struct fusedl2nn : public fixture { - fusedl2nn(const fusedl2nn_inputs& p) - : params(p), - workspace(this->handle), - x(this->handle), - y(this->handle), - x_norm(this->handle), - y_norm(this->handle), - out(this->handle) - { - } - - void allocate_data(const ::benchmark::State& state) override - { - x = raft::make_device_matrix(handle, params.m, params.k); - y = raft::make_device_matrix(handle, params.n, params.k); - x_norm = raft::make_device_vector(handle, params.m); - y_norm = raft::make_device_vector(handle, params.n); - out = raft::make_device_vector(handle, params.m); - - raft::random::RngState rng{1234}; - raft::random::uniform( - handle, rng, x.data_handle(), params.m * params.k, (DataT)-1.0, (DataT)1.0); - raft::random::uniform( - handle, rng, y.data_handle(), params.n * params.k, (DataT)-1.0, (DataT)1.0); - - // Pre-compute norms - raft::linalg::rowNorm(x_norm.data_handle(), - x.data_handle(), - params.k, - params.m, - raft::linalg::L2Norm, - true, - stream); - raft::linalg::rowNorm(y_norm.data_handle(), - y.data_handle(), - params.k, - params.n, - raft::linalg::L2Norm, - true, - stream); - resource::sync_stream(handle, stream); - } - - void allocate_temp_buffers(const ::benchmark::State& state) override - { - workspace = raft::make_device_vector(handle, params.m * sizeof(IdxT)); - } - - void run_benchmark(::benchmark::State& state) override - { - std::ostringstream label_stream; - label_stream << params; - state.SetLabel(label_stream.str()); - - loop_on_state(state, [this]() { - raft::distance::fusedL2NNMinReduce(out.data_handle(), - x.data_handle(), - y.data_handle(), - x_norm.data_handle(), - y_norm.data_handle(), - static_cast(params.m), - static_cast(params.n), - static_cast(params.k), - (void*)workspace.data_handle(), - false, - true, - stream); - }); - - int64_t num_flops = 2 * params.m * params.n * params.k; - - int64_t read_elts = params.n * params.k + params.m * params.k; - int64_t write_elts = params.m; - - state.counters["FLOP/s"] = benchmark::Counter( - num_flops, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1000); - - state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - } - - private: - fusedl2nn_inputs params; - raft::device_matrix x, y; - raft::device_vector x_norm, y_norm; - raft::device_vector out; - raft::device_vector workspace; -}; // struct fusedl2nn - -template -std::vector getFusedL2NNInputs() -{ - std::vector inputs; - std::vector m_list = {100000, 1000000}; - if constexpr (sizeof(IdxT) == 8) { m_list.push_back(10000000); } - std::vector n_list = {100, 1000, 10000}; - std::vector k_list = {64, 128, 256}; - for (auto m : m_list) { - for (auto n : n_list) { - for (auto k : k_list) { - inputs.push_back({m, n, k}); - } - } - } - return inputs; -} - -#define FUSEDL2NN_BENCH(DataT, IdxT, OutT) \ - RAFT_BENCH_REGISTER((fusedl2nn), "", getFusedL2NNInputs()) - -FUSEDL2NN_BENCH(float, int, float); -FUSEDL2NN_BENCH(double, int, double); -FUSEDL2NN_BENCH(float, int, (raft::KeyValuePair)); -FUSEDL2NN_BENCH(double, int, (raft::KeyValuePair)); -FUSEDL2NN_BENCH(float, int64_t, float); -FUSEDL2NN_BENCH(double, int64_t, double); -FUSEDL2NN_BENCH(float, int64_t, (raft::KeyValuePair)); -FUSEDL2NN_BENCH(double, int64_t, (raft::KeyValuePair)); - -} // namespace raft::bench::distance diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu deleted file mode 100644 index eb86330637..0000000000 --- a/cpp/bench/prims/distance/kernels.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace raft::bench::distance::kernels { - -using namespace raft::distance::kernels; -struct GramTestParams { - int m; // m parameter of the GEMM - int k; // k parameter of the GEMM - int n; // n parameter of the GEMM - KernelParams kernel_params; - bool is_row_major; -}; // struct GramTestParams - -template -struct GramMatrix : public fixture { - GramMatrix(const GramTestParams& p) - : params(p), handle(stream), A(0, stream), B(0, stream), C(0, stream) - { - kernel = std::unique_ptr>( - KernelFactory::create(p.kernel_params, resource::get_cublas_handle(handle))); - - A.resize(params.m * params.k, stream); - B.resize(params.k * params.n, stream); - C.resize(params.m * params.n, stream); - raft::random::RngState rng(123456ULL); - raft::random::uniform(handle, rng, A.data(), params.m * params.k, T(-1.0), T(1.0)); - raft::random::uniform(handle, rng, B.data(), params.k * params.n, T(-1.0), T(1.0)); - } - - ~GramMatrix() - { - A.release(); - B.release(); - C.release(); - } - - void run_benchmark(::benchmark::State& state) override - { - if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); } - loop_on_state(state, [this]() { - (*this->kernel)(A.data(), - this->params.m, - this->params.k, - B.data(), - this->params.n, - C.data(), - this->params.is_row_major, - this->stream); - }); - } - - private: - const raft::device_resources handle; - std::unique_ptr> kernel; - GramTestParams params; - - rmm::device_uvector A; // input matrix A, size [m * k] - rmm::device_uvector B; // input matrix B, size [n * k] - rmm::device_uvector C; // output matrix C, size [m*n] -}; - -static std::vector getInputs() -{ - std::vector param_vec; - std::vector kernel_params{KernelParams{LINEAR, 3, 1, 0}, - KernelParams{POLYNOMIAL, 2, 1.3, 1}, - KernelParams{TANH, 2, 0.5, 2.4}, - KernelParams{RBF, 2, 0.5, 0}}; - struct TestSize { - int m; - int k; - int n; - }; - std::vector data_size{{4096, 10, 1024}, - {4096, 100, 1024}, - {4096, 1000, 1024}, - {4096, 10000, 1024}, - {100000, 10, 1024}, - {100000, 100, 1024}, - {100000, 1000, 1024}}; - - param_vec.reserve(kernel_params.size() * data_size.size()); - for (TestSize s : data_size) { - for (auto kernel : kernel_params) { - for (bool row_major : {false, true}) { - param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major}); - } - } - } - return param_vec; -} - -RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); -RAFT_BENCH_REGISTER(GramMatrix, "", getInputs()); - -} // namespace raft::bench::distance::kernels diff --git a/cpp/bench/prims/distance/masked_nn.cu b/cpp/bench/prims/distance/masked_nn.cu deleted file mode 100644 index 979d438b67..0000000000 --- a/cpp/bench/prims/distance/masked_nn.cu +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace raft::bench::distance::masked_nn { - -// Introduce various sparsity patterns -enum AdjacencyPattern { - checkerboard = 0, - checkerboard_4 = 1, - checkerboard_64 = 2, - all_true = 3, - all_false = 4 -}; - -struct Params { - int m, n, k, num_groups; - AdjacencyPattern pattern; -}; // struct Params - -RAFT_KERNEL init_adj(AdjacencyPattern pattern, - int n, - raft::device_matrix_view adj, - raft::device_vector_view group_idxs) -{ - int m = adj.extent(0); - int num_groups = adj.extent(1); - - for (int idx_m = blockIdx.y * blockDim.y + threadIdx.y; idx_m < m; - idx_m += blockDim.y * gridDim.y) { - for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; - idx_g += blockDim.x * gridDim.x) { - switch (pattern) { - case checkerboard: adj(idx_m, idx_g) = (idx_m + idx_g) % 2; break; - case checkerboard_4: adj(idx_m, idx_g) = (idx_m / 4 + idx_g) % 2; break; - case checkerboard_64: adj(idx_m, idx_g) = (idx_m / 64 + idx_g) % 2; break; - case all_true: adj(idx_m, idx_g) = true; break; - case all_false: adj(idx_m, idx_g) = false; break; - default: assert(false && "unknown pattern"); - } - } - } - // Each group is of size n / num_groups. - // - // - group_idxs[j] indicates the start of group j + 1 (i.e. is the inclusive - // scan of the group lengths) - // - // - The first group always starts at index zero, so we do not store it. - // - // - The group_idxs[num_groups - 1] should always equal n. - - if (blockIdx.y == 0 && threadIdx.y == 0) { - const int g_stride = blockDim.x * gridDim.x; - for (int idx_g = blockIdx.x * blockDim.x + threadIdx.x; idx_g < num_groups; idx_g += g_stride) { - group_idxs(idx_g) = (idx_g + 1) * (n / num_groups); - } - group_idxs(num_groups - 1) = n; - } -} - -template -struct masked_l2_nn : public fixture { - using DataT = T; - using IdxT = int; - using OutT = raft::KeyValuePair; - using RedOpT = raft::distance::MinAndDistanceReduceOp; - using PairRedOpT = raft::distance::KVPMinReduce; - using ParamT = raft::distance::masked_l2_nn_params; - - // Parameters - Params params; - // Data - raft::device_vector out; - raft::device_matrix x, y; - raft::device_vector xn, yn; - raft::device_matrix adj; - raft::device_vector group_idxs; - - masked_l2_nn(const Params& p) - : params(p), - out{raft::make_device_vector(handle, p.m)}, - x{raft::make_device_matrix(handle, p.m, p.k)}, - y{raft::make_device_matrix(handle, p.n, p.k)}, - xn{raft::make_device_vector(handle, p.m)}, - yn{raft::make_device_vector(handle, p.n)}, - adj{raft::make_device_matrix(handle, p.m, p.num_groups)}, - group_idxs{raft::make_device_vector(handle, p.num_groups)} - { - raft::random::RngState r(123456ULL); - - uniform(handle, r, x.data_handle(), p.m * p.k, T(-1.0), T(1.0)); - uniform(handle, r, y.data_handle(), p.n * p.k, T(-1.0), T(1.0)); - raft::linalg::rowNorm( - xn.data_handle(), x.data_handle(), p.k, p.m, raft::linalg::L2Norm, true, stream); - raft::linalg::rowNorm( - yn.data_handle(), y.data_handle(), p.k, p.n, raft::linalg::L2Norm, true, stream); - raft::distance::initialize, int>( - handle, out.data_handle(), p.m, std::numeric_limits::max(), RedOpT{}); - - dim3 block(32, 32); - dim3 grid(10, 10); - init_adj<<>>(p.pattern, p.n, adj.view(), group_idxs.view()); - RAFT_CUDA_TRY(cudaGetLastError()); - } - - void run_benchmark(::benchmark::State& state) override - { - bool init_out = true; - bool sqrt = false; - ParamT masked_l2_params{RedOpT{}, PairRedOpT{}, sqrt, init_out}; - - loop_on_state(state, [this, masked_l2_params]() { - // It is sufficient to only benchmark the L2-squared metric - raft::distance::masked_l2_nn(handle, - masked_l2_params, - x.view(), - y.view(), - xn.view(), - yn.view(), - adj.view(), - group_idxs.view(), - out.view()); - }); - - // Virtual flop count if no skipping had occurred. - size_t virtual_flops = size_t(2) * size_t(params.m) * size_t(params.n) * size_t(params.k); - - int64_t read_elts = params.n * params.k + params.m * params.k; - int64_t write_elts = params.m; - - // Virtual min flops is the number of flops that would have been executed if - // the algorithm had actually skipped each computation that it could have - // skipped. - size_t virtual_min_flops = 0; - switch (params.pattern) { - case checkerboard: - case checkerboard_4: - case checkerboard_64: virtual_min_flops = virtual_flops / 2; break; - case all_true: virtual_min_flops = virtual_flops; break; - case all_false: virtual_min_flops = 0; break; - default: assert(false && "unknown pattern"); - } - - // VFLOP/s is the "virtual" flop count that would have executed if there was - // no adjacency pattern. This is useful for comparing to fusedL2NN - state.counters["VFLOP/s"] = benchmark::Counter(virtual_flops, - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - // Virtual min flops is the number of flops that would have been executed if - // the algorithm had actually skipped each computation that it could have - // skipped. - state.counters["VminFLOP/s"] = benchmark::Counter(virtual_min_flops, - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - - state.counters["BW Wr"] = benchmark::Counter(write_elts * sizeof(OutT), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - state.counters["BW Rd"] = benchmark::Counter(read_elts * sizeof(DataT), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - - state.counters["m"] = benchmark::Counter(params.m); - state.counters["n"] = benchmark::Counter(params.n); - state.counters["k"] = benchmark::Counter(params.k); - state.counters["num_groups"] = benchmark::Counter(params.num_groups); - state.counters["group size"] = benchmark::Counter(params.n / params.num_groups); - state.counters["Pat"] = benchmark::Counter(static_cast(params.pattern)); - - state.counters["SM count"] = raft::getMultiProcessorCount(); - } -}; - -const std::vector masked_l2_nn_input_vecs = { - // Very fat matrices... - {32, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - {64, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - {128, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - {256, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - {512, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - {1024, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - {16384, 32, 16384, 32, AdjacencyPattern::checkerboard}, - {16384, 64, 16384, 32, AdjacencyPattern::checkerboard}, - {16384, 128, 16384, 32, AdjacencyPattern::checkerboard}, - {16384, 256, 16384, 32, AdjacencyPattern::checkerboard}, - {16384, 512, 16384, 32, AdjacencyPattern::checkerboard}, - {16384, 1024, 16384, 32, AdjacencyPattern::checkerboard}, - - // Representative matrices... - {16384, 16384, 32, 32, AdjacencyPattern::checkerboard}, - {16384, 16384, 64, 32, AdjacencyPattern::checkerboard}, - {16384, 16384, 128, 32, AdjacencyPattern::checkerboard}, - {16384, 16384, 256, 32, AdjacencyPattern::checkerboard}, - {16384, 16384, 512, 32, AdjacencyPattern::checkerboard}, - {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard}, - {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard}, - - {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_4}, - {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_4}, - {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_4}, - {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_4}, - {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_4}, - {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_4}, - {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_4}, - - {16384, 16384, 32, 32, AdjacencyPattern::checkerboard_64}, - {16384, 16384, 64, 32, AdjacencyPattern::checkerboard_64}, - {16384, 16384, 128, 32, AdjacencyPattern::checkerboard_64}, - {16384, 16384, 256, 32, AdjacencyPattern::checkerboard_64}, - {16384, 16384, 512, 32, AdjacencyPattern::checkerboard_64}, - {16384, 16384, 1024, 32, AdjacencyPattern::checkerboard_64}, - {16384, 16384, 16384, 32, AdjacencyPattern::checkerboard_64}, - - {16384, 16384, 32, 32, AdjacencyPattern::all_true}, - {16384, 16384, 64, 32, AdjacencyPattern::all_true}, - {16384, 16384, 128, 32, AdjacencyPattern::all_true}, - {16384, 16384, 256, 32, AdjacencyPattern::all_true}, - {16384, 16384, 512, 32, AdjacencyPattern::all_true}, - {16384, 16384, 1024, 32, AdjacencyPattern::all_true}, - {16384, 16384, 16384, 32, AdjacencyPattern::all_true}, - - {16384, 16384, 32, 32, AdjacencyPattern::all_false}, - {16384, 16384, 64, 32, AdjacencyPattern::all_false}, - {16384, 16384, 128, 32, AdjacencyPattern::all_false}, - {16384, 16384, 256, 32, AdjacencyPattern::all_false}, - {16384, 16384, 512, 32, AdjacencyPattern::all_false}, - {16384, 16384, 1024, 32, AdjacencyPattern::all_false}, - {16384, 16384, 16384, 32, AdjacencyPattern::all_false}, -}; - -RAFT_BENCH_REGISTER(masked_l2_nn, "", masked_l2_nn_input_vecs); -// We don't benchmark double to keep compile times in check when not using the -// distance library. - -} // namespace raft::bench::distance::masked_nn diff --git a/cpp/bench/prims/distance/tune_pairwise/bench.cu b/cpp/bench/prims/distance/tune_pairwise/bench.cu deleted file mode 100644 index 81105cdefe..0000000000 --- a/cpp/bench/prims/distance/tune_pairwise/bench.cu +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Tuning benchmarks. -// -// Goals: -// -// 1. Fast compile times to maintain iteration speed. -// 2. Create benchmarks that can inform the design of the kernels. -// -// Non-goals: -// -// 1. Measure every distance operation. Instead measures just one distance -// operation at the same time. -// 2. Be useful for finding performance regressions. This is handled by the -// normal benchmarks. -// -// So far, both goals are partly achieved. -// -// RE (1), COMPILE TIMES: kernel.cu is fast to compile. This file is not. -// When the internals of a pairwise distance kernel is changed, this file is not -// recompiled. -// -// RE 2, benchmarks with intent: this file contains a benchmark to check the -// maximal throughput of a kernel. Measuring other things, like performance on -// skinny or wide matrices is not yet implemented. - -#include "kernel.cuh" // launch_kernel - -#include // RAFT_BENCH_REGISTER - -#include // pairwise_matrix_params - -#include // rmm::device_uvector - -#include // std::min -#include // std::vector - -namespace raft::bench::distance::tune { - -// Max throughput benchmark. -// -// Goal: Measure the maximum distances/sec that can be computed. -// -// To achieve this, we make sure that: -// -// - Input data size is a multiple of the block tile size. -// -// - Perfect distribution of work between SMs, i.e. the number of block tiles is -// a large multiple (num_waves) of the number of blocks (#SMs * occupancy). -// -// - Multiple iterations over Kblk are executed (num_k_iters). -struct throughput_param { - int num_waves; - int occupancy; - int num_k_iters; -}; - -const std::vector throughput_params{ - // 32 waves, requested occupancy of 4, and 32 k iterations typically achieves - // maximum throughput. No need to pick higher values. - {32, 4, 32}, -}; - -struct throughput_bench : public fixture { - const throughput_param p; - - throughput_bench(const throughput_param& p_) : p(p_) {} - - void run_benchmark(::benchmark::State& state) override - { - // Get block size: - int block_m, block_n, block_k; - get_block_size(block_m, block_n, block_k); - - // Determine number of blocks that will be launched. This informs the size - // of the inputs as well as the grid size. - const int num_sms = raft::getMultiProcessorCount(); - const int max_occupancy = get_max_occupancy(); - const int occupancy = std::min(p.occupancy, max_occupancy); - const int num_blocks = occupancy * num_sms; - dim3 grid(num_blocks); - - // Create input sizes that are a multiple of the block tile size. - size_t m = block_m; - size_t n = block_n * p.num_waves * num_blocks; - size_t k = block_k * p.num_k_iters; - - // DataT, OutT, IdxT, etc, are defined in tuned_kernel.cuh - rmm::device_uvector x_vec(m * k, stream); - rmm::device_uvector y_vec(n * k, stream); - rmm::device_uvector x_norm_vec(m, stream); - rmm::device_uvector y_norm_vec(n, stream); - rmm::device_uvector out_vec(m * n, stream); - - auto x = x_vec.data(); - auto y = y_vec.data(); - auto x_norm = x_norm_vec.data(); - auto y_norm = y_norm_vec.data(); - auto out = out_vec.data(); - FinOpT fin_op{}; - - // Create kernel parameter struct. Flip x and y if column major. - IdxT ldx = row_major ? k : m; - IdxT ldy = row_major ? k : n; - IdxT ld_out = row_major ? n : m; - - // Template parameters of pairwise_matrix_params are defined in kernel.cuh - pairwise_matrix_params kparams{ - IdxT(m), IdxT(n), IdxT(k), ldx, ldy, ld_out, x, y, x_norm, y_norm, out, fin_op, row_major}; - - // Run benchmark - loop_on_state(state, [&]() { launch_kernel(kparams, grid, stream); }); - - // Report metrics. We don't report flop/s because we do not know for each - // distance operation how many flops it costs. For L2_unexp and l1, we can - // double this number to get the flop/s. For l2 expanded, core_ops/s should - // equal flop/s (modulo the sqrt and subtracting from the norm). - size_t num_core_ops = m * n * k; - size_t read_elts = n * k + m * k; - size_t write_elts = m * n; - - state.counters["m"] = benchmark::Counter(m); - state.counters["n"] = benchmark::Counter(n); - state.counters["k"] = benchmark::Counter(k); - state.counters["occupancy"] = benchmark::Counter(occupancy); - state.counters["# waves"] = benchmark::Counter(p.num_waves); - state.counters["# k iters"] = benchmark::Counter(p.num_k_iters); - - state.counters["core_ops/s"] = benchmark::Counter(num_core_ops, - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - - state.counters["BW"] = benchmark::Counter(write_elts * sizeof(OutT) + read_elts * sizeof(DataT), - benchmark::Counter::kIsIterationInvariantRate, - benchmark::Counter::OneK::kIs1000); - } -}; - -RAFT_BENCH_REGISTER(throughput_bench, "", throughput_params); - -} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cu b/cpp/bench/prims/distance/tune_pairwise/kernel.cu deleted file mode 100644 index 42173c51f5..0000000000 --- a/cpp/bench/prims/distance/tune_pairwise/kernel.cu +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "kernel.cuh" - -#include // pairwise_matrix_sm60_wrapper -#include // raft::linalg::Policy4x4 -#include // raft::util::arch::SM_compute_arch - -namespace raft::bench::distance::tune { - -// Distance op -using OpT = raft::distance::detail::ops::lp_unexp_distance_op; -constexpr float metric_arg = 2.0; -OpT distance_op{metric_arg}; - -// Kernel policy -constexpr int vec_len = 1; -using Policy = typename raft::linalg::Policy4x4::Policy; - -// Architecture -namespace arch = raft::util::arch; -constexpr auto sm_compat_range = arch::SM_range(arch::SM_min(), arch::SM_future()); - -void launch_kernel(pairwise_matrix_params params, dim3 grid, cudaStream_t stream) -{ - dim3 block(Policy::Nthreads); - int smem_size = OpT::shared_mem_size(); - - // Obtain function pointer to kernel - auto kernel = raft::distance::detail::pairwise_matrix_kernel; - - kernel<<>>(distance_op, params); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -void get_block_size(int& m, int& n, int& k) -{ - m = Policy::Mblk; - n = Policy::Nblk; - k = Policy::Kblk; -} - -void* get_kernel_ptr() -{ - auto kernel = raft::distance::detail::pairwise_matrix_kernel; - return reinterpret_cast(kernel); -} - -int get_max_occupancy() -{ - void* kernel_ptr = get_kernel_ptr(); - int max_occupancy; - int smem_size = OpT::shared_mem_size(); - - RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &max_occupancy, kernel_ptr, Policy::Nthreads, smem_size)); - - return max_occupancy; -} - -} // namespace raft::bench::distance::tune diff --git a/cpp/bench/prims/distance/tune_pairwise/kernel.cuh b/cpp/bench/prims/distance/tune_pairwise/kernel.cuh deleted file mode 100644 index 5da54a343c..0000000000 --- a/cpp/bench/prims/distance/tune_pairwise/kernel.cuh +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include // lp_unexp_distance_op -#include // pairwise_matrix_params - -namespace raft::bench::distance::tune { - -// Launch one specific kernel with the following template parameters -constexpr bool row_major = true; -using DataT = float; -using AccT = float; -using OutT = DataT; -using IdxT = int; - -using FinOpT = raft::identity_op; - -using pairwise_matrix_params = - raft::distance::detail::pairwise_matrix_params; - -// Launches kernel -void launch_kernel(pairwise_matrix_params, dim3, cudaStream_t); - -// Describes the block size that is decided by the policy -void get_block_size(int& m, int& n, int& k); - -int get_max_occupancy(); - -} // namespace raft::bench::distance::tune