From 557c2aaf8e291b6eb47211587d13c7ac56f4804b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 12 Nov 2024 20:03:37 -0500 Subject: [PATCH] Exposing kernel gramm APIs --- cpp/CMakeLists.txt | 3 + .../distance/detail/kernels/gram_matrix.cu | 478 +++++++++++ .../{gram_matrix.cuh => gram_matrix.hpp} | 231 +----- .../distance/detail/kernels/kernel_factory.cu | 59 ++ .../detail/kernels/kernel_factory.cuh | 65 -- .../detail/kernels/kernel_factory.hpp | 31 + .../detail/kernels/kernel_matrices.cu | 719 ++++++++++++++++ .../detail/kernels/kernel_matrices.cuh | 777 ------------------ .../detail/kernels/kernel_matrices.hpp | 381 +++++++++ cpp/src/distance/kernels.cuh | 4 +- 10 files changed, 1695 insertions(+), 1053 deletions(-) create mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cu rename cpp/src/distance/detail/kernels/{gram_matrix.cuh => gram_matrix.hpp} (59%) create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.hpp create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cu delete mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cuh create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9d5ec1304..a393d754c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -324,6 +324,9 @@ if(BUILD_SHARED_LIBS) src/cluster/kmeans_transform_float.cu src/cluster/single_linkage_float.cu src/core/bitset.cu + src/distance/detail/kernels/gram_matrix.cu + src/distance/detail/kernels/kernel_factory.cu + src/distance/detail/kernels/kernel_matrices.cu src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu new file mode 100644 index 000000000..f51595e7c --- /dev/null +++ b/cpp/src/distance/detail/kernels/gram_matrix.cu @@ -0,0 +1,478 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../distance.cuh" +#include "gram_matrix.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels::detail { + +/** + * Base class for general Gram matrices + * A Gram matrix is the Hermitian matrix of inner probucts G_ik = + * Here, the inner product is evaluated for all elements from vectors sets X1, + * and X2. + * + * To be more precise, on exit the output buffer will store: + * - if is_row_major == true: out[j+k*n1] = , + * - if is_row_major == false: out[j*n2 + k] = , + * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector + * from the x2 set. + */ + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * Vector sets are provided in Matrix format + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void GramMatrixBase::operator()(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + evaluate(handle, x1, x2, out, norm_x1, norm_x2); +} + +// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void GramMatrixBase::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + linear(handle, x1, x2, out); +} + +/** Evaluate the Gram matrix for two vector sets using simple dot product. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void GramMatrixBase::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Convenience function to evaluate the Gram matrix for two vector sets. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::operator()(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); + if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } + if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } + if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } + evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 + * @param ld2 leading dimension of x2 + * @param ld_out leading dimension of out + */ +template +[[deprecated]] void GramMatrixBase::linear(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + n2, + n1, + n_cols, + &alpha, + x2, + ld2, + x1, + ld1, + &beta, + out, + ld_out, + stream)); + } else { + // #TODO: Call from public API when ready + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + n1, + n2, + n_cols, + &alpha, + x1, + ld1, + x2, + ld2, + &beta, + out, + ld_out, + stream)); + } +} + +template +bool GramMatrixBase::get_is_row_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} +template +bool GramMatrixBase::get_is_row_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(1) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_output_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +template +bool GramMatrixBase::get_is_col_major(dense_input_matrix_view_t matrix) +{ + return (matrix.stride(0) == 1); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, + "GramMatrix leading dimensions for x1, x2 and out do not match"); + + // check dimensions + int n1 = out.extent(0); + int n2 = out.extent(1); + int n_cols = x1.extent(1); + ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + // extract major stride + int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); + int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + + math_t alpha = 1.0; + math_t beta = 0.0; + if (is_row_major) { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + true, + false, + n2, + n1, + n_cols, + &alpha, + x2.data_handle(), + ld2, + x1.data_handle(), + ld1, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } else { + // #TODO: Use mdspan-based API when stride-capable + // https://github.com/rapidsai/raft/issues/875 + raft::linalg::gemm(handle, + false, + true, + n1, + n2, + n_cols, + &alpha, + x1.data_handle(), + ld1, + x2.data_handle(), + ld2, + &beta, + out.data_handle(), + ld_out, + raft::resource::get_cuda_stream(handle)); + } +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check is_row_major consistency + bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); + bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); + ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match"); + + // check dimensions + auto x1_structure = x1.structure_view(); + ASSERT(x1_structure.get_n_rows() == out.extent(0), + "GramMatrix input matrix dimensions for x1 and out do not match"); + ASSERT(x2.extent(0) == out.extent(1), + "GramMatrix input matrix dimensions for x2 and out do not match"); + ASSERT(x2.extent(1) == x1_structure.get_n_cols(), + "GramMatrix input matrix dimensions for x1 and x2 do not match"); + + math_t alpha = 1.0; + math_t beta = 0.0; + + raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); +} + +/** Calculates the Gram matrix using simple dot product between vector sets. + * + * out = x1 * x2 + * + * Can be used as a building block for more complex kernel functions. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + */ +template +void GramMatrixBase::linear(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out) +{ + // check layout consistency (w.r.t. strides a matrix might be both row & col major) + bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); + bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); + + ASSERT(is_row_major_nopad || is_col_major_nopad, + "Sparse linear Kernel distance does not support ld_out parameter"); + + // switch a,b based on is_row_major + if (is_col_major_nopad) { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(1), out.extent(0)); + raft::sparse::distance::pairwise_distance( + handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } else { + auto out_row_major = raft::make_device_matrix_view( + out.data_handle(), out.extent(0), out.extent(1)); + raft::sparse::distance::pairwise_distance( + handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); + } +} + +}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.hpp similarity index 59% rename from cpp/src/distance/detail/kernels/gram_matrix.cuh rename to cpp/src/distance/detail/kernels/gram_matrix.hpp index d435fb4d1..165a81c80 100644 --- a/cpp/src/distance/detail/kernels/gram_matrix.cuh +++ b/cpp/src/distance/detail/kernels/gram_matrix.hpp @@ -16,23 +16,19 @@ #pragma once -#include "../../distance.cuh" +#include "cublas.h" #include #include +#include #include #include -// #include -#include -#include -#include -#include namespace cuvs::distance::kernels::detail { template -using dense_input_matrix_view_t = raft::device_matrix_view; +using dense_input_matrix_view_t = raft::device_matrix_view; template -using dense_output_matrix_view_t = raft::device_matrix_view; +using dense_output_matrix_view_t = raft::device_matrix_view; template using csr_input_matrix_view_t = raft::device_csr_matrix_view; @@ -76,10 +72,7 @@ class GramMatrixBase { dense_input_matrix_view_t x2, dense_output_matrix_view_t out, math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } + math_t* norm_x2 = nullptr); /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format @@ -96,10 +89,7 @@ class GramMatrixBase { dense_input_matrix_view_t x2, dense_output_matrix_view_t out, math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } + math_t* norm_x2 = nullptr); /** Convenience function to evaluate the Gram matrix for two vector sets. * Vector sets are provided in Matrix format @@ -116,10 +106,7 @@ class GramMatrixBase { csr_input_matrix_view_t x2, dense_output_matrix_view_t out, math_t* norm_x1 = nullptr, - math_t* norm_x2 = nullptr) - { - evaluate(handle, x1, x2, out, norm_x1, norm_x2); - } + math_t* norm_x2 = nullptr); // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual @@ -137,10 +124,8 @@ class GramMatrixBase { dense_input_matrix_view_t x2, dense_output_matrix_view_t out, math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } + math_t* norm_x2); + /** Evaluate the Gram matrix for two vector sets using simple dot product. * * @param [in] handle raft handle @@ -155,10 +140,8 @@ class GramMatrixBase { dense_input_matrix_view_t x2, dense_output_matrix_view_t out, math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } + math_t* norm_x2); + /** Evaluate the Gram matrix for two vector sets using simple dot product. * * @param [in] handle raft handle @@ -173,10 +156,7 @@ class GramMatrixBase { csr_input_matrix_view_t x2, dense_output_matrix_view_t out, math_t* norm_x1, - math_t* norm_x2) - { - linear(handle, x1, x2, out); - } + math_t* norm_x2); /** Evaluate the Gram matrix for two vector sets using simple dot product. * @@ -203,10 +183,7 @@ class GramMatrixBase { cudaStream_t stream, int ld1, int ld2, - int ld_out) - { - linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } + int ld_out); /** Convenience function to evaluate the Gram matrix for two vector sets. * @@ -233,14 +210,7 @@ class GramMatrixBase { cudaStream_t stream, int ld1 = 0, int ld2 = 0, - int ld_out = 0) - { - ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor."); - if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; } - if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; } - if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; } - evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - } + int ld_out = 0); protected: /** Calculates the Gram matrix using simple dot product between vector sets. @@ -272,67 +242,13 @@ class GramMatrixBase { cudaStream_t stream, int ld1, int ld2, - int ld_out) - { - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - n2, - n1, - n_cols, - &alpha, - x2, - ld2, - x1, - ld1, - &beta, - out, - ld_out, - stream)); - } else { - // #TODO: Call from public API when ready - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle, - CUBLAS_OP_N, - CUBLAS_OP_T, - n1, - n2, - n_cols, - &alpha, - x1, - ld1, - x2, - ld2, - &beta, - out, - ld_out, - stream)); - } - } + int ld_out); protected: - bool get_is_row_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_row_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(1) == 1); - } - - bool get_is_col_major(dense_output_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } - - bool get_is_col_major(dense_input_matrix_view_t matrix) - { - return (matrix.stride(0) == 1); - } + bool get_is_row_major(dense_output_matrix_view_t matrix); + bool get_is_row_major(dense_input_matrix_view_t matrix); + bool get_is_col_major(dense_output_matrix_view_t matrix); + bool get_is_col_major(dense_input_matrix_view_t matrix); /** Calculates the Gram matrix using simple dot product between vector sets. * @@ -348,67 +264,7 @@ class GramMatrixBase { void linear(raft::resources const& handle, dense_input_matrix_view_t x1, dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x1, x2 and out do not match"); - - // check dimensions - int n1 = out.extent(0); - int n2 = out.extent(1); - int n_cols = x1.extent(1); - ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - // extract major stride - int ld1 = is_row_major ? x1.stride(0) : x1.stride(1); - int ld2 = is_row_major ? x2.stride(0) : x2.stride(1); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - - math_t alpha = 1.0; - math_t beta = 0.0; - if (is_row_major) { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - true, - false, - n2, - n1, - n_cols, - &alpha, - x2.data_handle(), - ld2, - x1.data_handle(), - ld1, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } else { - // #TODO: Use mdspan-based API when stride-capable - // https://github.com/rapidsai/raft/issues/875 - raft::linalg::gemm(handle, - false, - true, - n1, - n2, - n_cols, - &alpha, - x1.data_handle(), - ld1, - x2.data_handle(), - ld2, - &beta, - out.data_handle(), - ld_out, - resource::get_cuda_stream(handle)); - } - } + dense_output_matrix_view_t out); /** Calculates the Gram matrix using simple dot product between vector sets. * @@ -424,28 +280,7 @@ class GramMatrixBase { void linear(raft::resources const& handle, csr_input_matrix_view_t x1, dense_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check is_row_major consistency - bool is_row_major = get_is_row_major(x2) && get_is_row_major(out); - bool is_col_major = get_is_col_major(x2) && get_is_col_major(out); - ASSERT(is_row_major || is_col_major, - "GramMatrix leading dimensions for x2 and out do not match"); - - // check dimensions - auto x1_structure = x1.structure_view(); - ASSERT(x1_structure.get_n_rows() == out.extent(0), - "GramMatrix input matrix dimensions for x1 and out do not match"); - ASSERT(x2.extent(0) == out.extent(1), - "GramMatrix input matrix dimensions for x2 and out do not match"); - ASSERT(x2.extent(1) == x1_structure.get_n_cols(), - "GramMatrix input matrix dimensions for x1 and x2 do not match"); - - math_t alpha = 1.0; - math_t beta = 0.0; - - raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out); - } + dense_output_matrix_view_t out); /** Calculates the Gram matrix using simple dot product between vector sets. * @@ -461,28 +296,6 @@ class GramMatrixBase { void linear(raft::resources const& handle, csr_input_matrix_view_t x1, csr_input_matrix_view_t x2, - dense_output_matrix_view_t out) - { - // check layout consistency (w.r.t. strides a matrix might be both row & col major) - bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1); - bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0); - - ASSERT(is_row_major_nopad || is_col_major_nopad, - "Sparse linear Kernel distance does not support ld_out parameter"); - - // switch a,b based on is_row_major - if (is_col_major_nopad) { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(1), out.extent(0)); - raft::sparse::distance::pairwise_distance( - handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } else { - auto out_row_major = raft::make_device_matrix_view( - out.data_handle(), out.extent(0), out.extent(1)); - raft::sparse::distance::pairwise_distance( - handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0); - } - } + dense_output_matrix_view_t out); }; - }; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu new file mode 100644 index 000000000..2d9634de2 --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_factory.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "gram_matrix.hpp" +#include "kernel_factory.hpp" + +namespace cuvs::distance::kernels::detail { + +template +GramMatrixBase* KernelFactory::create(KernelParams params) +{ + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(); break; + case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; + case TANH: res = new TanhKernel(gamma, coef0); break; + case RBF: res = new RBFKernel(gamma); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; +} + +template +[[deprecated]] GramMatrixBase* KernelFactory::create(KernelParams params, + cublasHandle_t handle) +{ + GramMatrixBase* res; + // KernelParams is not templated, we convert the parameters to math_t here: + math_t coef0 = params.coef0; + math_t gamma = params.gamma; + switch (params.kernel) { + case LINEAR: res = new GramMatrixBase(handle); break; + case POLYNOMIAL: + res = new PolynomialKernel(params.degree, gamma, coef0, handle); + break; + case TANH: res = new TanhKernel(gamma, coef0, handle); break; + case RBF: res = new RBFKernel(gamma, handle); break; + default: throw raft::exception("Kernel not implemented"); + } + return res; +} + +}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh deleted file mode 100644 index 5c50a95a3..000000000 --- a/cpp/src/distance/detail/kernels/kernel_factory.cuh +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" -#include "kernel_matrices.cuh" - -#include -#include - -namespace cuvs::distance::kernels::detail { - -template -class KernelFactory { - public: - static GramMatrixBase* create(KernelParams params) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(); break; - case POLYNOMIAL: res = new PolynomialKernel(params.degree, gamma, coef0); break; - case TANH: res = new TanhKernel(gamma, coef0); break; - case RBF: res = new RBFKernel(gamma); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } - - [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle) - { - GramMatrixBase* res; - // KernelParams is not templated, we convert the parameters to math_t here: - math_t coef0 = params.coef0; - math_t gamma = params.gamma; - switch (params.kernel) { - case LINEAR: res = new GramMatrixBase(handle); break; - case POLYNOMIAL: - res = new PolynomialKernel(params.degree, gamma, coef0, handle); - break; - case TANH: res = new TanhKernel(gamma, coef0, handle); break; - case RBF: res = new RBFKernel(gamma, handle); break; - default: throw raft::exception("Kernel not implemented"); - } - return res; - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_factory.hpp b/cpp/src/distance/detail/kernels/kernel_factory.hpp new file mode 100644 index 000000000..cc767fbd5 --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_factory.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gram_matrix.hpp" +#include "kernel_matrices.hpp" + +namespace cuvs::distance::kernels::detail { + +template +class KernelFactory { + public: + static GramMatrixBase* create(KernelParams params); + [[deprecated]] static GramMatrixBase* create(KernelParams params, cublasHandle_t handle); +}; + +}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu new file mode 100644 index 000000000..2ecceb8ed --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../../../distance/distance.cuh" +#include "gram_matrix.hpp" +#include "kernel_matrices.hpp" +#include "rbf_fin_op.cuh" +#include +#include +#include +#include +#include + +namespace cuvs::distance::kernels::detail { + +/** Epiloge function for polynomial kernel without padding. + * Calculates output = (gain*in + offset)^exponent + * @param inout device vector in column major format, size [len] + * @param len array length + * @param exponent + * @param gain + * @param offset + */ +template +RAFT_KERNEL polynomial_kernel_nopad( + math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = pow(gain * inout[tid] + offset, exponent); + } +} + +/** Epiloge function for polynomial kernel with padding. + * Calculates output = (gain*input + offset)^exponent + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param exponent + * @param gain + * @param offset + */ +template +RAFT_KERNEL polynomial_kernel( + math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector, size [len] + * @param len length of the input vector + * @param gain + * @param offset + */ +template +RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) +{ + for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; + tid += blockDim.x * gridDim.x) { + inout[tid] = tanh(gain * inout[tid] + offset); + } +} + +/** Epiloge function for tanh kernel without padding. + * Calculates output = tanh(gain*input + offset) + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param gain + * @param offset + */ +template +RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); + } +} + +/** Epiloge function for rbf kernel using expansion. + * + * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); + * + * Intended usage + * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk + * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X + * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y + * + * @param inout device vector in column major format, size [ld * cols] + * @param ld leading dimension of the inout buffer + * @param rows number of rows (rows <= ld) + * @param cols number of columns + * @param norm_x l2-norm of X's rows + * @param norm_y l2-norm of Y's rows + * @param gain + */ +template +RAFT_KERNEL rbf_kernel_expanded( + math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) +{ + for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; + tidy += blockDim.y * gridDim.y) { + math_t norm_y_val = norm_y[tidy]; + for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; + tidx += blockDim.x * gridDim.x) { + inout[tidx + tidy * ld] = + exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); + } + } +} + +std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) +{ + dim3 block_shape = dim3(32, 4); + const int num_blocks_x = raft::ceildiv(n1, 32); + const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); + dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); + return std::make_tuple(grid_shape, block_shape); +} + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +void PolynomialKernel::applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) +{ + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, exponent, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + polynomial_kernel<<>>( + inout, ld, n1, n2, exponent, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void PolynomialKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void PolynomialKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); +} + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +void TanhKernel::applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) +{ + const int n_minor = is_row_major ? cols : rows; + if (ld == n_minor) { + tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( + inout, rows * cols, gain, offset); + } else { + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); + } + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ +template +void TanhKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void TanhKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + GramMatrixBase::linear( + x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); + applyKernel(out, ld_out, n1, n2, is_row_major, stream); +} + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +void RBFKernel::applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream) +{ + int n1 = is_row_major ? cols : rows; + int n2 = is_row_major ? rows : cols; + math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; + math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; + auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); + rbf_kernel_expanded<<>>( + inout, ld, n1, n2, norm_n1, norm_n2, gain); +} + +template +void matrixRowNormL2(raft::resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target) +{ + bool is_row_major = GramMatrixBase::get_is_row_major(matrix); + int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); + int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); + ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); + raft::linalg::rowNorm(target, + matrix.data_handle(), + matrix.extent(1), + matrix.extent(0), + raft::linalg::NormType::L2Norm, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +template +void RBFKernel::matrixRowNormL2(raft::resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target) +{ + auto matrix_structure = matrix.structure_view(); + raft::sparse::linalg::rowNormCsr(handle, + matrix_structure.get_indptr().data(), + matrix.get_elements().data(), + matrix_structure.get_nnz(), + matrix_structure.get_n_rows(), + target, + raft::linalg::NormType::L2Norm); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void RBFKernel::evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.extent(0), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void RBFKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.extent(0), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ +template +void RBFKernel::evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2) +{ + cudaStream_t stream = raft::resource::get_cuda_stream(handle); + + // lazy compute norms if not given + rmm::device_uvector tmp_norm_x1(0, stream); + rmm::device_uvector tmp_norm_x2(0, stream); + if (norm_x1 == nullptr) { + tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); + norm_x1 = tmp_norm_x1.data(); + matrixRowNormL2(handle, x1, norm_x1); + } + if (norm_x2 == nullptr) { + tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); + norm_x2 = tmp_norm_x2.data(); + matrixRowNormL2(handle, x2, norm_x2); + } + + // compute L2expanded + bool is_row_major = GramMatrixBase::get_is_row_major(out); + int ld_out = is_row_major ? out.stride(0) : out.stride(1); + GramMatrixBase::linear(handle, x1, x2, out); + applyKernel(out.data_handle(), + ld_out, + out.extent(0), + out.extent(1), + norm_x1, + norm_x2, + is_row_major, + raft::resource::get_cuda_stream(handle)); +} + +/** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ +template +[[deprecated]] void RBFKernel::evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out) +{ + ASSERT(GramMatrixBase::legacy_interface, + "Legacy interface can only be used with legacy ctor."); + int minor1 = is_row_major ? n_cols : n1; + int minor2 = is_row_major ? n_cols : n2; + int minor_out = is_row_major ? n2 : n1; + ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); + ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); + ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); + + math_t gain = this->gain; + using index_t = int64_t; + + rbf_fin_op fin_op{gain}; + + raft::resources handle; + raft::resource::set_cuda_stream(handle, stream); + + cuvs::distance::distance(handle, + const_cast(x1), + const_cast(x2), + out, + n1, + n2, + n_cols, + NULL, + 0, + fin_op, + is_row_major); +} + +}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh deleted file mode 100644 index bff5bda92..000000000 --- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh +++ /dev/null @@ -1,777 +0,0 @@ -/* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "gram_matrix.cuh" - -#include "../detail/kernels/rbf_fin_op.cuh" -#include -#include -#include -#include -#include - -namespace cuvs::distance::kernels::detail { - -/** Epiloge function for polynomial kernel without padding. - * Calculates output = (gain*in + offset)^exponent - * @param inout device vector in column major format, size [len] - * @param len array length - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel_nopad( - math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = pow(gain * inout[tid] + offset, exponent); - } -} - -/** Epiloge function for polynomial kernel with padding. - * Calculates output = (gain*input + offset)^exponent - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param exponent - * @param gain - * @param offset - */ -template -RAFT_KERNEL polynomial_kernel( - math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector, size [len] - * @param len length of the input vector - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset) -{ - for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len; - tid += blockDim.x * gridDim.x) { - inout[tid] = tanh(gain * inout[tid] + offset); - } -} - -/** Epiloge function for tanh kernel without padding. - * Calculates output = tanh(gain*input + offset) - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param gain - * @param offset - */ -template -RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset); - } -} - -/** Epiloge function for rbf kernel using expansion. - * - * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij)); - * - * Intended usage - * - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk - * - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X - * - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y - * - * @param inout device vector in column major format, size [ld * cols] - * @param ld leading dimension of the inout buffer - * @param rows number of rows (rows <= ld) - * @param cols number of columns - * @param norm_x l2-norm of X's rows - * @param norm_y l2-norm of Y's rows - * @param gain - */ -template -RAFT_KERNEL rbf_kernel_expanded( - math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain) -{ - for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols; - tidy += blockDim.y * gridDim.y) { - math_t norm_y_val = norm_y[tidy]; - for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows; - tidx += blockDim.x * gridDim.x) { - inout[tidx + tidy * ld] = - exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2)); - } - } -} - -namespace { -std::tuple generateLaunchConfig2dElementwiseOp(int n1, int n2) -{ - dim3 block_shape = dim3(32, 4); - const int num_blocks_x = raft::ceildiv(n1, 32); - const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1); - dim3 grid_shape = dim3(num_blocks_x, num_blocks_y); - return std::make_tuple(grid_shape, block_shape); -} -} // namespace - -/** - * Create a kernel matrix using polynomial kernel function. - */ -template -class PolynomialKernel : public GramMatrixBase { - exp_t exponent; - math_t gain; - math_t offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - polynomial_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, exponent, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - polynomial_kernel<<>>( - inout, ld, n1, n2, exponent, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a polynomial kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = (gain* + offset)^exponent - * - * @tparam math_t floating point type - * @tparam exp_t type of exponent - * @param exponent - * @param gain - * @param offset - */ - PolynomialKernel(exp_t exponent, math_t gain, math_t offset) - : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset) - { - } - - [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using polynomial kernel. - * - * output[i,k] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using tanh kernel function. - */ -template -class TanhKernel : public GramMatrixBase { - math_t gain, offset; - - void applyKernel( - math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream) - { - const int n_minor = is_row_major ? cols : rows; - if (ld == n_minor) { - tanh_kernel_nopad<<((size_t)rows * cols, 128), 128, 0, stream>>>( - inout, rows * cols, gain, offset); - } else { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - tanh_kernel<<>>(inout, ld, n1, n2, gain, offset); - } - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } - - public: - /** - * Constructs a tanh kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = tanh(gain* + offset) - * - * @tparam math_t floating point type - * @param gain - * @param offset - */ - TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} - - [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain), offset(offset) - { - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using tanh kernel. - * - * output_[i + k*n1] = (gain* + offset)^exponent, - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and < , > denotes dot product. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 unused. - * @param norm_x2 unused. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - GramMatrixBase::linear( - x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out); - applyKernel(out, ld_out, n1, n2, is_row_major, stream); - } -}; - -/** - * Create a kernel matrix using RBF kernel function. - */ -template -class RBFKernel : public GramMatrixBase { - math_t gain; - - void applyKernel(math_t* inout, - int ld, - int rows, - int cols, - math_t* norm_x1, - math_t* norm_x2, - bool is_row_major, - cudaStream_t stream) - { - int n1 = is_row_major ? cols : rows; - int n2 = is_row_major ? rows : cols; - math_t* norm_n1 = is_row_major ? norm_x2 : norm_x1; - math_t* norm_n2 = is_row_major ? norm_x1 : norm_x2; - auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2); - rbf_kernel_expanded<<>>( - inout, ld, n1, n2, norm_n1, norm_n2, gain); - } - - public: - /** - * Constructs a RBF kernel object. - * It evaluates the kernel matrix using the following formula: - * K_ij = exp(-gain*|x1_i- x2_k|^2) - * - * @tparam math_t floating point type - * @param gain - */ - RBFKernel(math_t gain) : GramMatrixBase(), gain(gain) {} - - [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) - : GramMatrixBase(handle), gain(gain) - { - } - - void matrixRowNormL2(raft::resources const& handle, - dense_input_matrix_view_t matrix, - math_t* target) - { - bool is_row_major = GramMatrixBase::get_is_row_major(matrix); - int minor = is_row_major ? matrix.extent(1) : matrix.extent(0); - int ld = is_row_major ? matrix.stride(0) : matrix.stride(1); - ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter"); - raft::linalg::rowNorm(target, - matrix.data_handle(), - matrix.extent(1), - matrix.extent(0), - raft::linalg::NormType::L2Norm, - is_row_major, - resource::get_cuda_stream(handle)); - } - - void matrixRowNormL2(raft::resources const& handle, - csr_input_matrix_view_t matrix, - math_t* target) - { - auto matrix_structure = matrix.structure_view(); - raft::sparse::linalg::rowNormCsr(handle, - matrix_structure.get_indptr().data(), - matrix.get_elements().data(), - matrix_structure.get_nnz(), - matrix_structure.get_n_rows(), - target, - raft::linalg::NormType::L2Norm); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 dense device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - dense_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.extent(0), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 dense device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - dense_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.extent(0), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate kernel matrix using RBF kernel. - * - * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), - * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector - * in the x2 set, and | | euclidean distance. - * - * @param [in] handle raft handle - * @param [in] x1 csr device matrix view, size [n1*n_cols] - * @param [in] x2 csr device matrix view, size [n2*n_cols] - * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] - * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. - * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. - */ - void evaluate(raft::resources const& handle, - csr_input_matrix_view_t x1, - csr_input_matrix_view_t x2, - dense_output_matrix_view_t out, - math_t* norm_x1, - math_t* norm_x2) - { - cudaStream_t stream = resource::get_cuda_stream(handle); - - // lazy compute norms if not given - rmm::device_uvector tmp_norm_x1(0, stream); - rmm::device_uvector tmp_norm_x2(0, stream); - if (norm_x1 == nullptr) { - tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream); - norm_x1 = tmp_norm_x1.data(); - matrixRowNormL2(handle, x1, norm_x1); - } - if (norm_x2 == nullptr) { - tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream); - norm_x2 = tmp_norm_x2.data(); - matrixRowNormL2(handle, x2, norm_x2); - } - - // compute L2expanded - bool is_row_major = GramMatrixBase::get_is_row_major(out); - int ld_out = is_row_major ? out.stride(0) : out.stride(1); - GramMatrixBase::linear(handle, x1, x2, out); - applyKernel(out.data_handle(), - ld_out, - out.extent(0), - out.extent(1), - norm_x1, - norm_x2, - is_row_major, - resource::get_cuda_stream(handle)); - } - - /** Evaluate the Gram matrix using the legacy interface. - * - * @param [in] x1 device array of vectors, size [n1*n_cols] - * @param [in] n1 number vectors in x1 - * @param [in] n_cols number of columns (features) in x1 and x2 - * @param [in] x2 device array of vectors, size [n2*n_cols] - * @param [in] n2 number vectors in x2 - * @param [out] out device buffer to store the Gram matrix, size [n1*n2] - * @param [in] is_row_major whether the input and output matrices are in row - * major format - * @param [in] stream cuda stream - * @param ld1 leading dimension of x1 (usually it is n1) - * @param ld2 leading dimension of x2 (usually it is n2) - * @param ld_out leading dimension of out (usually it is n1) - */ - [[deprecated]] void evaluate(const math_t* x1, - int n1, - int n_cols, - const math_t* x2, - int n2, - math_t* out, - bool is_row_major, - cudaStream_t stream, - int ld1, - int ld2, - int ld_out) - { - ASSERT(GramMatrixBase::legacy_interface, - "Legacy interface can only be used with legacy ctor."); - int minor1 = is_row_major ? n_cols : n1; - int minor2 = is_row_major ? n_cols : n2; - int minor_out = is_row_major ? n2 : n1; - ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter"); - ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter"); - ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter"); - - math_t gain = this->gain; - using index_t = int64_t; - - rbf_fin_op fin_op{gain}; - - raft::resources handle; - resource::set_cuda_stream(handle, stream); - - cuvs::distance::distance(handle, - const_cast(x1), - const_cast(x2), - out, - n1, - n2, - n_cols, - NULL, - 0, - fin_op, - is_row_major); - } -}; - -}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.hpp b/cpp/src/distance/detail/kernels/kernel_matrices.hpp new file mode 100644 index 000000000..d675f3ceb --- /dev/null +++ b/cpp/src/distance/detail/kernels/kernel_matrices.hpp @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "gram_matrix.hpp" +#include + +namespace cuvs::distance::kernels::detail { + +/** + * Create a kernel matrix using polynomial kernel function. + */ +template +class PolynomialKernel : public GramMatrixBase { + exp_t exponent; + math_t gain; + math_t offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream); + + public: + /** + * Constructs a polynomial kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = (gain* + offset)^exponent + * + * @tparam math_t floating point type + * @tparam exp_t type of exponent + * @param exponent + * @param gain + * @param offset + */ + PolynomialKernel(exp_t exponent, math_t gain, math_t offset) + : GramMatrixBase(), exponent(exponent), gain(gain), offset(offset){}; + + [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), exponent(exponent), gain(gain), offset(offset){}; + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using polynomial kernel. + * + * output[i,k] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +/** + * Create a kernel matrix using tanh kernel function. + */ +template +class TanhKernel : public GramMatrixBase { + math_t gain, offset; + + void applyKernel( + math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream); + + public: + /** + * Constructs a tanh kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = tanh(gain* + offset) + * + * @tparam math_t floating point type + * @param gain + * @param offset + */ + TanhKernel(math_t gain, math_t offset) : GramMatrixBase(), gain(gain), offset(offset) {} + + [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain), offset(offset){}; + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using tanh kernel. + * + * output_[i + k*n1] = (gain* + offset)^exponent, + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and < , > denotes dot product. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 unused. + * @param norm_x2 unused. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +/** + * Create a kernel matrix using RBF kernel function. + */ +template +class RBFKernel : public GramMatrixBase { + math_t gain; + + void applyKernel(math_t* inout, + int ld, + int rows, + int cols, + math_t* norm_x1, + math_t* norm_x2, + bool is_row_major, + cudaStream_t stream); + + public: + /** + * Constructs a RBF kernel object. + * It evaluates the kernel matrix using the following formula: + * K_ij = exp(-gain*|x1_i- x2_k|^2) + * + * @tparam math_t floating point type + * @param gain + */ + RBFKernel(math_t gain) : GramMatrixBase(), gain(gain){}; + + [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle) + : GramMatrixBase(handle), gain(gain){}; + + void matrixRowNormL2(raft::resources const& handle, + dense_input_matrix_view_t matrix, + math_t* target); + + void matrixRowNormL2(raft::resources const& handle, + csr_input_matrix_view_t matrix, + math_t* target); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 dense device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + dense_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 dense device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + dense_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate kernel matrix using RBF kernel. + * + * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2), + * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector + * in the x2 set, and | | euclidean distance. + * + * @param [in] handle raft handle + * @param [in] x1 csr device matrix view, size [n1*n_cols] + * @param [in] x2 csr device matrix view, size [n2*n_cols] + * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] + * @param norm_x1 optional L2-norm of x1's rows for computation within RBF. + * @param norm_x2 optional L2-norm of x2's rows for computation within RBF. + */ + void evaluate(raft::resources const& handle, + csr_input_matrix_view_t x1, + csr_input_matrix_view_t x2, + dense_output_matrix_view_t out, + math_t* norm_x1, + math_t* norm_x2); + + /** Evaluate the Gram matrix using the legacy interface. + * + * @param [in] x1 device array of vectors, size [n1*n_cols] + * @param [in] n1 number vectors in x1 + * @param [in] n_cols number of columns (features) in x1 and x2 + * @param [in] x2 device array of vectors, size [n2*n_cols] + * @param [in] n2 number vectors in x2 + * @param [out] out device buffer to store the Gram matrix, size [n1*n2] + * @param [in] is_row_major whether the input and output matrices are in row + * major format + * @param [in] stream cuda stream + * @param ld1 leading dimension of x1 (usually it is n1) + * @param ld2 leading dimension of x2 (usually it is n2) + * @param ld_out leading dimension of out (usually it is n1) + */ + [[deprecated]] void evaluate(const math_t* x1, + int n1, + int n_cols, + const math_t* x2, + int n2, + math_t* out, + bool is_row_major, + cudaStream_t stream, + int ld1, + int ld2, + int ld_out); +}; + +}; // end namespace cuvs::distance::kernels::detail diff --git a/cpp/src/distance/kernels.cuh b/cpp/src/distance/kernels.cuh index 5983f1692..ca73eadc0 100644 --- a/cpp/src/distance/kernels.cuh +++ b/cpp/src/distance/kernels.cuh @@ -16,8 +16,8 @@ #pragma once -#include "detail/kernels/gram_matrix.cuh" -#include "detail/kernels/kernel_factory.cuh" +#include "detail/kernels/gram_matrix.hpp" +#include "detail/kernels/kernel_factory.hpp" namespace cuvs::distance::kernels {