From 557c2aaf8e291b6eb47211587d13c7ac56f4804b Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 12 Nov 2024 20:03:37 -0500
Subject: [PATCH] Exposing kernel gramm APIs

---
 cpp/CMakeLists.txt                            |   3 +
 .../distance/detail/kernels/gram_matrix.cu    | 478 +++++++++++
 .../{gram_matrix.cuh => gram_matrix.hpp}      | 231 +-----
 .../distance/detail/kernels/kernel_factory.cu |  59 ++
 .../detail/kernels/kernel_factory.cuh         |  65 --
 .../detail/kernels/kernel_factory.hpp         |  31 +
 .../detail/kernels/kernel_matrices.cu         | 719 ++++++++++++++++
 .../detail/kernels/kernel_matrices.cuh        | 777 ------------------
 .../detail/kernels/kernel_matrices.hpp        | 381 +++++++++
 cpp/src/distance/kernels.cuh                  |   4 +-
 10 files changed, 1695 insertions(+), 1053 deletions(-)
 create mode 100644 cpp/src/distance/detail/kernels/gram_matrix.cu
 rename cpp/src/distance/detail/kernels/{gram_matrix.cuh => gram_matrix.hpp} (59%)
 create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cu
 delete mode 100644 cpp/src/distance/detail/kernels/kernel_factory.cuh
 create mode 100644 cpp/src/distance/detail/kernels/kernel_factory.hpp
 create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cu
 delete mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.cuh
 create mode 100644 cpp/src/distance/detail/kernels/kernel_matrices.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9d5ec1304..a393d754c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -324,6 +324,9 @@ if(BUILD_SHARED_LIBS)
     src/cluster/kmeans_transform_float.cu
     src/cluster/single_linkage_float.cu
     src/core/bitset.cu
+    src/distance/detail/kernels/gram_matrix.cu
+    src/distance/detail/kernels/kernel_factory.cu
+    src/distance/detail/kernels/kernel_matrices.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_half_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_canberra_double_double_double_int.cu
diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cu b/cpp/src/distance/detail/kernels/gram_matrix.cu
new file mode 100644
index 000000000..f51595e7c
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/gram_matrix.cu
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../distance.cuh"
+#include "gram_matrix.hpp"
+#include <cuvs/distance/distance.hpp>
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/sparse/distance/distance.cuh>
+#include <raft/sparse/linalg/spmm.hpp>
+
+namespace cuvs::distance::kernels::detail {
+
+/**
+ * Base class for general Gram matrices
+ * A Gram matrix is the Hermitian matrix of inner products G_ik = <x_i, x_k>
+ * Here, the inner product is evaluated for all pairs of vectors from the
+ * vector sets X1 and X2.
+ *
+ * To be more precise, on exit the output buffer (with default ld_out) will store:
+ * - if is_row_major == true: out[j*n2 + k] = <x1_j, x2_k>,
+ * - if is_row_major == false: out[j + k*n1] = <x1_j, x2_k>,
+ * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
+ * from the x2 set.
+ */
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *  Both vector sets are dense matrix views; the call is forwarded to evaluate().
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional (defaults to nullptr) L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional (defaults to nullptr) L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+                                        dense_input_matrix_view_t<math_t> x1,
+                                        dense_input_matrix_view_t<math_t> x2,
+                                        dense_output_matrix_view_t<math_t> out,
+                                        math_t* norm_x1,
+                                        math_t* norm_x2)
+{
+  evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *  x1 is a CSR matrix view, x2 a dense matrix view; the call is forwarded to evaluate().
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional (defaults to nullptr) L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional (defaults to nullptr) L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> x1,
+                                        dense_input_matrix_view_t<math_t> x2,
+                                        dense_output_matrix_view_t<math_t> out,
+                                        math_t* norm_x1,
+                                        math_t* norm_x2)
+{
+  evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *  Both vector sets are CSR matrix views; the call is forwarded to evaluate().
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional (defaults to nullptr) L2-norm of x1's rows for computation within RBF.
+ * @param norm_x2 optional (defaults to nullptr) L2-norm of x2's rows for computation within RBF.
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::operator()(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> x1,
+                                        csr_input_matrix_view_t<math_t> x2,
+                                        dense_output_matrix_view_t<math_t> out,
+                                        math_t* norm_x1,
+                                        math_t* norm_x2)
+{
+  evaluate(handle, x1, x2, out, norm_x1, norm_x2);
+}
+
+// unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
+
+/** Evaluate the Gram matrix for two dense vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused by this implementation (not forwarded to linear()).
+ * @param norm_x2 unused by this implementation (not forwarded to linear()).
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::evaluate(raft::resources const& handle,
+                                      dense_input_matrix_view_t<math_t> x1,
+                                      dense_input_matrix_view_t<math_t> x2,
+                                      dense_output_matrix_view_t<math_t> out,
+                                      math_t* norm_x1,
+                                      math_t* norm_x2)
+{
+  linear(handle, x1, x2, out);  // plain dot products; the row norms are not needed
+}
+/** Evaluate the Gram matrix for a CSR and a dense vector set using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused by this implementation (not forwarded to linear()).
+ * @param norm_x2 unused by this implementation (not forwarded to linear()).
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::evaluate(raft::resources const& handle,
+                                      csr_input_matrix_view_t<math_t> x1,
+                                      dense_input_matrix_view_t<math_t> x2,
+                                      dense_output_matrix_view_t<math_t> out,
+                                      math_t* norm_x1,
+                                      math_t* norm_x2)
+{
+  linear(handle, x1, x2, out);  // plain dot products; the row norms are not needed
+}
+/** Evaluate the Gram matrix for two CSR vector sets using simple dot product.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused by this implementation (not forwarded to linear()).
+ * @param norm_x2 unused by this implementation (not forwarded to linear()).
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::evaluate(raft::resources const& handle,
+                                      csr_input_matrix_view_t<math_t> x1,
+                                      csr_input_matrix_view_t<math_t> x2,
+                                      dense_output_matrix_view_t<math_t> out,
+                                      math_t* norm_x1,
+                                      math_t* norm_x2)
+{
+  linear(handle, x1, x2, out);  // plain dot products; the row norms are not needed
+}
+
+/** Evaluate the Gram matrix for two vector sets using simple dot product.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (operator() defaults it to n_cols if row major, else n1)
+ * @param ld2 leading dimension of x2 (operator() defaults it to n_cols if row major, else n2)
+ * @param ld_out leading dimension of out (operator() defaults it to n2 if row major, else n1)
+ */
+template <typename math_t>
+[[deprecated]] void GramMatrixBase<math_t>::evaluate(const math_t* x1,
+                                                     int n1,
+                                                     int n_cols,
+                                                     const math_t* x2,
+                                                     int n2,
+                                                     math_t* out,
+                                                     bool is_row_major,
+                                                     cudaStream_t stream,
+                                                     int ld1,
+                                                     int ld2,
+                                                     int ld_out)
+{
+  linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+}
+
+/** Convenience function to evaluate the Gram matrix for two vector sets.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1 (<= 0 selects n_cols if row major, else n1)
+ * @param ld2 leading dimension of x2 (<= 0 selects n_cols if row major, else n2)
+ * @param ld_out leading dimension of out (<= 0 selects n2 if row major, else n1)
+ */
+template <typename math_t>
+[[deprecated]] void GramMatrixBase<math_t>::operator()(const math_t* x1,
+                                                       int n1,
+                                                       int n_cols,
+                                                       const math_t* x2,
+                                                       int n2,
+                                                       math_t* out,
+                                                       bool is_row_major,
+                                                       cudaStream_t stream,
+                                                       int ld1,
+                                                       int ld2,
+                                                       int ld_out)
+{
+  ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor.");
+  if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; }  // default: unpadded x1
+  if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; }  // default: unpadded x2
+  if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; }  // default: unpadded out
+  evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2^T
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1
+ * @param ld2 leading dimension of x2
+ * @param ld_out leading dimension of out
+ */
+template <typename math_t>
+[[deprecated]] void GramMatrixBase<math_t>::linear(const math_t* x1,
+                                                   int n1,
+                                                   int n_cols,
+                                                   const math_t* x2,
+                                                   int n2,
+                                                   math_t* out,
+                                                   bool is_row_major,
+                                                   cudaStream_t stream,
+                                                   int ld1,
+                                                   int ld2,
+                                                   int ld_out)
+{
+  math_t alpha = 1.0;
+  math_t beta  = 0.0;
+  if (is_row_major) {  // row-major data: column-major cuBLAS computes out^T = x2 * x1^T
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                     CUBLAS_OP_T,
+                                                     CUBLAS_OP_N,
+                                                     n2,
+                                                     n1,
+                                                     n_cols,
+                                                     &alpha,
+                                                     x2,
+                                                     ld2,
+                                                     x1,
+                                                     ld1,
+                                                     &beta,
+                                                     out,
+                                                     ld_out,
+                                                     stream));
+  } else {  // column-major data: out = x1 * x2^T is computed directly
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                     CUBLAS_OP_N,
+                                                     CUBLAS_OP_T,
+                                                     n1,
+                                                     n2,
+                                                     n_cols,
+                                                     &alpha,
+                                                     x1,
+                                                     ld1,
+                                                     x2,
+                                                     ld2,
+                                                     &beta,
+                                                     out,
+                                                     ld_out,
+                                                     stream));
+  }
+}
+
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_row_major(dense_output_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(1) == 1);  // unit stride along dimension 1 (padding in dim 0 allowed)
+}
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_row_major(dense_input_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(1) == 1);  // unit stride along dimension 1 (padding in dim 0 allowed)
+}
+
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_col_major(dense_output_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(0) == 1);  // unit stride along dimension 0 (padding in dim 1 allowed)
+}
+
+template <typename math_t>
+bool GramMatrixBase<math_t>::get_is_col_major(dense_input_matrix_view_t<math_t> matrix)
+{
+  return (matrix.stride(0) == 1);  // unit stride along dimension 0 (padding in dim 1 allowed)
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2^T
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::linear(raft::resources const& handle,
+                                    dense_input_matrix_view_t<math_t> x1,
+                                    dense_input_matrix_view_t<math_t> x2,
+                                    dense_output_matrix_view_t<math_t> out)
+{
+  // all three views must share one layout: unit stride on dim 1 (row) or on dim 0 (col)
+  bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out);
+  bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out);
+  ASSERT(is_row_major || is_col_major,
+         "GramMatrix leading dimensions for x1, x2 and out do not match");
+
+  // check dimensions: out is [n1 x n2], x1 is [n1 x n_cols], x2 is [n2 x n_cols]
+  int n1     = out.extent(0);
+  int n2     = out.extent(1);
+  int n_cols = x1.extent(1);
+  ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match");
+  ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match");
+  ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match");
+
+  // extract major stride (the leading dimension handed to gemm)
+  int ld1    = is_row_major ? x1.stride(0) : x1.stride(1);
+  int ld2    = is_row_major ? x2.stride(0) : x2.stride(1);
+  int ld_out = is_row_major ? out.stride(0) : out.stride(1);
+
+  math_t alpha = 1.0;
+  math_t beta  = 0.0;
+  if (is_row_major) {  // operands are swapped here: (x2, x1) with sizes (n2, n1)
+    // #TODO: Use mdspan-based API when stride-capable
+    // https://github.com/rapidsai/raft/issues/875
+    raft::linalg::gemm(handle,
+                       true,
+                       false,
+                       n2,
+                       n1,
+                       n_cols,
+                       &alpha,
+                       x2.data_handle(),
+                       ld2,
+                       x1.data_handle(),
+                       ld1,
+                       &beta,
+                       out.data_handle(),
+                       ld_out,
+                       raft::resource::get_cuda_stream(handle));
+  } else {  // column-major: operands in natural order (x1, x2) with sizes (n1, n2)
+    // #TODO: Use mdspan-based API when stride-capable
+    // https://github.com/rapidsai/raft/issues/875
+    raft::linalg::gemm(handle,
+                       false,
+                       true,
+                       n1,
+                       n2,
+                       n_cols,
+                       &alpha,
+                       x1.data_handle(),
+                       ld1,
+                       x2.data_handle(),
+                       ld2,
+                       &beta,
+                       out.data_handle(),
+                       ld_out,
+                       raft::resource::get_cuda_stream(handle));
+  }
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2^T
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::linear(raft::resources const& handle,
+                                    csr_input_matrix_view_t<math_t> x1,
+                                    dense_input_matrix_view_t<math_t> x2,
+                                    dense_output_matrix_view_t<math_t> out)
+{
+  // check is_row_major consistency (both flags are only consumed by the assertion below)
+  bool is_row_major = get_is_row_major(x2) && get_is_row_major(out);
+  bool is_col_major = get_is_col_major(x2) && get_is_col_major(out);
+  ASSERT(is_row_major || is_col_major, "GramMatrix leading dimensions for x2 and out do not match");
+
+  // check dimensions: out is [rows(x1) x rows(x2)], x1 and x2 share the column count
+  auto x1_structure = x1.structure_view();
+  ASSERT(x1_structure.get_n_rows() == out.extent(0),
+         "GramMatrix input matrix dimensions for x1 and out do not match");
+  ASSERT(x2.extent(0) == out.extent(1),
+         "GramMatrix input matrix dimensions for x2 and out do not match");
+  ASSERT(x2.extent(1) == x1_structure.get_n_cols(),
+         "GramMatrix input matrix dimensions for x1 and x2 do not match");
+
+  math_t alpha = 1.0;
+  math_t beta  = 0.0;
+  // NOTE(review): flags presumably mean op(x1) = x1, op(x2) = x2^T -- confirm against spmm docs
+  raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out);
+}
+
+/** Calculates the Gram matrix using simple dot product between vector sets.
+ *
+ * out = x1 * x2^T
+ *
+ * Can be used as a building block for more complex kernel functions.
+ *
+ * @param [in] handle raft handle
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2] (no padding)
+ */
+template <typename math_t>
+void GramMatrixBase<math_t>::linear(raft::resources const& handle,
+                                    csr_input_matrix_view_t<math_t> x1,
+                                    csr_input_matrix_view_t<math_t> x2,
+                                    dense_output_matrix_view_t<math_t> out)
+{
+  // check layout consistency (w.r.t. strides a matrix might be both row & col major)
+  bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1);
+  bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0);
+
+  ASSERT(is_row_major_nopad || is_col_major_nopad,
+         "Sparse linear Kernel distance does not support ld_out parameter");
+
+  // col-major out: swap x1/x2 and write the transposed [n2 x n1] row-major result instead
+  if (is_col_major_nopad) {
+    auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
+      out.data_handle(), out.extent(1), out.extent(0));
+    raft::sparse::distance::pairwise_distance(
+      handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
+  } else {
+    auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
+      out.data_handle(), out.extent(0), out.extent(1));
+    raft::sparse::distance::pairwise_distance(
+      handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
+  }
+}
+
+};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/gram_matrix.cuh b/cpp/src/distance/detail/kernels/gram_matrix.hpp
similarity index 59%
rename from cpp/src/distance/detail/kernels/gram_matrix.cuh
rename to cpp/src/distance/detail/kernels/gram_matrix.hpp
index d435fb4d1..165a81c80 100644
--- a/cpp/src/distance/detail/kernels/gram_matrix.cuh
+++ b/cpp/src/distance/detail/kernels/gram_matrix.hpp
@@ -16,23 +16,19 @@
 
 #pragma once
 
-#include "../../distance.cuh"
+#include "cublas.h"
 #include <cuvs/distance/distance.hpp>
 #include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/mdspan.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
-// #include <raft/sparse/detail/cusparse_wrappers.h>
-#include <raft/linalg/detail/cublas_wrappers.hpp>
-#include <raft/linalg/gemm.cuh>
-#include <raft/sparse/distance/distance.cuh>
-#include <raft/sparse/linalg/spmm.hpp>
 
 namespace cuvs::distance::kernels::detail {
 
 template <typename math_t>
-using dense_input_matrix_view_t = raft::device_matrix_view<const math_t, int, layout_stride>;
+using dense_input_matrix_view_t = raft::device_matrix_view<const math_t, int, raft::layout_stride>;
 template <typename math_t>
-using dense_output_matrix_view_t = raft::device_matrix_view<math_t, int, layout_stride>;
+using dense_output_matrix_view_t = raft::device_matrix_view<math_t, int, raft::layout_stride>;
 template <typename math_t>
 using csr_input_matrix_view_t = raft::device_csr_matrix_view<const math_t, int, int, int>;
 
@@ -76,10 +72,7 @@ class GramMatrixBase {
                   dense_input_matrix_view_t<math_t> x2,
                   dense_output_matrix_view_t<math_t> out,
                   math_t* norm_x1 = nullptr,
-                  math_t* norm_x2 = nullptr)
-  {
-    evaluate(handle, x1, x2, out, norm_x1, norm_x2);
-  }
+                  math_t* norm_x2 = nullptr);
 
   /** Convenience function to evaluate the Gram matrix for two vector sets.
    *  Vector sets are provided in Matrix format
@@ -96,10 +89,7 @@ class GramMatrixBase {
                   dense_input_matrix_view_t<math_t> x2,
                   dense_output_matrix_view_t<math_t> out,
                   math_t* norm_x1 = nullptr,
-                  math_t* norm_x2 = nullptr)
-  {
-    evaluate(handle, x1, x2, out, norm_x1, norm_x2);
-  }
+                  math_t* norm_x2 = nullptr);
 
   /** Convenience function to evaluate the Gram matrix for two vector sets.
    *  Vector sets are provided in Matrix format
@@ -116,10 +106,7 @@ class GramMatrixBase {
                   csr_input_matrix_view_t<math_t> x2,
                   dense_output_matrix_view_t<math_t> out,
                   math_t* norm_x1 = nullptr,
-                  math_t* norm_x2 = nullptr)
-  {
-    evaluate(handle, x1, x2, out, norm_x1, norm_x2);
-  }
+                  math_t* norm_x2 = nullptr);
 
   // unfortunately, 'evaluate' cannot be templatized as it needs to be virtual
 
@@ -137,10 +124,8 @@ class GramMatrixBase {
                         dense_input_matrix_view_t<math_t> x2,
                         dense_output_matrix_view_t<math_t> out,
                         math_t* norm_x1,
-                        math_t* norm_x2)
-  {
-    linear(handle, x1, x2, out);
-  }
+                        math_t* norm_x2);
+
   /** Evaluate the Gram matrix for two vector sets using simple dot product.
    *
    * @param [in] handle raft handle
@@ -155,10 +140,8 @@ class GramMatrixBase {
                         dense_input_matrix_view_t<math_t> x2,
                         dense_output_matrix_view_t<math_t> out,
                         math_t* norm_x1,
-                        math_t* norm_x2)
-  {
-    linear(handle, x1, x2, out);
-  }
+                        math_t* norm_x2);
+
   /** Evaluate the Gram matrix for two vector sets using simple dot product.
    *
    * @param [in] handle raft handle
@@ -173,10 +156,7 @@ class GramMatrixBase {
                         csr_input_matrix_view_t<math_t> x2,
                         dense_output_matrix_view_t<math_t> out,
                         math_t* norm_x1,
-                        math_t* norm_x2)
-  {
-    linear(handle, x1, x2, out);
-  }
+                        math_t* norm_x2);
 
   /** Evaluate the Gram matrix for two vector sets using simple dot product.
    *
@@ -203,10 +183,7 @@ class GramMatrixBase {
                                        cudaStream_t stream,
                                        int ld1,
                                        int ld2,
-                                       int ld_out)
-  {
-    linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
+                                       int ld_out);
 
   /** Convenience function to evaluate the Gram matrix for two vector sets.
    *
@@ -233,14 +210,7 @@ class GramMatrixBase {
                                  cudaStream_t stream,
                                  int ld1    = 0,
                                  int ld2    = 0,
-                                 int ld_out = 0)
-  {
-    ASSERT(legacy_interface, "Legacy interface can only be used with legacy ctor.");
-    if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; }
-    if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; }
-    if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; }
-    evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
+                                 int ld_out = 0);
 
  protected:
   /** Calculates the Gram matrix using simple dot product between vector sets.
@@ -272,67 +242,13 @@ class GramMatrixBase {
                              cudaStream_t stream,
                              int ld1,
                              int ld2,
-                             int ld_out)
-  {
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-    if (is_row_major) {
-      // #TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
-                                                       CUBLAS_OP_T,
-                                                       CUBLAS_OP_N,
-                                                       n2,
-                                                       n1,
-                                                       n_cols,
-                                                       &alpha,
-                                                       x2,
-                                                       ld2,
-                                                       x1,
-                                                       ld1,
-                                                       &beta,
-                                                       out,
-                                                       ld_out,
-                                                       stream));
-    } else {
-      // #TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
-                                                       CUBLAS_OP_N,
-                                                       CUBLAS_OP_T,
-                                                       n1,
-                                                       n2,
-                                                       n_cols,
-                                                       &alpha,
-                                                       x1,
-                                                       ld1,
-                                                       x2,
-                                                       ld2,
-                                                       &beta,
-                                                       out,
-                                                       ld_out,
-                                                       stream));
-    }
-  }
+                             int ld_out);
 
  protected:
-  bool get_is_row_major(dense_output_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(1) == 1);
-  }
-
-  bool get_is_row_major(dense_input_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(1) == 1);
-  }
-
-  bool get_is_col_major(dense_output_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(0) == 1);
-  }
-
-  bool get_is_col_major(dense_input_matrix_view_t<math_t> matrix)
-  {
-    return (matrix.stride(0) == 1);
-  }
+  bool get_is_row_major(dense_output_matrix_view_t<math_t> matrix);
+  bool get_is_row_major(dense_input_matrix_view_t<math_t> matrix);
+  bool get_is_col_major(dense_output_matrix_view_t<math_t> matrix);
+  bool get_is_col_major(dense_input_matrix_view_t<math_t> matrix);
 
   /** Calculates the Gram matrix using simple dot product between vector sets.
    *
@@ -348,67 +264,7 @@ class GramMatrixBase {
   void linear(raft::resources const& handle,
               dense_input_matrix_view_t<math_t> x1,
               dense_input_matrix_view_t<math_t> x2,
-              dense_output_matrix_view_t<math_t> out)
-  {
-    // check is_row_major consistency
-    bool is_row_major = get_is_row_major(x1) && get_is_row_major(x2) && get_is_row_major(out);
-    bool is_col_major = get_is_col_major(x1) && get_is_col_major(x2) && get_is_col_major(out);
-    ASSERT(is_row_major || is_col_major,
-           "GramMatrix leading dimensions for x1, x2 and out do not match");
-
-    // check dimensions
-    int n1     = out.extent(0);
-    int n2     = out.extent(1);
-    int n_cols = x1.extent(1);
-    ASSERT(x1.extent(0) == n1, "GramMatrix input matrix dimensions for x1 and out do not match");
-    ASSERT(x2.extent(0) == n2, "GramMatrix input matrix dimensions for x2 and out do not match");
-    ASSERT(x2.extent(1) == n_cols, "GramMatrix input matrix dimensions for x1 and x2 do not match");
-
-    // extract major stride
-    int ld1    = is_row_major ? x1.stride(0) : x1.stride(1);
-    int ld2    = is_row_major ? x2.stride(0) : x2.stride(1);
-    int ld_out = is_row_major ? out.stride(0) : out.stride(1);
-
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-    if (is_row_major) {
-      // #TODO: Use mdspan-based API when stride-capable
-      // https://github.com/rapidsai/raft/issues/875
-      raft::linalg::gemm(handle,
-                         true,
-                         false,
-                         n2,
-                         n1,
-                         n_cols,
-                         &alpha,
-                         x2.data_handle(),
-                         ld2,
-                         x1.data_handle(),
-                         ld1,
-                         &beta,
-                         out.data_handle(),
-                         ld_out,
-                         resource::get_cuda_stream(handle));
-    } else {
-      // #TODO: Use mdspan-based API when stride-capable
-      // https://github.com/rapidsai/raft/issues/875
-      raft::linalg::gemm(handle,
-                         false,
-                         true,
-                         n1,
-                         n2,
-                         n_cols,
-                         &alpha,
-                         x1.data_handle(),
-                         ld1,
-                         x2.data_handle(),
-                         ld2,
-                         &beta,
-                         out.data_handle(),
-                         ld_out,
-                         resource::get_cuda_stream(handle));
-    }
-  }
+              dense_output_matrix_view_t<math_t> out);
 
   /** Calculates the Gram matrix using simple dot product between vector sets.
    *
@@ -424,28 +280,7 @@ class GramMatrixBase {
   void linear(raft::resources const& handle,
               csr_input_matrix_view_t<math_t> x1,
               dense_input_matrix_view_t<math_t> x2,
-              dense_output_matrix_view_t<math_t> out)
-  {
-    // check is_row_major consistency
-    bool is_row_major = get_is_row_major(x2) && get_is_row_major(out);
-    bool is_col_major = get_is_col_major(x2) && get_is_col_major(out);
-    ASSERT(is_row_major || is_col_major,
-           "GramMatrix leading dimensions for x2 and out do not match");
-
-    // check dimensions
-    auto x1_structure = x1.structure_view();
-    ASSERT(x1_structure.get_n_rows() == out.extent(0),
-           "GramMatrix input matrix dimensions for x1 and out do not match");
-    ASSERT(x2.extent(0) == out.extent(1),
-           "GramMatrix input matrix dimensions for x2 and out do not match");
-    ASSERT(x2.extent(1) == x1_structure.get_n_cols(),
-           "GramMatrix input matrix dimensions for x1 and x2 do not match");
-
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-
-    raft::sparse::linalg::spmm(handle, false, true, &alpha, x1, x2, &beta, out);
-  }
+              dense_output_matrix_view_t<math_t> out);
 
   /** Calculates the Gram matrix using simple dot product between vector sets.
    *
@@ -461,28 +296,6 @@ class GramMatrixBase {
   void linear(raft::resources const& handle,
               csr_input_matrix_view_t<math_t> x1,
               csr_input_matrix_view_t<math_t> x2,
-              dense_output_matrix_view_t<math_t> out)
-  {
-    // check layout consistency (w.r.t. strides a matrix might be both row & col major)
-    bool is_row_major_nopad = get_is_row_major(out) && out.stride(0) == out.extent(1);
-    bool is_col_major_nopad = get_is_col_major(out) && out.stride(1) == out.extent(0);
-
-    ASSERT(is_row_major_nopad || is_col_major_nopad,
-           "Sparse linear Kernel distance does not support ld_out parameter");
-
-    // switch a,b based on is_row_major
-    if (is_col_major_nopad) {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(1), out.extent(0));
-      raft::sparse::distance::pairwise_distance(
-        handle, x2, x1, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    } else {
-      auto out_row_major = raft::make_device_matrix_view<math_t, int, raft::row_major>(
-        out.data_handle(), out.extent(0), out.extent(1));
-      raft::sparse::distance::pairwise_distance(
-        handle, x1, x2, out_row_major, cuvs::distance::DistanceType::InnerProduct, 0.0);
-    }
-  }
+              dense_output_matrix_view_t<math_t> out);
 };
-
 };  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cu b/cpp/src/distance/detail/kernels/kernel_factory.cu
new file mode 100644
index 000000000..2d9634de2
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_factory.cu
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "gram_matrix.hpp"
+#include "kernel_factory.hpp"
+
+namespace cuvs::distance::kernels::detail {
+
+/** Create the Gram matrix evaluator selected by params.kernel.
+ *
+ * @param params kernel type and its parameters (degree, gamma, coef0)
+ * @return pointer to an object allocated with `new`
+ * @throws raft::exception if params.kernel is not LINEAR, POLYNOMIAL, TANH or RBF
+ */
+template <typename math_t>
+GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(); break;
+    case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
+    case RBF: res = new RBFKernel<math_t>(gamma); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+/** Deprecated variant of create() that forwards a cuBLAS handle to the kernel constructor.
+ *
+ * @param params kernel type and its parameters (degree, gamma, coef0)
+ * @param handle cuBLAS handle passed to the constructor of the selected kernel
+ * @return pointer to an object allocated with `new`
+ * @throws raft::exception if params.kernel is not LINEAR, POLYNOMIAL, TANH or RBF
+ */
+template <typename math_t>
+[[deprecated]] GramMatrixBase<math_t>* KernelFactory<math_t>::create(KernelParams params,
+                                                                     cublasHandle_t handle)
+{
+  GramMatrixBase<math_t>* res;
+  // KernelParams is not templated, we convert the parameters to math_t here:
+  math_t coef0 = params.coef0;
+  math_t gamma = params.gamma;
+  switch (params.kernel) {
+    case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
+    case POLYNOMIAL:
+      res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
+      break;
+    case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
+    case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
+    default: throw raft::exception("Kernel not implemented");
+  }
+  return res;
+}
+
+// The member definitions above are only visible in this translation unit, so they must be
+// instantiated here. NOTE(review): float and double assumed to be the math_t types in use.
+template class KernelFactory<float>;
+template class KernelFactory<double>;
+
+};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.cuh b/cpp/src/distance/detail/kernels/kernel_factory.cuh
deleted file mode 100644
index 5c50a95a3..000000000
--- a/cpp/src/distance/detail/kernels/kernel_factory.cuh
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "gram_matrix.cuh"
-#include "kernel_matrices.cuh"
-
-#include <cuvs/distance/distance.hpp>
-#include <raft/util/cudart_utils.hpp>
-
-namespace cuvs::distance::kernels::detail {
-
-template <typename math_t>
-class KernelFactory {
- public:
-  static GramMatrixBase<math_t>* create(KernelParams params)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(); break;
-      case POLYNOMIAL: res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0); break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0); break;
-      case RBF: res = new RBFKernel<math_t>(gamma); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-
-  [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(handle); break;
-      case POLYNOMIAL:
-        res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, handle);
-        break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0, handle); break;
-      case RBF: res = new RBFKernel<math_t>(gamma, handle); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_factory.hpp b/cpp/src/distance/detail/kernels/kernel_factory.hpp
new file mode 100644
index 000000000..cc767fbd5
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_factory.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gram_matrix.hpp"
+#include "kernel_matrices.hpp"
+
+namespace cuvs::distance::kernels::detail {
+
+template <typename math_t>
+class KernelFactory {  // builds the GramMatrixBase implementation selected by KernelParams
+ public:  // definitions are in kernel_factory.cu; both overloads return a pointer from `new`
+  static GramMatrixBase<math_t>* create(KernelParams params);
+  [[deprecated]] static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t handle);
+};
+
+};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cu b/cpp/src/distance/detail/kernels/kernel_matrices.cu
new file mode 100644
index 000000000..2ecceb8ed
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_matrices.cu
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../../../distance/distance.cuh"
+#include "gram_matrix.hpp"
+#include "kernel_matrices.hpp"
+#include "rbf_fin_op.cuh"
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/norm.cuh>
+#include <raft/sparse/linalg/norm.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+namespace cuvs::distance::kernels::detail {
+
+/** Epilogue function for polynomial kernel without padding.
+ * Calculates output = (gain*in + offset)^exponent, element-wise and in place
+ * @param inout device buffer that is read and overwritten, size [len]
+ * @param len array length
+ * @param exponent power applied to each scaled and shifted element
+ * @param gain multiplier applied to each input element
+ * @param offset constant added after scaling
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel_nopad(
+  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = pow(gain * inout[tid] + offset, exponent);
+  }
+}
+
+/** Epilogue function for polynomial kernel with padding.
+ * Calculates output = (gain*input + offset)^exponent, element-wise and in place
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param exponent power applied to each scaled and shifted element
+ * @param gain multiplier applied to each input element
+ * @param offset constant added after scaling
+ */
+template <typename math_t, typename exp_t>
+RAFT_KERNEL polynomial_kernel(
+  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
+    }
+}
+
+/** Epilogue function for tanh kernel without padding.
+ * Calculates output = tanh(gain*input + offset), element-wise and in place
+ * @param inout device vector, size [len]
+ * @param len length of the input vector
+ * @param gain multiplier applied to each input element
+ * @param offset constant added after scaling
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
+{
+  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
+       tid += blockDim.x * gridDim.x) {
+    inout[tid] = tanh(gain * inout[tid] + offset);
+  }
+}
+
+/** Epilogue function for tanh kernel with padding.
+ * Calculates output = tanh(gain*input + offset), element-wise and in place
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param gain multiplier applied to each input element
+ * @param offset constant added after scaling
+ */
+template <typename math_t>
+RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y)
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
+    }
+}
+
+/** Epilogue function for rbf kernel using expansion.
+ *
+ * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij));
+ *
+ * Intended usage
+ *   - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk
+ *   - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X
+ *   - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y
+ *
+ * @param inout device vector in column major format, size [ld * cols]
+ * @param ld leading dimension of the inout buffer
+ * @param rows number of rows (rows <= ld)
+ * @param cols number of columns
+ * @param norm_x l2-norm of X's rows, read at indices [0, rows)
+ * @param norm_y l2-norm of Y's rows, read at indices [0, cols)
+ * @param gain scale factor applied inside the exponent
+ */
+template <typename math_t>
+RAFT_KERNEL rbf_kernel_expanded(
+  math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
+{
+  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
+       tidy += blockDim.y * gridDim.y) {
+    math_t norm_y_val = norm_y[tidy];
+    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
+         tidx += blockDim.x * gridDim.x) {
+      inout[tidx + tidy * ld] =
+        exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2));
+    }
+  }
+}
+
+std::tuple<dim3, dim3> generateLaunchConfig2dElementwiseOp(int n1, int n2)
+{
+  // Blocks of 32x4 threads; the number of blocks along y is capped at 65535.
+  constexpr int kMaxGridY = (1 << 16) - 1;
+  const int grid_x        = raft::ceildiv(n1, 32);
+  const int grid_y        = std::min(raft::ceildiv(n2, 32), kMaxGridY);
+  return std::make_tuple(dim3(grid_x, grid_y), dim3(32, 4));
+}
+
+/** Apply the polynomial epilogue (gain*x + offset)^exponent in place to a rows x cols matrix.
+ * An unpadded buffer (ld == minor extent) is processed as one flat array of rows*cols values. */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  const size_t len  = static_cast<size_t>(rows) * cols;  // widen first: int product may overflow
+  if (ld == n_minor) {
+    polynomial_kernel_nopad<<<raft::ceildiv<size_t>(len, 128), 128, 0, stream>>>(
+      inout, len, exponent, gain, offset);
+  } else {
+    int n1                         = is_row_major ? cols : rows;
+    int n2                         = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    polynomial_kernel<<<grid_shape, block_shape, 0, stream>>>(
+      inout, ld, n1, n2, exponent, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using polynomial kernel (dense x1, dense x2).
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for the epilogue launch
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               dense_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel (csr x1, dense x2).
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for the epilogue launch
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               dense_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using polynomial kernel (csr x1, csr x2).
+ *
+ * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for the epilogue launch
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t, typename exp_t>
+void PolynomialKernel<math_t, exp_t>::evaluate(raft::resources const& handle,
+                                               csr_input_matrix_view_t<math_t> x1,
+                                               csr_input_matrix_view_t<math_t> x2,
+                                               dense_output_matrix_view_t<math_t> out,
+                                               math_t* norm_x1,
+                                               math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the polynomial kernel Gram matrix using the legacy (raw pointer) interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param [in] ld1 leading dimension of x1 (usually it is n1)
+ * @param [in] ld2 leading dimension of x2 (usually it is n2)
+ * @param [in] ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t, typename exp_t>
+[[deprecated]] void PolynomialKernel<math_t, exp_t>::evaluate(const math_t* x1,
+                                                              int n1,
+                                                              int n_cols,
+                                                              const math_t* x2,
+                                                              int n2,
+                                                              math_t* out,
+                                                              bool is_row_major,
+                                                              cudaStream_t stream,
+                                                              int ld1,
+                                                              int ld2,
+                                                              int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/** Apply the tanh epilogue tanh(gain*x + offset) in place to a rows x cols matrix.
+ * An unpadded buffer (ld == minor extent) is processed as one flat array of rows*cols values. */
+template <typename math_t>
+void TanhKernel<math_t>::applyKernel(
+  math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
+{
+  const int n_minor = is_row_major ? cols : rows;
+  const size_t len  = static_cast<size_t>(rows) * cols;  // widen first: int product may overflow
+  if (ld == n_minor) {
+    tanh_kernel_nopad<<<raft::ceildiv<size_t>(len, 128), 128, 0, stream>>>(
+      inout, len, gain, offset);
+  } else {
+    int n1                         = is_row_major ? cols : rows;
+    int n2                         = is_row_major ? rows : cols;
+    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+    tanh_kernel<<<grid_shape, block_shape, 0, stream>>>(inout, ld, n1, n2, gain, offset);
+  }
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Evaluate kernel matrix using tanh kernel (dense x1, dense x2).
+ *
+ * output[i,k] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for the epilogue launch
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  dense_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel (csr x1, dense x2).
+ *
+ * output[i,k] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for the epilogue launch
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  dense_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate kernel matrix using tanh kernel (csr x1, csr x2).
+ *
+ * output[i,k] = tanh(gain*<x1_i, x2_k> + offset),
+ * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+ * in the x2 set, and < , > denotes dot product.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for the epilogue launch
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 unused.
+ * @param norm_x2 unused.
+ */
+template <typename math_t>
+void TanhKernel<math_t>::evaluate(raft::resources const& handle,
+                                  csr_input_matrix_view_t<math_t> x1,
+                                  csr_input_matrix_view_t<math_t> x2,
+                                  dense_output_matrix_view_t<math_t> out,
+                                  math_t* norm_x1,
+                                  math_t* norm_x2)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(),
+              ld_out,
+              out.extent(0),
+              out.extent(1),
+              is_row_major,
+              raft::resource::get_cuda_stream(handle));
+}
+
+/** Evaluate the tanh kernel Gram matrix using the legacy (raw pointer) interface.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param [in] ld1 leading dimension of x1 (usually it is n1)
+ * @param [in] ld2 leading dimension of x2 (usually it is n2)
+ * @param [in] ld_out leading dimension of out (usually it is n1)
+ */
+template <typename math_t>
+[[deprecated]] void TanhKernel<math_t>::evaluate(const math_t* x1,
+                                                 int n1,
+                                                 int n_cols,
+                                                 const math_t* x2,
+                                                 int n2,
+                                                 math_t* out,
+                                                 bool is_row_major,
+                                                 cudaStream_t stream,
+                                                 int ld1,
+                                                 int ld2,
+                                                 int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+  GramMatrixBase<math_t>::linear(
+    x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
+  applyKernel(out, ld_out, n1, n2, is_row_major, stream);
+}
+
+/** Apply the RBF epilogue exp(-gain * (norm_x1_i + norm_x2_k - 2 * inout_ik)) in place. */
+template <typename math_t>
+void RBFKernel<math_t>::applyKernel(math_t* inout,
+                                    int ld,
+                                    int rows,
+                                    int cols,
+                                    math_t* norm_x1,
+                                    math_t* norm_x2,
+                                    bool is_row_major,
+                                    cudaStream_t stream)
+{
+  // Row-major data is handled as its column-major transpose: swap extents and norm vectors.
+  int n1                         = is_row_major ? cols : rows;
+  int n2                         = is_row_major ? rows : cols;
+  math_t* norm_n1                = is_row_major ? norm_x2 : norm_x1;
+  math_t* norm_n2                = is_row_major ? norm_x1 : norm_x2;
+  auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
+  rbf_kernel_expanded<<<grid_shape, block_shape, 0, stream>>>(
+    inout, ld, n1, n2, norm_n1, norm_n2, gain);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** Row-wise norms (raft::linalg::NormType::L2Norm) of a dense matrix, written to
+ *  `target`. Defined as an RBFKernel member, like the CSR overload, so that the
+ *  unqualified matrixRowNormL2 calls in evaluate() bind to this definition. */
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        dense_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(matrix);
+  int minor         = is_row_major ? matrix.extent(1) : matrix.extent(0);
+  int ld            = is_row_major ? matrix.stride(0) : matrix.stride(1);
+  // Padded inputs are rejected: rowNorm below receives extents only, no stride.
+  ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter");
+  raft::linalg::rowNorm(target, matrix.data_handle(), matrix.extent(1), matrix.extent(0),
+                        raft::linalg::NormType::L2Norm, is_row_major,
+                        raft::resource::get_cuda_stream(handle));
+}
+
+/** Row-wise norms (NormType::L2Norm) of a CSR matrix, written to `target`. */
+template <typename math_t>
+void RBFKernel<math_t>::matrixRowNormL2(raft::resources const& handle,
+                                        csr_input_matrix_view_t<math_t> matrix,
+                                        math_t* target)
+{
+  // rowNormCsr consumes the raw CSR pieces: row offsets, values, nnz and row count.
+  auto csr         = matrix.structure_view();
+  auto row_offsets = csr.get_indptr().data();
+  auto values      = matrix.get_elements().data();
+  raft::sparse::linalg::rowNormCsr(
+    handle, row_offsets, values, csr.get_nnz(), csr.get_n_rows(), target,
+    raft::linalg::NormType::L2Norm);
+}
+
+/** RBF Gram matrix for two dense inputs.
+ *
+ * Computes out[i, k] = exp(-gain * |x1_i - x2_k|^2), with x1_i the i-th row of x1,
+ * x2_k the k-th row of x2 and | | the euclidean distance. The squared distance is
+ * expanded into row norms and dot products.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for all work issued here
+ * @param [in] x1 dense device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional norms of x1's rows; computed on the fly when nullptr
+ * @param norm_x2 optional norms of x2's rows; computed on the fly when nullptr
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 dense_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // Scratch buffers for norms the caller did not supply. They are only accessed
+  // through data(), so capacity is reserved on `stream` rather than resized.
+  rmm::device_uvector<math_t> x1_norm_buf(0, stream);
+  rmm::device_uvector<math_t> x2_norm_buf(0, stream);
+
+  if (!norm_x1) {
+    // one norm per row of x1
+    x1_norm_buf.reserve(x1.extent(0), stream);
+    norm_x1 = x1_norm_buf.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (!norm_x2) {
+    // one norm per row of x2
+    x2_norm_buf.reserve(x2.extent(0), stream);
+    norm_x2 = x2_norm_buf.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // Expanded L2: write the dot products into `out`, then convert them to RBF
+  // values in place with the help of the row norms.
+  const bool row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  const int out_ld     = row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(), out_ld, out.extent(0), out.extent(1),
+              norm_x1, norm_x2, row_major, stream);
+}
+
+/** RBF Gram matrix for a CSR left operand and a dense right operand.
+ *
+ * Computes out[i, k] = exp(-gain * |x1_i - x2_k|^2), with x1_i the i-th row of x1,
+ * x2_k the k-th row of x2 and | | the euclidean distance. The squared distance is
+ * expanded into row norms and dot products.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for all work issued here
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 dense device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional norms of x1's rows; computed on the fly when nullptr
+ * @param norm_x2 optional norms of x2's rows; computed on the fly when nullptr
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 dense_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // Scratch buffers for norms the caller did not supply. They are only accessed
+  // through data(), so capacity is reserved on `stream` rather than resized.
+  rmm::device_uvector<math_t> x1_norm_buf(0, stream);
+  rmm::device_uvector<math_t> x2_norm_buf(0, stream);
+
+  if (!norm_x1) {
+    // the CSR structure view supplies the row count of x1
+    const auto x1_rows = x1.structure_view().get_n_rows();
+    x1_norm_buf.reserve(x1_rows, stream);
+    norm_x1 = x1_norm_buf.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (!norm_x2) {
+    x2_norm_buf.reserve(x2.extent(0), stream);
+    norm_x2 = x2_norm_buf.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // Expanded L2: write the dot products into `out`, then convert them to RBF
+  // values in place with the help of the row norms.
+  const bool row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  const int out_ld     = row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(), out_ld, out.extent(0), out.extent(1),
+              norm_x1, norm_x2, row_major, stream);
+}
+
+
+/** RBF Gram matrix for two CSR inputs.
+ *
+ * Computes out[i, k] = exp(-gain * |x1_i - x2_k|^2), with x1_i the i-th row of x1,
+ * x2_k the k-th row of x2 and | | the euclidean distance. The squared distance is
+ * expanded into row norms and dot products.
+ *
+ * @param [in] handle raft handle; its CUDA stream is used for all work issued here
+ * @param [in] x1 csr device matrix view, size [n1*n_cols]
+ * @param [in] x2 csr device matrix view, size [n2*n_cols]
+ * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+ * @param norm_x1 optional norms of x1's rows; computed on the fly when nullptr
+ * @param norm_x2 optional norms of x2's rows; computed on the fly when nullptr
+ */
+template <typename math_t>
+void RBFKernel<math_t>::evaluate(raft::resources const& handle,
+                                 csr_input_matrix_view_t<math_t> x1,
+                                 csr_input_matrix_view_t<math_t> x2,
+                                 dense_output_matrix_view_t<math_t> out,
+                                 math_t* norm_x1,
+                                 math_t* norm_x2)
+{
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  // Scratch buffers for norms the caller did not supply. They are only accessed
+  // through data(), so capacity is reserved on `stream` rather than resized.
+  rmm::device_uvector<math_t> x1_norm_buf(0, stream);
+  rmm::device_uvector<math_t> x2_norm_buf(0, stream);
+  if (!norm_x1) {
+    const auto x1_rows = x1.structure_view().get_n_rows();
+    x1_norm_buf.reserve(x1_rows, stream);
+    norm_x1 = x1_norm_buf.data();
+    matrixRowNormL2(handle, x1, norm_x1);
+  }
+  if (!norm_x2) {
+    const auto x2_rows = x2.structure_view().get_n_rows();
+    x2_norm_buf.reserve(x2_rows, stream);
+    norm_x2 = x2_norm_buf.data();
+    matrixRowNormL2(handle, x2, norm_x2);
+  }
+
+  // Expanded L2: write the dot products into `out`, then convert them to RBF
+  // values in place with the help of the row norms.
+  const bool row_major = GramMatrixBase<math_t>::get_is_row_major(out);
+  const int out_ld     = row_major ? out.stride(0) : out.stride(1);
+  GramMatrixBase<math_t>::linear(handle, x1, x2, out);
+  applyKernel(out.data_handle(), out_ld, out.extent(0), out.extent(1),
+              norm_x1, norm_x2, row_major, stream);
+}
+
+
+
+/** Evaluate the RBF Gram matrix using the legacy (raw pointer) interface.
+ *
+ * This path calls cuvs::distance::distance with DistanceType::L2Unexpanded and an
+ * rbf_fin_op built from the gain. Padded buffers are rejected: each leading
+ * dimension must equal the minor extent of its matrix.
+ *
+ * @param [in] x1 device array of vectors, size [n1*n_cols]
+ * @param [in] n1 number vectors in x1
+ * @param [in] n_cols number of columns (features) in x1 and x2
+ * @param [in] x2 device array of vectors, size [n2*n_cols]
+ * @param [in] n2 number vectors in x2
+ * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+ * @param [in] is_row_major whether the input and output matrices are in row
+ *        major format
+ * @param [in] stream cuda stream
+ * @param ld1 leading dimension of x1; must be n_cols (row major) or n1 (col major)
+ * @param ld2 leading dimension of x2; must be n_cols (row major) or n2 (col major)
+ * @param ld_out leading dimension of out; must be n2 (row major) or n1 (col major)
+ */
+template <typename math_t>
+[[deprecated]] void RBFKernel<math_t>::evaluate(const math_t* x1, int n1, int n_cols,
+                                                const math_t* x2, int n2, math_t* out,
+                                                bool is_row_major, cudaStream_t stream,
+                                                int ld1, int ld2, int ld_out)
+{
+  ASSERT(GramMatrixBase<math_t>::legacy_interface,
+         "Legacy interface can only be used with legacy ctor.");
+
+  // Leading dimensions an unpadded buffer would have in the requested layout.
+  const int unpadded_ld1    = is_row_major ? n_cols : n1;
+  const int unpadded_ld2    = is_row_major ? n_cols : n2;
+  const int unpadded_ld_out = is_row_major ? n2 : n1;
+  ASSERT(ld1 == unpadded_ld1, "RBF Kernel distance does not support ld1 parameter");
+  ASSERT(ld2 == unpadded_ld2, "RBF Kernel distance does not support ld2 parameter");
+  ASSERT(ld_out == unpadded_ld_out, "RBF Kernel distance does not support ld_out parameter");
+
+  // distance() is driven through a raft::resources handle, so wrap the caller's
+  // stream in a temporary one.
+  raft::resources handle;
+  raft::resource::set_cuda_stream(handle, stream);
+
+  // Functor handed to distance() as its final op, built from this kernel's gain.
+  math_t rbf_gain = this->gain;
+  rbf_fin_op fin_op{rbf_gain};
+
+  // Results are written straight into `out`; no separate epilogue kernel is launched here.
+  using index_t         = int64_t;
+  using fin_op_t        = decltype(fin_op);
+  constexpr auto metric = cuvs::distance::DistanceType::L2Unexpanded;
+  cuvs::distance::distance<metric, math_t, math_t, math_t, fin_op_t, index_t>(
+    handle,
+    const_cast<math_t*>(x1),
+    const_cast<math_t*>(x2),
+    out,
+    n1,
+    n2,
+    n_cols,
+    nullptr,  // presumably the (unused) workspace pointer, with its size below
+    0,
+    fin_op,
+    is_row_major);
+}
+
+};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.cuh b/cpp/src/distance/detail/kernels/kernel_matrices.cuh
deleted file mode 100644
index bff5bda92..000000000
--- a/cpp/src/distance/detail/kernels/kernel_matrices.cuh
+++ /dev/null
@@ -1,777 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "gram_matrix.cuh"
-
-#include "../detail/kernels/rbf_fin_op.cuh"
-#include <cuvs/distance/distance.cuh>
-#include <raft/core/resource/cuda_stream.hpp>
-#include <raft/linalg/gemm.cuh>
-#include <raft/sparse/linalg/norm.cuh>
-#include <raft/util/cuda_utils.cuh>
-
-namespace cuvs::distance::kernels::detail {
-
-/** Epiloge function for polynomial kernel without padding.
- * Calculates output = (gain*in + offset)^exponent
- * @param inout device vector in column major format, size [len]
- * @param len array length
- * @param exponent
- * @param gain
- * @param offset
- */
-template <typename math_t, typename exp_t>
-RAFT_KERNEL polynomial_kernel_nopad(
-  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
-{
-  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
-       tid += blockDim.x * gridDim.x) {
-    inout[tid] = pow(gain * inout[tid] + offset, exponent);
-  }
-}
-
-/** Epiloge function for polynomial kernel with padding.
- * Calculates output = (gain*input + offset)^exponent
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of columns
- * @param exponent
- * @param gain
- * @param offset
- */
-template <typename math_t, typename exp_t>
-RAFT_KERNEL polynomial_kernel(
-  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y)
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
-    }
-}
-
-/** Epiloge function for tanh kernel without padding.
- * Calculates output = tanh(gain*input + offset)
- * @param inout device vector, size [len]
- * @param len length of the input vector
- * @param gain
- * @param offset
- */
-template <typename math_t>
-RAFT_KERNEL tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
-{
-  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
-       tid += blockDim.x * gridDim.x) {
-    inout[tid] = tanh(gain * inout[tid] + offset);
-  }
-}
-
-/** Epiloge function for tanh kernel without padding.
- * Calculates output = tanh(gain*input + offset)
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of columns
- * @param gain
- * @param offset
- */
-template <typename math_t>
-RAFT_KERNEL tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y)
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
-    }
-}
-
-/** Epiloge function for rbf kernel using expansion.
- *
- * Calculates output_ij = exp(-gain * (norm_x_i + norm_y_j - 2*input_ij));
- *
- * Intended usage
- *   - input is the product of two matrices X and Y input_ij = sum_k X_ik * Y_jk
- *   - norm_x_i = l2_norm(x_i), where x_i is the i-th row of matrix X
- *   - norm_y_j = l2_norm(y_j), where y_j is the j-th row of matrix Y
- *
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of columns
- * @param norm_x l2-norm of X's rows
- * @param norm_y l2-norm of Y's rows
- * @param gain
- */
-template <typename math_t>
-RAFT_KERNEL rbf_kernel_expanded(
-  math_t* inout, int ld, int rows, int cols, math_t* norm_x, math_t* norm_y, math_t gain)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y) {
-    math_t norm_y_val = norm_y[tidy];
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] =
-        exp(-1.0 * gain * (norm_x[tidx] + norm_y_val - inout[tidx + tidy * ld] * 2));
-    }
-  }
-}
-
-namespace {
-std::tuple<dim3, dim3> generateLaunchConfig2dElementwiseOp(int n1, int n2)
-{
-  dim3 block_shape       = dim3(32, 4);
-  const int num_blocks_x = raft::ceildiv(n1, 32);
-  const int num_blocks_y = std::min(raft::ceildiv(n2, 32), (1 << 16) - 1);
-  dim3 grid_shape        = dim3(num_blocks_x, num_blocks_y);
-  return std::make_tuple(grid_shape, block_shape);
-}
-}  // namespace
-
-/**
- * Create a kernel matrix using polynomial kernel function.
- */
-template <typename math_t, typename exp_t>
-class PolynomialKernel : public GramMatrixBase<math_t> {
-  exp_t exponent;
-  math_t gain;
-  math_t offset;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      polynomial_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, exponent, gain, offset);
-    } else {
-      int n1                         = is_row_major ? cols : rows;
-      int n2                         = is_row_major ? rows : cols;
-      auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
-      polynomial_kernel<<<grid_shape, block_shape, 0, stream>>>(
-        inout, ld, n1, n2, exponent, gain, offset);
-    }
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
- public:
-  /**
-   * Constructs a polynomial kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
-   *
-   * @tparam math_t floating point type
-   * @tparam exp_t type of exponent
-   * @param exponent
-   * @param gain
-   * @param offset
-   */
-  PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
-    : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset)
-  {
-  }
-
-  [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset)
-  {
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                dense_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                csr_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate the Gram matrix using the legacy interface.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] void evaluate(const math_t* x1,
-                               int n1,
-                               int n_cols,
-                               const math_t* x2,
-                               int n2,
-                               math_t* out,
-                               bool is_row_major,
-                               cudaStream_t stream,
-                               int ld1,
-                               int ld2,
-                               int ld_out)
-  {
-    ASSERT(GramMatrixBase<math_t>::legacy_interface,
-           "Legacy interface can only be used with legacy ctor.");
-    GramMatrixBase<math_t>::linear(
-      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
-  }
-};
-
-/**
- * Create a kernel matrix using tanh kernel function.
- */
-template <typename math_t>
-class TanhKernel : public GramMatrixBase<math_t> {
-  math_t gain, offset;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      tanh_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, gain, offset);
-    } else {
-      int n1                         = is_row_major ? cols : rows;
-      int n2                         = is_row_major ? rows : cols;
-      auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
-      tanh_kernel<<<grid_shape, block_shape, 0, stream>>>(inout, ld, n1, n2, gain, offset);
-    }
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
- public:
-  /**
-   * Constructs a tanh kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = tanh(gain*<x1_i, x2_k> + offset)
-   *
-   * @tparam math_t floating point type
-   * @param gain
-   * @param offset
-   */
-  TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
-
-  [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), gain(gain), offset(offset)
-  {
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                dense_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 unused.
-   * @param norm_x2 unused.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                csr_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate the Gram matrix using the legacy interface.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] void evaluate(const math_t* x1,
-                               int n1,
-                               int n_cols,
-                               const math_t* x2,
-                               int n2,
-                               math_t* out,
-                               bool is_row_major,
-                               cudaStream_t stream,
-                               int ld1,
-                               int ld2,
-                               int ld_out)
-  {
-    ASSERT(GramMatrixBase<math_t>::legacy_interface,
-           "Legacy interface can only be used with legacy ctor.");
-    GramMatrixBase<math_t>::linear(
-      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
-  }
-};
-
-/**
- * Create a kernel matrix using RBF kernel function.
- */
-template <typename math_t>
-class RBFKernel : public GramMatrixBase<math_t> {
-  math_t gain;
-
-  void applyKernel(math_t* inout,
-                   int ld,
-                   int rows,
-                   int cols,
-                   math_t* norm_x1,
-                   math_t* norm_x2,
-                   bool is_row_major,
-                   cudaStream_t stream)
-  {
-    int n1                         = is_row_major ? cols : rows;
-    int n2                         = is_row_major ? rows : cols;
-    math_t* norm_n1                = is_row_major ? norm_x2 : norm_x1;
-    math_t* norm_n2                = is_row_major ? norm_x1 : norm_x2;
-    auto [grid_shape, block_shape] = generateLaunchConfig2dElementwiseOp(n1, n2);
-    rbf_kernel_expanded<<<grid_shape, block_shape, 0, stream>>>(
-      inout, ld, n1, n2, norm_n1, norm_n2, gain);
-  }
-
- public:
-  /**
-   * Constructs a RBF kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = exp(-gain*|x1_i- x2_k|^2)
-   *
-   * @tparam math_t floating point type
-   * @param gain
-   */
-  RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain) {}
-
-  [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
-    : GramMatrixBase<math_t>(handle), gain(gain)
-  {
-  }
-
-  void matrixRowNormL2(raft::resources const& handle,
-                       dense_input_matrix_view_t<math_t> matrix,
-                       math_t* target)
-  {
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(matrix);
-    int minor         = is_row_major ? matrix.extent(1) : matrix.extent(0);
-    int ld            = is_row_major ? matrix.stride(0) : matrix.stride(1);
-    ASSERT(ld == minor, "RBF Kernel lazy rowNorm compute does not support ld parameter");
-    raft::linalg::rowNorm(target,
-                          matrix.data_handle(),
-                          matrix.extent(1),
-                          matrix.extent(0),
-                          raft::linalg::NormType::L2Norm,
-                          is_row_major,
-                          resource::get_cuda_stream(handle));
-  }
-
-  void matrixRowNormL2(raft::resources const& handle,
-                       csr_input_matrix_view_t<math_t> matrix,
-                       math_t* target)
-  {
-    auto matrix_structure = matrix.structure_view();
-    raft::sparse::linalg::rowNormCsr(handle,
-                                     matrix_structure.get_indptr().data(),
-                                     matrix.get_elements().data(),
-                                     matrix_structure.get_nnz(),
-                                     matrix_structure.get_n_rows(),
-                                     target,
-                                     raft::linalg::NormType::L2Norm);
-  }
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 dense device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void evaluate(raft::resources const& handle,
-                dense_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    cudaStream_t stream = resource::get_cuda_stream(handle);
-    // lazy compute norms if not given
-    rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
-    rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
-    if (norm_x1 == nullptr) {
-      tmp_norm_x1.reserve(x1.extent(0), stream);
-      norm_x1 = tmp_norm_x1.data();
-      matrixRowNormL2(handle, x1, norm_x1);
-    }
-    if (norm_x2 == nullptr) {
-      tmp_norm_x2.reserve(x2.extent(0), stream);
-      norm_x2 = tmp_norm_x2.data();
-      matrixRowNormL2(handle, x2, norm_x2);
-    }
-
-    // compute L2expanded
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                norm_x1,
-                norm_x2,
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 dense device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                dense_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    cudaStream_t stream = resource::get_cuda_stream(handle);
-
-    // lazy compute norms if not given
-    rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
-    rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
-    if (norm_x1 == nullptr) {
-      tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
-      norm_x1 = tmp_norm_x1.data();
-      matrixRowNormL2(handle, x1, norm_x1);
-    }
-    if (norm_x2 == nullptr) {
-      tmp_norm_x2.reserve(x2.extent(0), stream);
-      norm_x2 = tmp_norm_x2.data();
-      matrixRowNormL2(handle, x2, norm_x2);
-    }
-
-    // compute L2expanded
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                norm_x1,
-                norm_x2,
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] handle raft handle
-   * @param [in] x1 csr device matrix view, size [n1*n_cols]
-   * @param [in] x2 csr device matrix view, size [n2*n_cols]
-   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
-   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
-   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
-   */
-  void evaluate(raft::resources const& handle,
-                csr_input_matrix_view_t<math_t> x1,
-                csr_input_matrix_view_t<math_t> x2,
-                dense_output_matrix_view_t<math_t> out,
-                math_t* norm_x1,
-                math_t* norm_x2)
-  {
-    cudaStream_t stream = resource::get_cuda_stream(handle);
-
-    // lazy compute norms if not given
-    rmm::device_uvector<math_t> tmp_norm_x1(0, stream);
-    rmm::device_uvector<math_t> tmp_norm_x2(0, stream);
-    if (norm_x1 == nullptr) {
-      tmp_norm_x1.reserve(x1.structure_view().get_n_rows(), stream);
-      norm_x1 = tmp_norm_x1.data();
-      matrixRowNormL2(handle, x1, norm_x1);
-    }
-    if (norm_x2 == nullptr) {
-      tmp_norm_x2.reserve(x2.structure_view().get_n_rows(), stream);
-      norm_x2 = tmp_norm_x2.data();
-      matrixRowNormL2(handle, x2, norm_x2);
-    }
-
-    // compute L2expanded
-    bool is_row_major = GramMatrixBase<math_t>::get_is_row_major(out);
-    int ld_out        = is_row_major ? out.stride(0) : out.stride(1);
-    GramMatrixBase<math_t>::linear(handle, x1, x2, out);
-    applyKernel(out.data_handle(),
-                ld_out,
-                out.extent(0),
-                out.extent(1),
-                norm_x1,
-                norm_x2,
-                is_row_major,
-                resource::get_cuda_stream(handle));
-  }
-
-  /** Evaluate the Gram matrix using the legacy interface.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  [[deprecated]] void evaluate(const math_t* x1,
-                               int n1,
-                               int n_cols,
-                               const math_t* x2,
-                               int n2,
-                               math_t* out,
-                               bool is_row_major,
-                               cudaStream_t stream,
-                               int ld1,
-                               int ld2,
-                               int ld_out)
-  {
-    ASSERT(GramMatrixBase<math_t>::legacy_interface,
-           "Legacy interface can only be used with legacy ctor.");
-    int minor1    = is_row_major ? n_cols : n1;
-    int minor2    = is_row_major ? n_cols : n2;
-    int minor_out = is_row_major ? n2 : n1;
-    ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
-    ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
-    ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
-
-    math_t gain   = this->gain;
-    using index_t = int64_t;
-
-    rbf_fin_op fin_op{gain};
-
-    raft::resources handle;
-    resource::set_cuda_stream(handle, stream);
-
-    cuvs::distance::distance<cuvs::distance::DistanceType::L2Unexpanded,
-                             math_t,
-                             math_t,
-                             math_t,
-                             decltype(fin_op),
-                             index_t>(handle,
-                                      const_cast<math_t*>(x1),
-                                      const_cast<math_t*>(x2),
-                                      out,
-                                      n1,
-                                      n2,
-                                      n_cols,
-                                      NULL,
-                                      0,
-                                      fin_op,
-                                      is_row_major);
-  }
-};
-
-};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/detail/kernels/kernel_matrices.hpp b/cpp/src/distance/detail/kernels/kernel_matrices.hpp
new file mode 100644
index 000000000..d675f3ceb
--- /dev/null
+++ b/cpp/src/distance/detail/kernels/kernel_matrices.hpp
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gram_matrix.hpp"
+#include <raft/core/resource/cuda_stream.hpp>
+
+namespace cuvs::distance::kernels::detail {
+
+/**
+ * Create a kernel matrix using polynomial kernel function.
+ */
+template <typename math_t, typename exp_t>
+class PolynomialKernel : public GramMatrixBase<math_t> {
+  exp_t exponent;
+  math_t gain;
+  math_t offset;
+
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+  /**
+   * Constructs a polynomial kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ik = (gain*<x1_i, x2_k> + offset)^exponent
+   *
+   * @tparam math_t floating point type
+   * @tparam exp_t type of exponent
+   * @param exponent power the scaled and shifted dot product is raised to
+   * @param gain multiplier applied to the dot product
+   * @param offset constant added to the scaled dot product
+   */
+  PolynomialKernel(exp_t exponent, math_t gain, math_t offset)
+    : GramMatrixBase<math_t>(), exponent(exponent), gain(gain), offset(offset) {}
+
+  [[deprecated]] PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t handle)
+    : GramMatrixBase<math_t>(handle), exponent(exponent), gain(gain), offset(offset) {}
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                dense_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using polynomial kernel.
+   *
+   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                csr_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate the Gram matrix using the legacy interface.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number of vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number of vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] void evaluate(const math_t* x1,
+                               int n1,
+                               int n_cols,
+                               const math_t* x2,
+                               int n2,
+                               math_t* out,
+                               bool is_row_major,
+                               cudaStream_t stream,
+                               int ld1,
+                               int ld2,
+                               int ld_out);
+};
+
+/**
+ * Create a kernel matrix using tanh kernel function.
+ */
+template <typename math_t>
+class TanhKernel : public GramMatrixBase<math_t> {
+  math_t gain, offset;
+
+  void applyKernel(
+    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream);
+
+ public:
+  /**
+   * Constructs a tanh kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ik = tanh(gain*<x1_i, x2_k> + offset)
+   *
+   * @tparam math_t floating point type
+   * @param gain multiplier applied to the dot product
+   * @param offset constant added to the scaled dot product before tanh
+   */
+  TanhKernel(math_t gain, math_t offset) : GramMatrixBase<math_t>(), gain(gain), offset(offset) {}
+
+  [[deprecated]] TanhKernel(math_t gain, math_t offset, cublasHandle_t handle)
+    : GramMatrixBase<math_t>(handle), gain(gain), offset(offset) {}
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output[i,k] = tanh(gain*<x1_i, x2_k> + offset),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                dense_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output[i,k] = tanh(gain*<x1_i, x2_k> + offset),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using tanh kernel.
+   *
+   * output[i,k] = tanh(gain*<x1_i, x2_k> + offset),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and < , > denotes dot product.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 unused.
+   * @param norm_x2 unused.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                csr_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate the Gram matrix using the legacy interface.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number of vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number of vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] void evaluate(const math_t* x1,
+                               int n1,
+                               int n_cols,
+                               const math_t* x2,
+                               int n2,
+                               math_t* out,
+                               bool is_row_major,
+                               cudaStream_t stream,
+                               int ld1,
+                               int ld2,
+                               int ld_out);
+};
+
+/**
+ * Create a kernel matrix using RBF kernel function.
+ */
+template <typename math_t>
+class RBFKernel : public GramMatrixBase<math_t> {
+  math_t gain;
+
+  void applyKernel(math_t* inout,
+                   int ld,
+                   int rows,
+                   int cols,
+                   math_t* norm_x1,
+                   math_t* norm_x2,
+                   bool is_row_major,
+                   cudaStream_t stream);
+
+ public:
+  /**
+   * Constructs a RBF kernel object.
+   * It evaluates the kernel matrix using the following formula:
+   * K_ik = exp(-gain*|x1_i - x2_k|^2)
+   *
+   * @tparam math_t floating point type
+   * @param gain scale applied to the squared euclidean distance in the exponent
+   */
+  RBFKernel(math_t gain) : GramMatrixBase<math_t>(), gain(gain) {}
+
+  [[deprecated]] RBFKernel(math_t gain, cublasHandle_t handle)
+    : GramMatrixBase<math_t>(handle), gain(gain) {}
+
+  void matrixRowNormL2(raft::resources const& handle,
+                       dense_input_matrix_view_t<math_t> matrix,
+                       math_t* target);
+
+  void matrixRowNormL2(raft::resources const& handle,
+                       csr_input_matrix_view_t<math_t> matrix,
+                       math_t* target);
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output[i,k] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | denotes the euclidean distance.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 dense device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void evaluate(raft::resources const& handle,
+                dense_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output[i,k] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | denotes the euclidean distance.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 dense device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                dense_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate kernel matrix using RBF kernel.
+   *
+   * output[i,k] = exp(-gain*|x1_i - x2_k|^2),
+   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
+   * in the x2 set, and | | denotes the euclidean distance.
+   *
+   * @param [in] handle raft handle
+   * @param [in] x1 csr device matrix view, size [n1*n_cols]
+   * @param [in] x2 csr device matrix view, size [n2*n_cols]
+   * @param [out] out dense device matrix view for the Gram matrix, size [n1*n2]
+   * @param norm_x1 optional L2-norm of x1's rows for computation within RBF.
+   * @param norm_x2 optional L2-norm of x2's rows for computation within RBF.
+   */
+  void evaluate(raft::resources const& handle,
+                csr_input_matrix_view_t<math_t> x1,
+                csr_input_matrix_view_t<math_t> x2,
+                dense_output_matrix_view_t<math_t> out,
+                math_t* norm_x1,
+                math_t* norm_x2);
+
+  /** Evaluate the Gram matrix using the legacy interface.
+   *
+   * @param [in] x1 device array of vectors, size [n1*n_cols]
+   * @param [in] n1 number of vectors in x1
+   * @param [in] n_cols number of columns (features) in x1 and x2
+   * @param [in] x2 device array of vectors, size [n2*n_cols]
+   * @param [in] n2 number of vectors in x2
+   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
+   * @param [in] is_row_major whether the input and output matrices are in row
+   *        major format
+   * @param [in] stream cuda stream
+   * @param ld1 leading dimension of x1 (usually it is n1)
+   * @param ld2 leading dimension of x2 (usually it is n2)
+   * @param ld_out leading dimension of out (usually it is n1)
+   */
+  [[deprecated]] void evaluate(const math_t* x1,
+                               int n1,
+                               int n_cols,
+                               const math_t* x2,
+                               int n2,
+                               math_t* out,
+                               bool is_row_major,
+                               cudaStream_t stream,
+                               int ld1,
+                               int ld2,
+                               int ld_out);
+};
+
+};  // end namespace cuvs::distance::kernels::detail
diff --git a/cpp/src/distance/kernels.cuh b/cpp/src/distance/kernels.cuh
index 5983f1692..ca73eadc0 100644
--- a/cpp/src/distance/kernels.cuh
+++ b/cpp/src/distance/kernels.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "detail/kernels/gram_matrix.cuh"
-#include "detail/kernels/kernel_factory.cuh"
+#include "detail/kernels/gram_matrix.hpp"
+#include "detail/kernels/kernel_factory.hpp"
 
 namespace cuvs::distance::kernels {