Merge branch 'branch-25.02' into account-for-raft-update

rapidsai · Jan 20, 2025 · b0c7ab9 · b0c7ab9
2 parents 45a41fa + bd603a9
commit b0c7ab9
Show file tree

Hide file tree

Showing 21 changed files with 253 additions and 251 deletions.
diff --git a/README.md b/README.md
@@ -67,7 +67,7 @@ There are several benefits to using cuVS and GPUs for vector search, including
 6. Multiple language support
 7. Building blocks for composing new or accelerating existing algorithms
 
-In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale. 
+In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale.
 
 ## cuVS Technology Stack
 

diff --git a/ci/run_cuvs_pytests.sh b/ci/run_cuvs_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuvs/cuvs
 
-pytest --cache-clear --verbose "$@" tests
+pytest --cache-clear --verbose "$@" test
diff --git a/cpp/include/cuvs/core/detail/interop.hpp b/cpp/include/cuvs/core/detail/interop.hpp
@@ -86,7 +86,6 @@ inline MdspanType from_dlpack(DLManagedTensor* managed_tensor)
   RAFT_EXPECTS(to_data_type.lanes == tensor.dtype.lanes,
                "lanes mismatch between return mdspan and DLTensor");
   RAFT_EXPECTS(tensor.dtype.lanes == 1, "More than 1 DLTensor lanes not supported");
-  RAFT_EXPECTS(tensor.strides == nullptr, "Strided memory layout for DLTensor not supported");
 
   auto to_device = accessor_type_to_DLDevice<typename MdspanType::accessor_type>();
   if (to_device.device_type == kDLCUDA) {
@@ -110,4 +109,36 @@ inline MdspanType from_dlpack(DLManagedTensor* managed_tensor)
   return MdspanType{reinterpret_cast<typename MdspanType::data_handle_type>(tensor.data), exts};
 }
 
+inline bool is_f_contiguous(DLManagedTensor* managed_tensor)
+{
+  auto tensor = managed_tensor->dl_tensor;
+
+  if (!tensor.strides) { return false; }
+  int64_t expected_stride = 1;
+  for (int64_t i = 0; i < tensor.ndim; ++i) {
+    if (tensor.strides[i] != expected_stride) { return false; }
+    expected_stride *= tensor.shape[i];
+  }
+
+  return true;
+}
+
+inline bool is_c_contiguous(DLManagedTensor* managed_tensor)
+{
+  auto tensor = managed_tensor->dl_tensor;
+
+  if (!tensor.strides) {
+    // no stride information indicates a row-major tensor according to the dlpack spec
+    return true;
+  }
+
+  int64_t expected_stride = 1;
+  for (int64_t i = tensor.ndim - 1; i >= 0; --i) {
+    if (tensor.strides[i] != expected_stride) { return false; }
+    expected_stride *= tensor.shape[i];
+  }
+
+  return true;
+}
+
 }  // namespace cuvs::core::detail
diff --git a/cpp/include/cuvs/core/interop.hpp b/cpp/include/cuvs/core/interop.hpp
@@ -51,9 +51,25 @@ inline bool is_dlpack_host_compatible(DLTensor tensor)
   return detail::is_dlpack_host_compatible(tensor);
 }
 
+/**
+ * @brief Check if DLManagedTensor has a row-major (c-contiguous) layout
+ *
+ * @param tensor DLManagedTensor object to check
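+ * @return true if the tensor has no stride information or its strides describe a compact
+ *   row-major layout, false otherwise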
+ */
+inline bool is_c_contiguous(DLManagedTensor* tensor) { return detail::is_c_contiguous(tensor); }
+
+/**
+ * @brief Check if DLManagedTensor has a col-major (f-contiguous) layout
+ *
+ * @param tensor DLManagedTensor object to check
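+ * @return true if the tensor's strides describe a compact col-major layout, false otherwise
+ *   (including when the tensor carries no stride information)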
+ */
+inline bool is_f_contiguous(DLManagedTensor* tensor) { return detail::is_f_contiguous(tensor); }
+
 /**
  * @brief Convert a DLManagedTensor to an mdspan
- * NOTE: This function only supports compact row-major layouts.
+ * NOTE: This function only supports compact row-major and col-major layouts.
  *
  * @code {.cpp}
  * #include <raft/core/device_mdspan.hpp>

diff --git a/cpp/include/cuvs/neighbors/refine.hpp b/cpp/include/cuvs/neighbors/refine.hpp
@@ -76,6 +76,51 @@ void refine(raft::resources const& handle,
             raft::device_matrix_view<float, int64_t, raft::row_major> distances,
             cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded);
 
+/**
+ * @brief Refine nearest neighbor search.
+ *
+ * Refinement is an operation that follows an approximate NN search. The approximate search has
+ * already selected n_candidates neighbor candidates for each query. We narrow it down to k
+ * neighbors. For each query, we calculate the exact distance between the query and its
+ * n_candidates neighbor candidates, and select the k nearest ones.
+ *
+ * The k nearest neighbors and distances are returned.
+ *
+ * Example usage
+ * @code{.cpp}
+ *   using namespace cuvs::neighbors;
+ *   // use default index parameters
+ *   ivf_pq::index_params index_params;
+ *   // create and fill the index from a [N, D] dataset
+ *   auto index = ivf_pq::build(handle, index_params, dataset);
+ *   // use default search parameters
+ *   ivf_pq::search_params search_params;
+ *   // search m = 4 * k nearest neighbours for each of the N queries
+ *   ivf_pq::search(handle, search_params, index, queries, neighbor_candidates,
+ *                  out_dists_tmp);
+ *   // refine it to the k nearest ones
+ *   refine(handle, dataset, queries, neighbor_candidates, out_indices, out_dists,
+ *           index.metric());
+ * @endcode
+ *
+ *
+ * @param[in] handle the raft handle
+ * @param[in] dataset device matrix that stores the dataset [n_rows, dims]
+ * @param[in] queries device matrix of the queries [n_queries, dims]
+ * @param[in] neighbor_candidates indices of candidate vectors [n_queries, n_candidates], where
+ *   n_candidates >= k
+ * @param[out] indices device matrix that stores the refined indices [n_queries, k]
+ * @param[out] distances device matrix that stores the refined distances [n_queries, k]
+ * @param[in] metric distance metric to use. Euclidean (L2) is used by default
+ */
+void refine(raft::resources const& handle,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
+            raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
+            raft::device_matrix_view<const uint32_t, int64_t, raft::row_major> neighbor_candidates,
+            raft::device_matrix_view<uint32_t, int64_t, raft::row_major> indices,
+            raft::device_matrix_view<float, int64_t, raft::row_major> distances,
+            cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded);
+
 /**
  * @brief Refine nearest neighbor search.
  *

diff --git a/cpp/src/distance/pairwise_distance_c.cpp b/cpp/src/distance/pairwise_distance_c.cpp
@@ -29,7 +29,7 @@
 
 namespace {
 
-template <typename T, typename DistT>
+template <typename T, typename DistT, typename LayoutT = raft::row_major>
 void _pairwise_distance(cuvsResources_t res,
                         DLManagedTensor* x_tensor,
                         DLManagedTensor* y_tensor,
@@ -39,8 +39,8 @@ void _pairwise_distance(cuvsResources_t res,
 {
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
 
-  using mdspan_type           = raft::device_matrix_view<T const, int64_t, raft::row_major>;
-  using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, raft::row_major>;
+  using mdspan_type           = raft::device_matrix_view<T const, int64_t, LayoutT>;
+  using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, LayoutT>;
 
   auto x_mds         = cuvs::core::from_dlpack<mdspan_type>(x_tensor);
   auto y_mds         = cuvs::core::from_dlpack<mdspan_type>(y_tensor);
@@ -70,17 +70,64 @@ extern "C" cuvsError_t cuvsPairwiseDistance(cuvsResources_t res,
       RAFT_FAIL("Inputs to cuvsPairwiseDistance must all have the same dtype");
     }
 
-    if (x_dt.bits == 32) {
-      _pairwise_distance<float, float>(
-        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
-    } else if (x_dt.bits == 16) {
-      _pairwise_distance<half, float>(
-        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
-    } else if (x_dt.bits == 64) {
-      _pairwise_distance<double, double>(
-        res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+    bool x_row_major;
+    if (cuvs::core::is_c_contiguous(x_tensor)) {
+      x_row_major = true;
+    } else if (cuvs::core::is_f_contiguous(x_tensor)) {
+      x_row_major = false;
     } else {
-      RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
+      RAFT_FAIL("X input to cuvsPairwiseDistance must be contiguous (non-strided)");
+    }
+
+    bool y_row_major;
+    if (cuvs::core::is_c_contiguous(y_tensor)) {
+      y_row_major = true;
+    } else if (cuvs::core::is_f_contiguous(y_tensor)) {
+      y_row_major = false;
+    } else {
+      RAFT_FAIL("Y input to cuvsPairwiseDistance must be contiguous (non-strided)");
+    }
+
+    bool distances_row_major;
+    if (cuvs::core::is_c_contiguous(distances_tensor)) {
+      distances_row_major = true;
+    } else if (cuvs::core::is_f_contiguous(distances_tensor)) {
+      distances_row_major = false;
+    } else {
+      RAFT_FAIL("distances input to cuvsPairwiseDistance must be contiguous (non-strided)");
+    }
+
+    if ((x_row_major != y_row_major) || (x_row_major != distances_row_major)) {
+      RAFT_FAIL(
+        "Inputs to cuvsPairwiseDistance must all have the same layout (row-major or col-major)");
+    }
+
+    if (x_row_major) {
+      if (x_dt.bits == 32) {
+        _pairwise_distance<float, float>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 16) {
+        _pairwise_distance<half, float>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 64) {
+        _pairwise_distance<double, double>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else {
+        RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
+      }
+    } else {
+      if (x_dt.bits == 32) {
+        _pairwise_distance<float, float, raft::col_major>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 16) {
+        _pairwise_distance<half, float, raft::col_major>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else if (x_dt.bits == 64) {
+        _pairwise_distance<double, double, raft::col_major>(
+          res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
+      } else {
+        RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
+      }
     }
   });
 }
diff --git a/cpp/src/neighbors/brute_force_c.cpp b/cpp/src/neighbors/brute_force_c.cpp
@@ -33,15 +33,15 @@
 
 namespace {
 
-template <typename T>
+template <typename T, typename LayoutT = raft::row_major>
 void* _build(cuvsResources_t res,
              DLManagedTensor* dataset_tensor,
              cuvsDistanceType metric,
              T metric_arg)
 {
   auto res_ptr = reinterpret_cast<raft::resources*>(res);
 
-  using mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
+  using mdspan_type = raft::device_matrix_view<T const, int64_t, LayoutT>;
   auto mds          = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);
 
   cuvs::neighbors::brute_force::index_params params;
@@ -53,7 +53,7 @@ void* _build(cuvsResources_t res,
   return index_on_heap;
 }
 
-template <typename T>
+template <typename T, typename QueriesLayoutT = raft::row_major>
 void _search(cuvsResources_t res,
              cuvsBruteForceIndex index,
              DLManagedTensor* queries_tensor,
@@ -64,7 +64,7 @@ void _search(cuvsResources_t res,
   auto res_ptr   = reinterpret_cast<raft::resources*>(res);
   auto index_ptr = reinterpret_cast<cuvs::neighbors::brute_force::index<T>*>(index.addr);
 
-  using queries_mdspan_type   = raft::device_matrix_view<T const, int64_t, raft::row_major>;
+  using queries_mdspan_type   = raft::device_matrix_view<T const, int64_t, QueriesLayoutT>;
   using neighbors_mdspan_type = raft::device_matrix_view<int64_t, int64_t, raft::row_major>;
   using distances_mdspan_type = raft::device_matrix_view<float, int64_t, raft::row_major>;
   using prefilter_mds_type    = raft::device_vector_view<const uint32_t, int64_t>;
@@ -150,8 +150,15 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res,
     auto dataset = dataset_tensor->dl_tensor;
 
     if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
-      index->addr =
-        reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
+      if (cuvs::core::is_c_contiguous(dataset_tensor)) {
+        index->addr =
+          reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
+      } else if (cuvs::core::is_f_contiguous(dataset_tensor)) {
+        index->addr = reinterpret_cast<uintptr_t>(
+          _build<float, raft::col_major>(res, dataset_tensor, metric, metric_arg));
+      } else {
+        RAFT_FAIL("dataset input to cuvsBruteForceBuild must be contiguous (non-strided)");
+      }
       index->dtype = dataset.dtype;
     } else {
       RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
@@ -189,7 +196,14 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res,
     RAFT_EXPECTS(queries.dtype.code == index.dtype.code, "type mismatch between index and queries");
 
     if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
-      _search<float>(res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
+      if (cuvs::core::is_c_contiguous(queries_tensor)) {
+        _search<float>(res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
+      } else if (cuvs::core::is_f_contiguous(queries_tensor)) {
+        _search<float, raft::col_major>(
+          res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
+      } else {
+        RAFT_FAIL("queries input to cuvsBruteForceSearch must be contiguous (non-strided)");
+      }
     } else {
       RAFT_FAIL("Unsupported queries DLtensor dtype: %d and bits: %d",
                 queries.dtype.code,

diff --git a/cpp/src/neighbors/detail/dynamic_batching.cuh b/cpp/src/neighbors/detail/dynamic_batching.cuh
@@ -238,8 +238,8 @@ enum struct slot_state : int32_t {
 struct batch_token {
   uint64_t value = 0;
 
-  constexpr inline batch_token() {}
-  explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; }
+  constexpr inline batch_token() = default;
+  RAFT_INLINE_FUNCTION explicit batch_token(uint32_t buffer_id) { id() = buffer_id; }
 
   /**
    * Sequential id of the batch in the array of batches.
@@ -492,7 +492,7 @@ struct batch_queue_t {
    * NB: "round" is the number of times the queue counters went over the whole ring buffer.
    *     It's used to avoid the ABA problem for atomic token updates.
    */
-  static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
+  static inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
   {
     // Modify the seq_id to identify that the token slot is empty
     auto empty_round    = static_cast<uint32_t>(slot_state::kEmptyPast) * kSize;

diff --git a/cpp/src/neighbors/ivf_flat_index.cpp b/cpp/src/neighbors/ivf_flat_index.cpp
@@ -226,6 +226,7 @@ void index<T, IdxT>::check_consistency()
     "inconsistent number of lists (clusters)");
 }
 
+template struct index<float, uint32_t>;  // Used for refine function
 template struct index<float, int64_t>;
 template struct index<half, int64_t>;
 template struct index<int8_t, int64_t>;

diff --git a/cpp/src/neighbors/refine/detail/refine_device_float_float.cu b/cpp/src/neighbors/refine/detail/refine_device_float_float.cu
@@ -43,5 +43,6 @@
   }
 
 instantiate_cuvs_neighbors_refine_d(int64_t, float, float, int64_t);
+instantiate_cuvs_neighbors_refine_d(uint32_t, float, float, int64_t);
 
 #undef instantiate_cuvs_neighbors_refine_d
diff --git a/cpp/src/neighbors/refine/refine_device.cuh b/cpp/src/neighbors/refine/refine_device.cuh
@@ -84,12 +84,13 @@ void refine_device(
   cuvs::neighbors::ivf_flat::index<data_t, idx_t> refinement_index(
     handle, cuvs::distance::DistanceType(metric), n_queries, false, true, dim);
 
-  cuvs::neighbors::ivf_flat::detail::fill_refinement_index(handle,
-                                                           &refinement_index,
-                                                           dataset.data_handle(),
-                                                           neighbor_candidates.data_handle(),
-                                                           n_queries,
-                                                           n_candidates);
+  cuvs::neighbors::ivf_flat::detail::fill_refinement_index<data_t, idx_t>(
+    handle,
+    &refinement_index,
+    dataset.data_handle(),
+    neighbor_candidates.data_handle(),
+    static_cast<idx_t>(n_queries),
+    static_cast<uint32_t>(n_candidates));
   uint32_t grid_dim_x = 1;
 
   // the neighbor ids will be computed in uint32_t as offset

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -879,7 +879,7 @@ inline auto enum_variety_ip() -> test_cases_t
         // InnerProduct score is signed,
         // thus we're forced to used signed 8-bit representation,
         // thus we have one bit less precision
-        y.min_recall = y.min_recall.value() * 0.90;
+        y.min_recall = y.min_recall.value() * 0.88;
       } else {
         // In other cases it seems to perform a little bit better, still worse than L2
         y.min_recall = y.min_recall.value() * 0.94;

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -198,7 +198,7 @@ def setup(app):
 linkcode_resolve = make_linkcode_resolve(
     "cuvs",
     "https://github.com/rapidsai/cuvs/"
-    "blob/{revision}/python/cuvs/cuvs/"
+    "blob/{revision}/python/cuvs/"
     "{package}/{path}#L{lineno}",
 )