Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into account-for-raft-update
Browse files Browse the repository at this point in the history
  • Loading branch information
viclafargue committed Jan 20, 2025
2 parents 45a41fa + bd603a9 commit b0c7ab9
Show file tree
Hide file tree
Showing 21 changed files with 253 additions and 251 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ There are several benefits to using cuVS and GPUs for vector search, including
6. Multiple language support
7. Building blocks for composing new or accelerating existing algorithms

In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a deslightful development experimence, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale.
In addition to the items above, cuVS takes on the burden of keeping non-trivial accelerated code up to date as new NVIDIA architectures and CUDA versions are released. This provides a delightful development experience, guaranteeing that any libraries, databases, or applications built on top of it will always be getting the best performance and scale.

## cuVS Technology Stack

Expand Down
2 changes: 1 addition & 1 deletion ci/run_cuvs_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ set -euo pipefail
# Support invoking run_cuvs_pytests.sh outside the script directory
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuvs/cuvs

pytest --cache-clear --verbose "$@" tests
pytest --cache-clear --verbose "$@" test
33 changes: 32 additions & 1 deletion cpp/include/cuvs/core/detail/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,6 @@ inline MdspanType from_dlpack(DLManagedTensor* managed_tensor)
RAFT_EXPECTS(to_data_type.lanes == tensor.dtype.lanes,
"lanes mismatch between return mdspan and DLTensor");
RAFT_EXPECTS(tensor.dtype.lanes == 1, "More than 1 DLTensor lanes not supported");
RAFT_EXPECTS(tensor.strides == nullptr, "Strided memory layout for DLTensor not supported");

auto to_device = accessor_type_to_DLDevice<typename MdspanType::accessor_type>();
if (to_device.device_type == kDLCUDA) {
Expand All @@ -110,4 +109,36 @@ inline MdspanType from_dlpack(DLManagedTensor* managed_tensor)
return MdspanType{reinterpret_cast<typename MdspanType::data_handle_type>(tensor.data), exts};
}

/**
 * @brief Check whether a DLManagedTensor describes a compact column-major layout.
 *
 * Walks the dimensions from first to last and requires the stride of each one to
 * equal the product of the extents of all dimensions before it (1 for the first).
 *
 * @param managed_tensor tensor whose `dl_tensor` shape/strides are inspected
 * @return true if every stride matches the column-major expectation; false if any
 *         stride differs or if the tensor carries no stride array
 */
inline bool is_f_contiguous(DLManagedTensor* managed_tensor)
{
  const auto& dl = managed_tensor->dl_tensor;

  // A tensor without explicit strides is never reported as column-major here.
  if (dl.strides == nullptr) { return false; }

  int64_t running_extent = 1;
  int64_t axis           = 0;
  while (axis < dl.ndim) {
    if (dl.strides[axis] != running_extent) { return false; }
    running_extent *= dl.shape[axis];
    ++axis;
  }

  return true;
}

/**
 * @brief Check whether a DLManagedTensor describes a compact row-major layout.
 *
 * Walks the dimensions from last to first and requires the stride of each one to
 * equal the product of the extents of all dimensions after it (1 for the last).
 *
 * @param managed_tensor tensor whose `dl_tensor` shape/strides are inspected
 * @return true if the tensor has no stride array or every stride matches the
 *         row-major expectation; false otherwise
 */
inline bool is_c_contiguous(DLManagedTensor* managed_tensor)
{
  const auto& dl = managed_tensor->dl_tensor;

  // no stride information indicates a row-major tensor according to the dlpack spec
  if (dl.strides == nullptr) { return true; }

  int64_t running_extent = 1;
  for (int64_t remaining = dl.ndim; remaining > 0; --remaining) {
    const int64_t axis = remaining - 1;
    if (dl.strides[axis] != running_extent) { return false; }
    running_extent *= dl.shape[axis];
  }

  return true;
}

} // namespace cuvs::core::detail
18 changes: 17 additions & 1 deletion cpp/include/cuvs/core/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,25 @@ inline bool is_dlpack_host_compatible(DLTensor tensor)
return detail::is_dlpack_host_compatible(tensor);
}

/**
 * @brief Report whether a DLManagedTensor is laid out in row-major (c-contiguous) order
 *
 * Forwards to the implementation in the `detail` namespace.
 *
 * @param tensor DLManagedTensor object to inspect
 * @return bool result of the detail-level row-major check
 */
inline bool is_c_contiguous(DLManagedTensor* tensor)
{
  const bool row_major = detail::is_c_contiguous(tensor);
  return row_major;
}

/**
 * @brief Report whether a DLManagedTensor is laid out in col-major (f-contiguous) order
 *
 * Forwards to the implementation in the `detail` namespace.
 *
 * @param tensor DLManagedTensor object to inspect
 * @return bool result of the detail-level col-major check
 */
inline bool is_f_contiguous(DLManagedTensor* tensor)
{
  const bool col_major = detail::is_f_contiguous(tensor);
  return col_major;
}

/**
* @brief Convert a DLManagedTensor to an mdspan
* NOTE: This function only supports compact row-major layouts.
* NOTE: This function only supports compact row-major and col-major layouts.
*
* @code {.cpp}
* #include <raft/core/device_mdspan.hpp>
Expand Down
45 changes: 45 additions & 0 deletions cpp/include/cuvs/neighbors/refine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,51 @@ void refine(raft::resources const& handle,
raft::device_matrix_view<float, int64_t, raft::row_major> distances,
cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded);

/**
* @brief Refine nearest neighbor search.
*
* Refinement is an operation that follows an approximate NN search. The approximate search has
* already selected n_candidates neighbor candidates for each query. We narrow it down to k
* neighbors. For each query, we calculate the exact distance between the query and its
* n_candidates neighbor candidates, and select the k nearest ones.
*
* The k nearest neighbors and distances are returned.
*
* This overload takes the candidate indices and writes the refined indices as uint32_t.
*
* Example usage
* @code{.cpp}
* using namespace cuvs::neighbors;
* // use default index parameters
* ivf_pq::index_params index_params;
* // create and fill the index from a [N, D] dataset
* auto index = ivf_pq::build(handle, index_params, dataset);
* // use default search parameters
* ivf_pq::search_params search_params;
* // search m = 4 * k nearest neighbours for each of the N queries
* ivf_pq::search(handle, search_params, index, queries, neighbor_candidates,
* out_dists_tmp);
* // refine it to the k nearest ones
* refine(handle, dataset, queries, neighbor_candidates, out_indices, out_dists,
* index.metric());
* @endcode
*
*
* @param[in] handle the raft handle
* @param[in] dataset device matrix that stores the dataset [n_rows, dims]
* @param[in] queries device matrix of the queries [n_queries, dims]
* @param[in] neighbor_candidates indices of candidate vectors [n_queries, n_candidates], where
* n_candidates >= k
* @param[out] indices device matrix that stores the refined indices [n_queries, k]
* @param[out] distances device matrix that stores the refined distances [n_queries, k]
* @param[in] metric distance metric to use. Euclidean (L2) is used by default
*/
void refine(raft::resources const& handle,
raft::device_matrix_view<const float, int64_t, raft::row_major> dataset,
raft::device_matrix_view<const float, int64_t, raft::row_major> queries,
raft::device_matrix_view<const uint32_t, int64_t, raft::row_major> neighbor_candidates,
raft::device_matrix_view<uint32_t, int64_t, raft::row_major> indices,
raft::device_matrix_view<float, int64_t, raft::row_major> distances,
cuvs::distance::DistanceType metric = cuvs::distance::DistanceType::L2Unexpanded);

/**
* @brief Refine nearest neighbor search.
*
Expand Down
73 changes: 60 additions & 13 deletions cpp/src/distance/pairwise_distance_c.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@

namespace {

template <typename T, typename DistT>
template <typename T, typename DistT, typename LayoutT = raft::row_major>
void _pairwise_distance(cuvsResources_t res,
DLManagedTensor* x_tensor,
DLManagedTensor* y_tensor,
Expand All @@ -39,8 +39,8 @@ void _pairwise_distance(cuvsResources_t res,
{
auto res_ptr = reinterpret_cast<raft::resources*>(res);

using mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, raft::row_major>;
using mdspan_type = raft::device_matrix_view<T const, int64_t, LayoutT>;
using distances_mdspan_type = raft::device_matrix_view<DistT, int64_t, LayoutT>;

auto x_mds = cuvs::core::from_dlpack<mdspan_type>(x_tensor);
auto y_mds = cuvs::core::from_dlpack<mdspan_type>(y_tensor);
Expand Down Expand Up @@ -70,17 +70,64 @@ extern "C" cuvsError_t cuvsPairwiseDistance(cuvsResources_t res,
RAFT_FAIL("Inputs to cuvsPairwiseDistance must all have the same dtype");
}

if (x_dt.bits == 32) {
_pairwise_distance<float, float>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else if (x_dt.bits == 16) {
_pairwise_distance<half, float>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else if (x_dt.bits == 64) {
_pairwise_distance<double, double>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
bool x_row_major;
if (cuvs::core::is_c_contiguous(x_tensor)) {
x_row_major = true;
} else if (cuvs::core::is_f_contiguous(x_tensor)) {
x_row_major = false;
} else {
RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
RAFT_FAIL("X input to cuvsPairwiseDistance must be contiguous (non-strided)");
}

bool y_row_major;
if (cuvs::core::is_c_contiguous(y_tensor)) {
y_row_major = true;
} else if (cuvs::core::is_f_contiguous(y_tensor)) {
y_row_major = false;
} else {
RAFT_FAIL("Y input to cuvsPairwiseDistance must be contiguous (non-strided)");
}

bool distances_row_major;
if (cuvs::core::is_c_contiguous(distances_tensor)) {
distances_row_major = true;
} else if (cuvs::core::is_f_contiguous(distances_tensor)) {
distances_row_major = false;
} else {
RAFT_FAIL("distances input to cuvsPairwiseDistance must be contiguous (non-strided)");
}

if ((x_row_major != y_row_major) || (x_row_major != distances_row_major)) {
RAFT_FAIL(
"Inputs to cuvsPairwiseDistance must all have the same layout (row-major or col-major)");
}

if (x_row_major) {
if (x_dt.bits == 32) {
_pairwise_distance<float, float>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else if (x_dt.bits == 16) {
_pairwise_distance<half, float>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else if (x_dt.bits == 64) {
_pairwise_distance<double, double>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else {
RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
}
} else {
if (x_dt.bits == 32) {
_pairwise_distance<float, float, raft::col_major>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else if (x_dt.bits == 16) {
_pairwise_distance<half, float, raft::col_major>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else if (x_dt.bits == 64) {
_pairwise_distance<double, double, raft::col_major>(
res, x_tensor, y_tensor, distances_tensor, metric, metric_arg);
} else {
RAFT_FAIL("Unsupported DLtensor dtype: %d and bits: %d", x_dt.code, x_dt.bits);
}
}
});
}
28 changes: 21 additions & 7 deletions cpp/src/neighbors/brute_force_c.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@

namespace {

template <typename T>
template <typename T, typename LayoutT = raft::row_major>
void* _build(cuvsResources_t res,
DLManagedTensor* dataset_tensor,
cuvsDistanceType metric,
T metric_arg)
{
auto res_ptr = reinterpret_cast<raft::resources*>(res);

using mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
using mdspan_type = raft::device_matrix_view<T const, int64_t, LayoutT>;
auto mds = cuvs::core::from_dlpack<mdspan_type>(dataset_tensor);

cuvs::neighbors::brute_force::index_params params;
Expand All @@ -53,7 +53,7 @@ void* _build(cuvsResources_t res,
return index_on_heap;
}

template <typename T>
template <typename T, typename QueriesLayoutT = raft::row_major>
void _search(cuvsResources_t res,
cuvsBruteForceIndex index,
DLManagedTensor* queries_tensor,
Expand All @@ -64,7 +64,7 @@ void _search(cuvsResources_t res,
auto res_ptr = reinterpret_cast<raft::resources*>(res);
auto index_ptr = reinterpret_cast<cuvs::neighbors::brute_force::index<T>*>(index.addr);

using queries_mdspan_type = raft::device_matrix_view<T const, int64_t, raft::row_major>;
using queries_mdspan_type = raft::device_matrix_view<T const, int64_t, QueriesLayoutT>;
using neighbors_mdspan_type = raft::device_matrix_view<int64_t, int64_t, raft::row_major>;
using distances_mdspan_type = raft::device_matrix_view<float, int64_t, raft::row_major>;
using prefilter_mds_type = raft::device_vector_view<const uint32_t, int64_t>;
Expand Down Expand Up @@ -150,8 +150,15 @@ extern "C" cuvsError_t cuvsBruteForceBuild(cuvsResources_t res,
auto dataset = dataset_tensor->dl_tensor;

if (dataset.dtype.code == kDLFloat && dataset.dtype.bits == 32) {
index->addr =
reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
if (cuvs::core::is_c_contiguous(dataset_tensor)) {
index->addr =
reinterpret_cast<uintptr_t>(_build<float>(res, dataset_tensor, metric, metric_arg));
} else if (cuvs::core::is_f_contiguous(dataset_tensor)) {
index->addr = reinterpret_cast<uintptr_t>(
_build<float, raft::col_major>(res, dataset_tensor, metric, metric_arg));
} else {
RAFT_FAIL("dataset input to cuvsBruteForceBuild must be contiguous (non-strided)");
}
index->dtype = dataset.dtype;
} else {
RAFT_FAIL("Unsupported dataset DLtensor dtype: %d and bits: %d",
Expand Down Expand Up @@ -189,7 +196,14 @@ extern "C" cuvsError_t cuvsBruteForceSearch(cuvsResources_t res,
RAFT_EXPECTS(queries.dtype.code == index.dtype.code, "type mismatch between index and queries");

if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
_search<float>(res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
if (cuvs::core::is_c_contiguous(queries_tensor)) {
_search<float>(res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
} else if (cuvs::core::is_f_contiguous(queries_tensor)) {
_search<float, raft::col_major>(
res, index, queries_tensor, neighbors_tensor, distances_tensor, prefilter);
} else {
RAFT_FAIL("queries input to cuvsBruteForceSearch must be contiguous (non-strided)");
}
} else {
RAFT_FAIL("Unsupported queries DLtensor dtype: %d and bits: %d",
queries.dtype.code,
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/neighbors/detail/dynamic_batching.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,8 @@ enum struct slot_state : int32_t {
struct batch_token {
uint64_t value = 0;

constexpr inline batch_token() {}
explicit constexpr inline batch_token(uint32_t buffer_id) { id() = buffer_id; }
constexpr inline batch_token() = default;
RAFT_INLINE_FUNCTION explicit batch_token(uint32_t buffer_id) { id() = buffer_id; }

/**
* Sequential id of the batch in the array of batches.
Expand Down Expand Up @@ -492,7 +492,7 @@ struct batch_queue_t {
* NB: "round" is the number of times the queue counters went over the whole ring buffer.
* It's used to avoid the ABA problem for atomic token updates.
*/
static constexpr inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
static inline auto make_empty_token(seq_order_id seq_id) noexcept -> batch_token
{
// Modify the seq_id to identify that the token slot is empty
auto empty_round = static_cast<uint32_t>(slot_state::kEmptyPast) * kSize;
Expand Down
1 change: 1 addition & 0 deletions cpp/src/neighbors/ivf_flat_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ void index<T, IdxT>::check_consistency()
"inconsistent number of lists (clusters)");
}

template struct index<float, uint32_t>; // Used for refine function
template struct index<float, int64_t>;
template struct index<half, int64_t>;
template struct index<int8_t, int64_t>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,5 +43,6 @@
}

instantiate_cuvs_neighbors_refine_d(int64_t, float, float, int64_t);
instantiate_cuvs_neighbors_refine_d(uint32_t, float, float, int64_t);

#undef instantiate_cuvs_neighbors_refine_d
13 changes: 7 additions & 6 deletions cpp/src/neighbors/refine/refine_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,13 @@ void refine_device(
cuvs::neighbors::ivf_flat::index<data_t, idx_t> refinement_index(
handle, cuvs::distance::DistanceType(metric), n_queries, false, true, dim);

cuvs::neighbors::ivf_flat::detail::fill_refinement_index(handle,
&refinement_index,
dataset.data_handle(),
neighbor_candidates.data_handle(),
n_queries,
n_candidates);
cuvs::neighbors::ivf_flat::detail::fill_refinement_index<data_t, idx_t>(
handle,
&refinement_index,
dataset.data_handle(),
neighbor_candidates.data_handle(),
static_cast<idx_t>(n_queries),
static_cast<uint32_t>(n_candidates));
uint32_t grid_dim_x = 1;

// the neighbor ids will be computed in uint32_t as offset
Expand Down
2 changes: 1 addition & 1 deletion cpp/test/neighbors/ann_ivf_pq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -879,7 +879,7 @@ inline auto enum_variety_ip() -> test_cases_t
// InnerProduct score is signed,
// thus we're forced to use a signed 8-bit representation,
// thus we have one bit less precision
y.min_recall = y.min_recall.value() * 0.90;
y.min_recall = y.min_recall.value() * 0.88;
} else {
// In other cases it seems to perform a little bit better, still worse than L2
y.min_recall = y.min_recall.value() * 0.94;
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def setup(app):
linkcode_resolve = make_linkcode_resolve(
"cuvs",
"https://github.com/rapidsai/cuvs/"
"blob/{revision}/python/cuvs/cuvs/"
"blob/{revision}/python/cuvs/"
"{package}/{path}#L{lineno}",
)

Expand Down
Loading

0 comments on commit b0c7ab9

Please sign in to comment.