In sparse vector index, sort per-row indices and use two pointers to compute refined distance.

Signed-off-by: Buqian Zheng <[email protected]>
zhengbuqian committed Nov 22, 2023
1 parent 460800d commit 853e414
Showing 4 changed files with 60 additions and 49 deletions.
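
The idea behind the change, as a minimal standalone sketch (the names below are illustrative, not the knowhere API): each row is stored as (index, value) pairs sorted by index once at add time, so the refinement-stage dot product can merge two sorted rows with two pointers in O(nnz_a + nnz_b) instead of probing an unordered_map per element.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// One sparse row as (dimension index, value) pairs, kept sorted by index.
using SparseRow = std::vector<std::pair<uint32_t, float>>;

// Done once when a row is added; pairs compare by index first.
inline void sort_row(SparseRow& row) {
    std::sort(row.begin(), row.end());
}

// Two-pointer dot product over two index-sorted rows: advance whichever
// side holds the smaller index, accumulate when the indices match.
inline float dot(const SparseRow& a, const SparseRow& b) {
    float res = 0.0f;
    size_t i = 0, j = 0;
    while (i < a.size() && j < b.size()) {
        if (a[i].first == b[j].first) {
            res += a[i].second * b[j].second;
            ++i;
            ++j;
        } else if (a[i].first < b[j].first) {
            ++i;
        } else {
            ++j;
        }
    }
    return res;
}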
2 changes: 1 addition & 1 deletion include/knowhere/utils.h
@@ -100,7 +100,7 @@ namespace sparse {
// type used to represent the id of a vector in the index interface.
// this is the same as in other index types.
using label_t = int64_t;
// type used to represent the id of a vector inside the index.
// type used to represent the id, indices and indptr of a vector inside the index.
using table_t = uint32_t;

/**
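
One consequence of the widened role of table_t: indptr_ entries are now uint32_t (see the member declarations in sparse_inverted_index.h below), so the cumulative nnz of an index must fit in 32 bits. A compile-time reminder one could add — an assumption of this note, not something the patch contains:

// Hypothetical guard, not in the patch: indptr_ holds running nnz totals,
// so they must fit in table_t (uint32_t).
static_assert(sizeof(knowhere::sparse::table_t) == 4,
              "total nnz per index must stay below 2^32");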
102 changes: 58 additions & 44 deletions src/index/sparse/sparse_inverted_index.h
@@ -25,10 +25,11 @@ namespace knowhere::sparse {

// Not thread safe; concurrent access must be protected. Concurrent read operations are allowed.
// TODO: make class thread safe so we can perform concurrent add/search.
template <typename T, typename IndPtrT = int64_t, typename IndicesT = int32_t, typename ShapeT = int64_t>
template <typename T>
class InvertedIndex {
public:
explicit InvertedIndex() {
indptr_.push_back(0);
}

void
@@ -52,8 +53,8 @@ class InvertedIndex {
writeBinaryPOD(writer, indptr_[i]);
}
for (size_t i = 0; i < nnz_; ++i) {
writeBinaryPOD(writer, indices_[i]);
writeBinaryPOD(writer, data_[i]);
writeBinaryPOD(writer, data_[i].first);
writeBinaryPOD(writer, data_[i].second);
}
for (size_t i = 0; i < n_cols_; ++i) {
auto lut = inverted_lut_[i];
@@ -82,11 +83,10 @@ class InvertedIndex {
for (size_t i = 0; i <= n_rows_; ++i) {
readBinaryPOD(reader, indptr_[i]);
}
indices_.resize(nnz_);
data_.resize(nnz_);
for (size_t i = 0; i < nnz_; ++i) {
readBinaryPOD(reader, indices_[i]);
readBinaryPOD(reader, data_[i]);
readBinaryPOD(reader, data_[i].first);
readBinaryPOD(reader, data_[i].second);
}
inverted_lut_.resize(n_cols_);
for (size_t i = 0; i < n_cols_; ++i) {
@@ -107,6 +107,7 @@ class InvertedIndex {
return Status::success;
}

template <typename IndPtrT = int64_t, typename IndicesT = int32_t, typename ShapeT = int64_t>
Status
Add(const void* csr_matrix) {
size_t rows, cols, nnz;
@@ -115,17 +116,21 @@ class InvertedIndex {
const T* data;
parse_csr_matrix(csr_matrix, rows, cols, nnz, indptr, indices, data);

for (size_t i = 0; i < rows + 1; ++i) {
indptr_.push_back(nnz_ + indptr[i]);
}

// TODO: benchmark performance: for growing segments with lots of small
// csr_matrix to add, it may be better to rely on the vector's internal
// memory management to avoid frequent reallocations caused by reserve.
indices_.reserve(nnz_ + nnz);
indices_.insert(indices_.end(), indices, indices + nnz);
data_.reserve(nnz_ + nnz);
data_.insert(data_.end(), data, data + nnz);
for (size_t i = 0; i < nnz; ++i) {
data_.emplace_back(indices[i], data[i]);
}

for (size_t i = 1; i < rows + 1; ++i) {
indptr_.push_back(nnz_ + indptr[i]);
auto start = *(indptr_.rbegin() + 1);
auto end = *(indptr_.rbegin());
// make sure each row in data_ is sorted by index
std::sort(data_.begin() + start, data_.begin() + end);
}

if (n_cols_ < cols) {
n_cols_ = cols;
@@ -137,9 +142,10 @@

for (size_t i = n_rows_; i < n_rows_ + rows; ++i) {
for (IndPtrT j = indptr_[i]; j < indptr_[i + 1]; ++j) {
inverted_lut_[indices_[j]].emplace_back(i, data_[j]);
auto [idx, val] = data_[j];
inverted_lut_[idx].emplace_back(i, val);
if (use_wand_) {
max_in_dim_[indices_[j]] = std::max(max_in_dim_[indices_[j]], data_[j]);
max_in_dim_[idx] = std::max(max_in_dim_[idx], val);
}
}
}
@@ -149,6 +155,7 @@
return Status::success;
}

template <typename IndPtrT = int64_t, typename IndicesT = int32_t, typename ShapeT = int64_t>
void
Search(const void* query_csr_matrix, int64_t q_id, size_t k, float drop_ratio_search, float* distances,
label_t* labels, size_t refine_factor, const BitsetView& bitset) const {
@@ -169,43 +176,36 @@
refine_factor = 1;
}

std::vector<std::pair<IndicesT, T>> q_vec(len);
std::vector<std::pair<table_t, T>> q_vec(len);
for (size_t i = 0; i < len; ++i) {
q_vec[i] = std::make_pair(indices[i], data[i]);
}
std::sort(q_vec.begin(), q_vec.end(),
[](const auto& lhs, const auto& rhs) { return std::abs(lhs.second) > std::abs(rhs.second); });
while (!q_vec.empty() && q_vec[0].second * drop_ratio_search > q_vec.back().second) {
q_vec.pop_back();
}

MaxMinHeap<T> heap(k * refine_factor);
if (!use_wand_) {
search_brute_force(q_vec, heap, bitset);
search_brute_force(q_vec, drop_ratio_search, heap, bitset);
} else {
search_wand(q_vec, heap, bitset);
search_wand(q_vec, drop_ratio_search, heap, bitset);
}

// no refinement needed
if (refine_factor == 1) {
collect_result(heap, distances, labels);
} else {
// TODO tweak the map buckets number for best performance
std::unordered_map<IndicesT, T> q_map(4 * len);
for (size_t i = 0; i < len; ++i) {
q_map[indices[i]] = data[i];
}
refine_and_collect(q_map, heap, k, distances, labels);
std::sort(q_vec.begin(), q_vec.end(),
[](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; });
refine_and_collect(q_vec, heap, k, distances, labels);
}
}

[[nodiscard]] size_t
size() const {
size_t res = 0;
res += sizeof(*this);
res += sizeof(T) * data_.capacity();
res += sizeof(IndicesT) * indices_.capacity();
res += sizeof(IndPtrT) * indptr_.capacity();
res += sizeof(std::pair<table_t, T>) * data_.capacity();
res += sizeof(table_t) * indptr_.capacity();
res += sizeof(std::vector<Neighbor<T>>) * inverted_lut_.capacity();
for (auto& lut : inverted_lut_) {
res += sizeof(Neighbor<T>) * lut.capacity();
Expand All @@ -228,29 +228,39 @@ class InvertedIndex {

private:
[[nodiscard]] float
dot_product(const std::unordered_map<IndicesT, T>& q_map, table_t u) const {
dot_product(const std::vector<std::pair<table_t, T>>& q_vec, table_t u) const {
float res = 0.0f;
for (IndPtrT i = indptr_[u]; i < indptr_[u + 1]; ++i) {
auto idx = indices_[i];
float val = float(data_[i]);
auto it = q_map.find(idx);
if (it != q_map.end()) {
res += val * it->second;
table_t pu = indptr_[u];
table_t pq = 0;
while (pu < indptr_[u + 1] && pq < q_vec.size()) {
auto [idx, val] = data_[pu];
auto [q_idx, q_val] = q_vec[pq];
if (idx == q_idx) {
res += float(val) * float(q_val);
pu++;
pq++;
} else if (idx < q_idx) {
pu++;
} else {
pq++;
}
}
return res;
}

// find the top-k candidates using brute force search, k as specified by the capacity of the heap.
void
search_brute_force(const std::vector<std::pair<IndicesT, T>>& q_vec, MaxMinHeap<T>& heap,
search_brute_force(const std::vector<std::pair<table_t, T>>& q_vec, float drop_ratio, MaxMinHeap<T>& heap,
const BitsetView& bitset) const {
std::vector<float> scores(n_rows_, 0.0f);
for (auto [i, v] : q_vec) {
for (size_t j = 0; j < inverted_lut_[i].size(); j++) {
auto [idx, val] = inverted_lut_[i][j];
scores[idx] += v * float(val);
}
if (q_vec[0].second * drop_ratio > v) {
break;
}
}
for (size_t i = 0; i < n_rows_; ++i) {
if ((bitset.empty() || !bitset.test(i)) && scores[i] != 0) {
@@ -322,11 +332,16 @@
}; // class Cursor

void
search_wand(std::vector<std::pair<IndicesT, T>>& q_vec, MaxMinHeap<T>& heap, const BitsetView& bitset) const {
search_wand(std::vector<std::pair<table_t, T>>& q_vec, float drop_ratio, MaxMinHeap<T>& heap,
const BitsetView& bitset) const {
auto q_dim = q_vec.size();
std::vector<std::shared_ptr<Cursor>> cursors(q_dim);
for (size_t i = 0; i < q_dim; ++i) {
auto [idx, val] = q_vec[i];
if (q_vec[0].second * drop_ratio > val) {
cursors.resize(i);
break;
}
cursors[i] = std::make_shared<Cursor>(inverted_lut_[idx], n_rows_, max_in_dim_[idx] * val, val, bitset);
}
auto sort_cursors = [&cursors] {
@@ -380,15 +395,15 @@
}

void
refine_and_collect(const std::unordered_map<IndicesT, T>& q_map, MaxMinHeap<T>& inaccurate, size_t k,
refine_and_collect(const std::vector<std::pair<table_t, T>>& q_vec, MaxMinHeap<T>& inaccurate, size_t k,
float* distances, label_t* labels) const {
std::priority_queue<Neighbor<T>, std::vector<Neighbor<T>>, std::greater<Neighbor<T>>> heap;

while (!inaccurate.empty()) {
auto [u, d] = inaccurate.top();
inaccurate.pop();

auto dist_acc = dot_product(q_map, u);
auto dist_acc = dot_product(q_vec, u);
if (heap.size() < k) {
heap.emplace(u, dist_acc);
} else if (heap.top().distance < dist_acc) {
@@ -415,9 +430,8 @@
size_t nnz_ = 0;
std::vector<std::vector<Neighbor<T>>> inverted_lut_;

std::vector<T> data_;
std::vector<IndicesT> indices_;
std::vector<IndPtrT> indptr_;
std::vector<std::pair<table_t, T>> data_;
std::vector<table_t> indptr_;

bool use_wand_ = false;
float drop_ratio_build_ = 0;
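
Note how the drop_ratio_search pruning moved: instead of truncating q_vec up front in Search, search_brute_force and search_wand each stop once a query term falls below the drop threshold, relying on q_vec being sorted by descending absolute value. A hedged sketch of that cutoff as a standalone helper (the patch inlines the comparison rather than calling anything like this):

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Given query terms sorted by descending |value|, count how many survive
// drop-ratio pruning: terms with |value| < drop_ratio * |largest| are skipped.
size_t surviving_terms(const std::vector<std::pair<uint32_t, float>>& q_vec,
                       float drop_ratio) {
    if (q_vec.empty()) {
        return 0;
    }
    const float cutoff = std::fabs(q_vec[0].second) * drop_ratio;
    for (size_t i = 1; i < q_vec.size(); ++i) {
        if (std::fabs(q_vec[i].second) < cutoff) {
            return i;
        }
    }
    return q_vec.size();
}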
4 changes: 0 additions & 4 deletions tests/ut/test_sparse.cc
@@ -24,12 +24,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") {
{2000, 300, 0.95, 0.97},
// 300 dim, avg doc nnz 9, avg query nnz 3
{2000, 300, 0.97, 0.99},
// 3000 dim, avg doc nnz 120, avg query nnz 90
{20000, 3000, 0.95, 0.97},
// 3000 dim, avg doc nnz 90, avg query nnz 30
{20000, 3000, 0.97, 0.99},
// 30000 dim, avg doc nnz 60, avg query nnz 30
{100000, 30000, 0.998, 0.999},
}));
auto topk = 5;
int64_t nq = GENERATE(10, 100);
1 change: 1 addition & 0 deletions tests/ut/utils.h
@@ -233,6 +233,7 @@ GenTestVersionList() {
return GENERATE(as<int32_t>{}, knowhere::Version::GetCurrentVersion().VersionNumber());
}

// Generate a sparse dataset with the given sparsity. The indices of each row may not be ordered.
template <typename ValueT = float, typename IndPtrT = int64_t, typename IndicesT = int32_t, typename ShapeT = int64_t>
inline knowhere::DataSetPtr
GenSparseDataSet(ShapeT rows, ShapeT cols, float sparsity, int seed = 42) {
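
To see why the new comment matters, here is a hedged sketch of how such a row might be produced — random draws land in no particular order, which the index now tolerates because Add sorts each row. Illustrative only; the real generator is GenSparseDataSet above:

#include <cstddef>
#include <cstdint>
#include <random>
#include <unordered_set>
#include <utility>
#include <vector>

// Draw nnz distinct dimensions for one row; insertion order is random,
// so the returned indices are generally unsorted. Assumes dim > 0.
std::vector<std::pair<uint32_t, float>>
random_sparse_row(uint32_t dim, size_t nnz, std::mt19937& rng) {
    std::uniform_int_distribution<uint32_t> pick_dim(0, dim - 1);
    std::uniform_real_distribution<float> pick_val(0.0f, 1.0f);
    std::unordered_set<uint32_t> used;
    std::vector<std::pair<uint32_t, float>> row;
    while (row.size() < nnz) {
        uint32_t d = pick_dim(rng);
        if (used.insert(d).second) {
            row.emplace_back(d, pick_val(rng));
        }
    }
    return row;
}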
