diff --git a/include/knowhere/utils.h b/include/knowhere/utils.h index 2fcefc663..25062b941 100644 --- a/include/knowhere/utils.h +++ b/include/knowhere/utils.h @@ -100,7 +100,7 @@ namespace sparse { // type used to represent the id of a vector in the index interface. // this is same as other index types. using label_t = int64_t; -// type used to represent the id of a vector inside the index. +// type used to represent the id, indices and indptr of a vector inside the index. using table_t = uint32_t; /** diff --git a/src/index/sparse/sparse_inverted_index.h b/src/index/sparse/sparse_inverted_index.h index 0246d8410..bb1b852ab 100644 --- a/src/index/sparse/sparse_inverted_index.h +++ b/src/index/sparse/sparse_inverted_index.h @@ -25,10 +25,11 @@ namespace knowhere::sparse { // Not thread safe, concurrent access must be protected. Concurrent read operations are allowed. // TODO: make class thread safe so we can perform concurrent add/search. -template +template class InvertedIndex { public: explicit InvertedIndex() { + indptr_.push_back(0); } void @@ -52,8 +53,8 @@ class InvertedIndex { writeBinaryPOD(writer, indptr_[i]); } for (size_t i = 0; i < nnz_; ++i) { - writeBinaryPOD(writer, indices_[i]); - writeBinaryPOD(writer, data_[i]); + writeBinaryPOD(writer, data_[i].first); + writeBinaryPOD(writer, data_[i].second); } for (size_t i = 0; i < n_cols_; ++i) { auto lut = inverted_lut_[i]; @@ -82,11 +83,10 @@ class InvertedIndex { for (size_t i = 0; i <= n_rows_; ++i) { readBinaryPOD(reader, indptr_[i]); } - indices_.resize(nnz_); data_.resize(nnz_); for (size_t i = 0; i < nnz_; ++i) { - readBinaryPOD(reader, indices_[i]); - readBinaryPOD(reader, data_[i]); + readBinaryPOD(reader, data_[i].first); + readBinaryPOD(reader, data_[i].second); } inverted_lut_.resize(n_cols_); for (size_t i = 0; i < n_cols_; ++i) { @@ -107,6 +107,7 @@ class InvertedIndex { return Status::success; } + template Status Add(const void* csr_matrix) { size_t rows, cols, nnz; @@ -115,17 +116,21 @@ class InvertedIndex { const T* data; parse_csr_matrix(csr_matrix, rows, cols, nnz, indptr, indices, data); - for (size_t i = 0; i < rows + 1; ++i) { - indptr_.push_back(nnz_ + indptr[i]); - } - // TODO: benchmark performance: for growing segments with lots of small // csr_matrix to add, it may be better to rely on the vector's internal // memory management to avoid frequent reallocations caused by reserve. - indices_.reserve(nnz_ + nnz); - indices_.insert(indices_.end(), indices, indices + nnz); data_.reserve(nnz_ + nnz); - data_.insert(data_.end(), data, data + nnz); + for (size_t i = 0; i < nnz; ++i) { + data_.emplace_back(indices[i], data[i]); + } + + for (size_t i = 1; i < rows + 1; ++i) { + indptr_.push_back(nnz_ + indptr[i]); + auto start = *(indptr_.rbegin() + 1); + auto end = *(indptr_.rbegin()); + // make sure each row in data_ is sorted by index + std::sort(data_.begin() + start, data_.begin() + end); + } if (n_cols_ < cols) { n_cols_ = cols; @@ -137,9 +142,10 @@ class InvertedIndex { for (size_t i = n_rows_; i < n_rows_ + rows; ++i) { for (IndPtrT j = indptr_[i]; j < indptr_[i + 1]; ++j) { - inverted_lut_[indices_[j]].emplace_back(i, data_[j]); + auto [idx, val] = data_[j]; + inverted_lut_[idx].emplace_back(i, val); if (use_wand_) { - max_in_dim_[indices_[j]] = std::max(max_in_dim_[indices_[j]], data_[j]); + max_in_dim_[idx] = std::max(max_in_dim_[idx], val); } } } @@ -149,6 +155,7 @@ class InvertedIndex { return Status::success; } + template void Search(const void* query_csr_matrix, int64_t q_id, size_t k, float drop_ratio_search, float* distances, label_t* labels, size_t refine_factor, const BitsetView& bitset) const { @@ -169,33 +176,27 @@ class InvertedIndex { refine_factor = 1; } - std::vector> q_vec(len); + std::vector> q_vec(len); for (size_t i = 0; i < len; ++i) { q_vec[i] = std::make_pair(indices[i], data[i]); } std::sort(q_vec.begin(), q_vec.end(), [](const auto& lhs, const auto& rhs) { return std::abs(lhs.second) > std::abs(rhs.second); }); - while (!q_vec.empty() && q_vec[0].second * drop_ratio_search > q_vec.back().second) { - q_vec.pop_back(); - } MaxMinHeap heap(k * refine_factor); if (!use_wand_) { - search_brute_force(q_vec, heap, bitset); + search_brute_force(q_vec, drop_ratio_search, heap, bitset); } else { - search_wand(q_vec, heap, bitset); + search_wand(q_vec, drop_ratio_search, heap, bitset); } // no refinement needed if (refine_factor == 1) { collect_result(heap, distances, labels); } else { - // TODO tweak the map buckets number for best performance - std::unordered_map q_map(4 * len); - for (size_t i = 0; i < len; ++i) { - q_map[indices[i]] = data[i]; - } - refine_and_collect(q_map, heap, k, distances, labels); + std::sort(q_vec.begin(), q_vec.end(), + [](const auto& lhs, const auto& rhs) { return lhs.first < rhs.first; }); + refine_and_collect(q_vec, heap, k, distances, labels); } } @@ -203,9 +204,8 @@ class InvertedIndex { size() const { size_t res = 0; res += sizeof(*this); - res += sizeof(T) * data_.capacity(); - res += sizeof(IndicesT) * indices_.capacity(); - res += sizeof(IndPtrT) * indptr_.capacity(); + res += sizeof(std::pair) * data_.capacity(); + res += sizeof(table_t) * indptr_.capacity(); res += sizeof(std::vector>) * inverted_lut_.capacity(); for (auto& lut : inverted_lut_) { res += sizeof(Neighbor) * lut.capacity(); @@ -228,14 +228,21 @@ class InvertedIndex { private: [[nodiscard]] float - dot_product(const std::unordered_map& q_map, table_t u) const { + dot_product(const std::vector>& q_vec, table_t u) const { float res = 0.0f; - for (IndPtrT i = indptr_[u]; i < indptr_[u + 1]; ++i) { - auto idx = indices_[i]; - float val = float(data_[i]); - auto it = q_map.find(idx); - if (it != q_map.end()) { - res += val * it->second; + table_t pu = indptr_[u]; + table_t pq = 0; + while (pu < indptr_[u + 1] && pq < q_vec.size()) { + auto [idx, val] = data_[pu]; + auto [q_idx, q_val] = q_vec[pq]; + if (idx == q_idx) { + res += float(val) * float(q_val); + pu++; + pq++; + } else if (idx < q_idx) { + pu++; + } else { + pq++; } } return res; @@ -243,7 +250,7 @@ class InvertedIndex { // find the top-k candidates using brute force search, k as specified by the capacity of the heap. void - search_brute_force(const std::vector>& q_vec, MaxMinHeap& heap, + search_brute_force(const std::vector>& q_vec, float drop_ratio, MaxMinHeap& heap, const BitsetView& bitset) const { std::vector scores(n_rows_, 0.0f); for (auto [i, v] : q_vec) { @@ -251,6 +258,9 @@ class InvertedIndex { auto [idx, val] = inverted_lut_[i][j]; scores[idx] += v * float(val); } + if (q_vec[0].second * drop_ratio > v) { + break; + } } for (size_t i = 0; i < n_rows_; ++i) { if ((bitset.empty() || !bitset.test(i)) && scores[i] != 0) { @@ -322,11 +332,16 @@ class InvertedIndex { }; // class Cursor void - search_wand(std::vector>& q_vec, MaxMinHeap& heap, const BitsetView& bitset) const { + search_wand(std::vector>& q_vec, float drop_ratio, MaxMinHeap& heap, + const BitsetView& bitset) const { auto q_dim = q_vec.size(); std::vector> cursors(q_dim); for (size_t i = 0; i < q_dim; ++i) { auto [idx, val] = q_vec[i]; + if (q_vec[0].second * drop_ratio > val) { + cursors.resize(i); + break; + } cursors[i] = std::make_shared(inverted_lut_[idx], n_rows_, max_in_dim_[idx] * val, val, bitset); } auto sort_cursors = [&cursors] { @@ -380,7 +395,7 @@ class InvertedIndex { } void - refine_and_collect(const std::unordered_map& q_map, MaxMinHeap& inaccurate, size_t k, + refine_and_collect(const std::vector>& q_vec, MaxMinHeap& inaccurate, size_t k, float* distances, label_t* labels) const { std::priority_queue, std::vector>, std::greater>> heap; @@ -388,7 +403,7 @@ class InvertedIndex { auto [u, d] = inaccurate.top(); inaccurate.pop(); - auto dist_acc = dot_product(q_map, u); + auto dist_acc = dot_product(q_vec, u); if (heap.size() < k) { heap.emplace(u, dist_acc); } else if (heap.top().distance < dist_acc) { @@ -415,9 +430,8 @@ class InvertedIndex { size_t nnz_ = 0; std::vector>> inverted_lut_; - std::vector data_; - std::vector indices_; - std::vector indptr_; + std::vector> data_; + std::vector indptr_; bool use_wand_ = false; float drop_ratio_build_ = 0; diff --git a/tests/ut/test_sparse.cc b/tests/ut/test_sparse.cc index 8c2752c65..439ee1380 100644 --- a/tests/ut/test_sparse.cc +++ b/tests/ut/test_sparse.cc @@ -24,12 +24,8 @@ TEST_CASE("Test Mem Sparse Index With Float Vector", "[float metrics]") { {2000, 300, 0.95, 0.97}, // 300 dim, avg doc nnz 9, avg query nnz 3 {2000, 300, 0.97, 0.99}, - // 3000 dim, avg doc nnz 120, avg query nnz 90 - {20000, 3000, 0.95, 0.97}, // 3000 dim, avg doc nnz 90, avg query nnz 30 {20000, 3000, 0.97, 0.99}, - // 30000 dim, avg doc nnz 60, avg query nnz 30 - {100000, 30000, 0.998, 0.999}, })); auto topk = 5; int64_t nq = GENERATE(10, 100); diff --git a/tests/ut/utils.h b/tests/ut/utils.h index 8b98aa16c..ef75e3cd2 100644 --- a/tests/ut/utils.h +++ b/tests/ut/utils.h @@ -233,6 +233,7 @@ GenTestVersionList() { return GENERATE(as{}, knowhere::Version::GetCurrentVersion().VersionNumber()); } +// Generate a sparse dataset with given sparsity. The indices of each row may be not ordered. template inline knowhere::DataSetPtr GenSparseDataSet(ShapeT rows, ShapeT cols, float sparsity, int seed = 42) {