From de86a8852d3179e8c4709ffe88bf1a2a82f36b74 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 16:45:07 -0700 Subject: [PATCH 01/15] yeet --- CMakeLists.txt | 9 +- TESTING_RECALL.md | 2 +- examples/python/EXAMPLES.md | 8 +- examples/python/example.py | 2 +- examples/python/example_filter.py | 2 +- examples/python/example_replace_deleted.py | 2 +- examples/python/example_search.py | 2 +- examples/python/example_serialization.py | 2 +- examples/python/pyw_hnswlib.py | 4 +- hnswlib/bruteforce.h | 2 +- hnswlib/hnswalg.h | 19 +-- hnswlib/hnswlib.h | 2 +- python_bindings/LazyIndex.py | 4 +- python_bindings/bindings.cpp | 53 ++++---- setup.py | 8 +- tests/cpp/sift_1b.cpp | 5 +- tests/cpp/sift_test.cpp | 127 +++++++++--------- tests/cpp/updates_test.cpp | 5 +- tests/python/bindings_test.py | 2 +- tests/python/bindings_test_filter.py | 2 +- tests/python/bindings_test_getdata.py | 2 +- tests/python/bindings_test_labels.py | 6 +- tests/python/bindings_test_metadata.py | 2 +- tests/python/bindings_test_pickle.py | 22 ++- tests/python/bindings_test_recall.py | 2 +- tests/python/bindings_test_replace.py | 6 +- tests/python/bindings_test_resize.py | 2 +- tests/python/bindings_test_spaces.py | 2 +- .../python/bindings_test_stress_mt_replace.py | 2 +- tests/python/speedtest.py | 39 +++--- 30 files changed, 181 insertions(+), 166 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 03d4f3fe..ab614081 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,14 +6,15 @@ add_library(hnswlib INTERFACE) target_include_directories(hnswlib INTERFACE .) 
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) - set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD_REQUIRED ON) if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") + SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++17 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize") elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) + SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++17 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" ) elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) + SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++17 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) endif() # examples diff --git a/TESTING_RECALL.md b/TESTING_RECALL.md index 29136ec8..ffe7dff5 100644 --- a/TESTING_RECALL.md +++ b/TESTING_RECALL.md @@ -59,7 +59,7 @@ bf_index.init_index(max_elements=num_elements) # Controlling the recall for hnsw by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(200) +hnsw_index.set_ef_search_default(200) # Set number of threads used during batch search/construction in hnsw # By default using all available cores diff --git a/examples/python/EXAMPLES.md b/examples/python/EXAMPLES.md index 6c1b20e4..2b9b7005 100644 --- a/examples/python/EXAMPLES.md +++ b/examples/python/EXAMPLES.md @@ -23,7 +23,7 @@ p.init_index(max_elements = num_elements, ef_construction = 200, M = 16) p.add_items(data, ids) # Controlling the recall by setting ef: -p.set_ef(50) # ef should always be > k +p.set_ef_search_default(50) # ef should always be > k # Query dataset, k - number of 
the closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k = 1) @@ -72,7 +72,7 @@ p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -p.set_ef(10) +p.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores @@ -133,7 +133,7 @@ hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) +hnsw_index.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores @@ -185,7 +185,7 @@ hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) +hnsw_index.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example.py b/examples/python/example.py index a495f915..05907763 100644 --- a/examples/python/example.py +++ b/examples/python/example.py @@ -34,7 +34,7 @@ # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -p.set_ef(10) +p.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_filter.py b/examples/python/example_filter.py index 1c89be95..3552a624 100644 --- a/examples/python/example_filter.py +++ b/examples/python/example_filter.py @@ -27,7 +27,7 @@ # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) +hnsw_index.set_ef_search_default(10) # Set number of threads used during batch search/construction # By 
default using all available cores diff --git a/examples/python/example_replace_deleted.py b/examples/python/example_replace_deleted.py index 23bbc098..47a95186 100644 --- a/examples/python/example_replace_deleted.py +++ b/examples/python/example_replace_deleted.py @@ -36,7 +36,7 @@ # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) +hnsw_index.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/example_search.py b/examples/python/example_search.py index 8c67dbd7..da48aab3 100644 --- a/examples/python/example_search.py +++ b/examples/python/example_search.py @@ -24,7 +24,7 @@ p.add_items(data, ids) # Controlling the recall by setting ef: -p.set_ef(50) # ef should always be > k +p.set_ef_search_default(50) # ef should always be > k # Query dataset, k - number of the closest elements (returns 2 numpy arrays) labels, distances = p.knn_query(data, k=1) diff --git a/examples/python/example_serialization.py b/examples/python/example_serialization.py index cfa69641..f1e026ab 100644 --- a/examples/python/example_serialization.py +++ b/examples/python/example_serialization.py @@ -35,7 +35,7 @@ # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -p.set_ef(10) +p.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores diff --git a/examples/python/pyw_hnswlib.py b/examples/python/pyw_hnswlib.py index 4d9a73b0..0649e712 100644 --- a/examples/python/pyw_hnswlib.py +++ b/examples/python/pyw_hnswlib.py @@ -42,8 +42,8 @@ def add_items(self, data, ids=None): start += 1 self.index.add_items(data=data, ids=np.asarray(int_labels)) - def set_ef(self, ef): - self.index.set_ef(ef) + def set_ef_search_default(self, ef): + self.index.set_ef_search_default(ef) def load_index(self, path): 
self.index.load_index(path) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 0c7a9821..4447d14c 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -103,7 +103,7 @@ namespace hnswlib } std::priority_queue> - searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr) const + searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr, const std::optional ef_search = std::nullopt) const { assert(k <= cur_element_count); std::priority_queue> topResults; diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index cc767847..65bb7ba3 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -32,7 +32,7 @@ namespace hnswlib size_t maxM_{0}; size_t maxM0_{0}; size_t ef_construction_{0}; - size_t ef_{0}; + size_t ef_search_default_{0}; double mult_{0.0}, revSize_{0.0}; int maxlevel_{0}; @@ -119,6 +119,7 @@ namespace hnswlib size_t max_elements, size_t M = 16, size_t ef_construction = 200, + size_t ef_search_default = 10, size_t random_seed = 100, bool allow_replace_deleted = false, bool normalize = false, @@ -141,7 +142,7 @@ namespace hnswlib maxM_ = M_; maxM0_ = M_ * 2; ef_construction_ = std::max(ef_construction, M_); - ef_ = 10; + ef_search_default_ = ef_search_default; level_generator_.seed(random_seed); update_probability_generator_.seed(random_seed + 1); @@ -208,9 +209,9 @@ namespace hnswlib } }; - void setEf(size_t ef) + void setEfSearchDefault(size_t ef) { - ef_ = ef; + ef_search_default_ = ef; } inline std::mutex &getLabelOpMutex(labeltype label) const @@ -746,6 +747,7 @@ namespace hnswlib writeBinaryPOD(output, M_); writeBinaryPOD(output, mult_); writeBinaryPOD(output, ef_construction_); + writeBinaryPOD(output, ef_search_default_); output.write(data_level0_memory_, cur_element_count * size_data_per_element_); output.write(length_memory_, cur_element_count * sizeof(float)); @@ -1006,6 +1008,7 @@ namespace hnswlib readBinaryPOD(input_header, M_); readBinaryPOD(input_header, mult_); 
readBinaryPOD(input_header, ef_construction_); + readBinaryPOD(input_header, ef_search_default_); input_header.close(); data_size_ = s->get_data_size(); @@ -1064,7 +1067,6 @@ namespace hnswlib throw std::runtime_error("Not enough memory: loadPersistedIndex failed to allocate linklists"); element_levels_ = std::vector(max_elements_); revSize_ = 1.0 / mult_; - ef_ = 10; for (size_t i = 0; i < cur_element_count; i++) { label_lookup_[getExternalLabel(i)] = i; @@ -1130,6 +1132,7 @@ namespace hnswlib readBinaryPOD(input, M_); readBinaryPOD(input, mult_); readBinaryPOD(input, ef_construction_); + readBinaryPOD(input, ef_search_default_); data_size_ = s->get_data_size(); fstdistfunc_ = s->get_dist_func(); @@ -1715,7 +1718,7 @@ namespace hnswlib } std::priority_queue> - searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr) const + searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr, const std::optional ef_search = std::nullopt) const { std::priority_queue> result; if (cur_element_count == 0) @@ -1758,12 +1761,12 @@ namespace hnswlib if (num_deleted_) { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_, k), isIdAllowed); + currObj, query_data, std::max(ef_search.value_or(ef_search_default_), k), isIdAllowed); } else { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_, k), isIdAllowed); + currObj, query_data, std::max(ef_search.value_or(ef_search_default_), k), isIdAllowed); } while (top_candidates.size() > k) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 3e01016d..e7798294 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -180,7 +180,7 @@ namespace hnswlib virtual void addPoint(const void *datapoint, labeltype label, bool replace_deleted = false) = 0; virtual std::priority_queue> - searchKnn(const void *, size_t, BaseFilterFunctor *isIdAllowed = nullptr) const = 0; + searchKnn(const void *, size_t, BaseFilterFunctor *isIdAllowed = nullptr, const 
std::optional ef_search = std::nullopt) const = 0; // Return k nearest neighbor in the order of closer fist virtual std::vector> diff --git a/python_bindings/LazyIndex.py b/python_bindings/LazyIndex.py index 60c5b926..772bdd95 100644 --- a/python_bindings/LazyIndex.py +++ b/python_bindings/LazyIndex.py @@ -44,11 +44,11 @@ def resize_index(self, size): else: return super().resize_index(size) - def set_ef(self, ef): + def set_ef_search_default(self, ef): if self.max_elements == 0: self.init_ef_construction = ef return - super().set_ef(ef) + super().set_ef_search_default(ef) def get_max_elements(self): return self.max_elements diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index cad2f7b9..1e037d45 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -173,7 +173,7 @@ class Index std::string space_name; int dim; size_t seed; - size_t default_ef; + size_t ef_search_default{10}; /* default ef used for searches; initialised here because the constructor no longer assigns it and it is read before init_index() */ bool index_inited; bool ep_added; @@ -207,8 +207,6 @@ class Index ep_added = true; index_inited = false; num_threads_default = std::thread::hardware_concurrency(); - - default_ef = 10; } ~Index() @@ -222,6 +220,7 @@ class Index size_t maxElements, size_t M, size_t efConstruction, + size_t efSearchDefault, size_t random_seed, bool allow_replace_deleted, bool is_persistent_index, @@ -232,18 +231,19 @@ class Index throw std::runtime_error("The index is already initiated."); } cur_l = 0; - appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, random_seed, allow_replace_deleted, normalize, is_persistent_index, persistence_location); + appr_alg = new hnswlib::HierarchicalNSW(l2space, maxElements, M, efConstruction, efSearchDefault, random_seed, allow_replace_deleted, normalize, is_persistent_index, persistence_location); index_inited = true; ep_added = false; - appr_alg->ef_ = default_ef; + ef_search_default = efSearchDefault; + appr_alg->ef_search_default_ = efSearchDefault; seed = random_seed; } - void set_ef(size_t ef) + void 
set_ef_search_default(size_t ef_search_default) { - default_ef = ef; + this->ef_search_default = ef_search_default; /* the parameter shadows the member, so the member must be qualified; a bare assignment is a self-assignment no-op */ if (appr_alg) - appr_alg->ef_ = ef; + appr_alg->ef_search_default_ = ef_search_default; } void set_num_threads(int num_threads) @@ -456,7 +456,7 @@ class Index "M"_a = appr_alg->M_, "mult"_a = appr_alg->mult_, "ef_construction"_a = appr_alg->ef_construction_, - "ef"_a = appr_alg->ef_, + "ef_search_default"_a = appr_alg->ef_search_default_, "has_deletions"_a = (bool)appr_alg->num_deleted_, "size_links_per_element"_a = appr_alg->size_links_per_element_, "allow_replace_deleted"_a = appr_alg->allow_replace_deleted_, @@ -506,7 +506,7 @@ class Index "seed"_a = seed); if (index_inited == false) - return py::dict(**params, "ef"_a = default_ef); + return py::dict(**params, "ef_search_default"_a = ef_search_default); auto ann_params = getAnnData(); @@ -535,6 +535,7 @@ class Index d["max_elements"].cast(), d["M"].cast(), d["ef_construction"].cast(), + d["ef_search_default"].cast(), new_index->seed); new_index->cur_l = d["cur_element_count"].cast(); } @@ -542,7 +543,7 @@ class Index new_index->index_inited = index_inited_; new_index->ep_added = d["ep_added"].cast(); new_index->num_threads_default = d["num_threads"].cast(); - new_index->default_ef = d["ef"].cast(); + new_index->ef_search_default = d["ef_search_default"].cast(); if (index_inited_) new_index->setAnnData(d); @@ -577,7 +578,7 @@ class Index assert_true(appr_alg->mult_ == d["mult"].cast(), "Invalid value of mult_ "); assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); - appr_alg->ef_ = d["ef"].cast(); + appr_alg->ef_search_default_ = d["ef_search_default"].cast(); assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); @@ -665,7 +666,8 @@ class Index py::object input, size_t k = 1, int num_threads = -1, - const std::function &filter = nullptr) + const std::function &filter = 
nullptr, + const std::optional ef_search = std::nullopt) { py::array_t items(input); auto buffer = items.request(); @@ -701,7 +703,7 @@ class Index - (void*)items.data(row), k, p_idFilter); + (void*)items.data(row), k, p_idFilter, ef_search); /* forward the per-query ef override; without it the new knn_query(ef_search=...) argument has no effect */ if (result.size() != k) throw std::runtime_error( - "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); + "Cannot return the results in a contigious 2D array. Probably ef_search_default or M is too small"); for (int i = k - 1; i >= 0; i--) { auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -723,7 +725,7 @@ class Index - (void*)(norm_array.data() + start_idx), k, p_idFilter); + (void*)(norm_array.data() + start_idx), k, p_idFilter, ef_search); /* same forwarding on the normalized-input path */ if (result.size() != k) throw std::runtime_error( - "Cannot return the results in a contigious 2D array. Probably ef or M is too small"); + "Cannot return the results in a contigious 2D array. Probably ef_search_default or M is too small"); for (int i = k - 1; i >= 0; i--) { auto& result_tuple = result.top(); data_numpy_d[row * k + i] = result_tuple.first; @@ -900,7 +902,8 @@ class BFIndex py::object knnQuery_return_numpy( py::object input, size_t k = 1, - const std::function &filter = nullptr) + const std::function &filter = nullptr, + const std::optional ef_search = std::nullopt) { py::array_t items(input); auto buffer = items.request(); @@ -921,7 +924,7 @@ class BFIndex for (size_t row = 0; row < rows; row++) { std::priority_queue> result = alg->searchKnn( - (void *)items.data(row), k, p_idFilter); + (void *)items.data(row), k, p_idFilter, ef_search); for (int i = k - 1; i >= 0; i--) { auto &result_tuple = result.top(); @@ -966,6 +969,7 @@ PYBIND11_PLUGIN(hnswlib) py::arg("max_elements"), py::arg("M") = 16, py::arg("ef_construction") = 200, + py::arg("ef_search_default") = 10, py::arg("random_seed") = 100, py::arg("allow_replace_deleted") = false, py::arg("is_persistent_index") = false, @@ -975,7 +979,8 @@ PYBIND11_PLUGIN(hnswlib) py::arg("data"), py::arg("k") = 1, py::arg("num_threads") = -1, - py::arg("filter") = py::none()) + 
py::arg("filter") = py::none(), + py::arg("ef_search") = py::none()) .def("add_items", &Index::addItems, py::arg("data"), @@ -984,7 +989,7 @@ PYBIND11_PLUGIN(hnswlib) py::arg("replace_deleted") = false) .def("get_items", &Index::getDataReturnList, py::arg("ids") = py::none()) .def("get_ids_list", &Index::getIdsList) - .def("set_ef", &Index::set_ef, py::arg("ef")) + .def("set_ef_search_default", &Index::set_ef_search_default, py::arg("ef_search_default")) .def("set_num_threads", &Index::set_num_threads, py::arg("num_threads")) .def("save_index", &Index::saveIndex, py::arg("path_to_index")) .def("load_index", @@ -1004,12 +1009,12 @@ PYBIND11_PLUGIN(hnswlib) .def_readonly("space", &Index::space_name) .def_readonly("dim", &Index::dim) .def_readwrite("num_threads", &Index::num_threads_default) - .def_property("ef", [](const Index &index) - { return index.index_inited ? index.appr_alg->ef_ : index.default_ef; }, [](Index &index, const size_t ef_) + .def_property("ef_search_default", [](const Index &index) + { return index.index_inited ? index.appr_alg->ef_search_default_ : index.ef_search_default; }, [](Index &index, const size_t ef_search_default_) { - index.default_ef = ef_; + index.ef_search_default = ef_search_default_; if (index.appr_alg) - index.appr_alg->ef_ = ef_; }) + index.appr_alg->ef_search_default_ = ef_search_default_; }) .def_property_readonly("max_elements", [](const Index &index) { return index.index_inited ? 
index.appr_alg->max_elements_ : 0; }) .def_property_readonly("element_count", [](const Index &index) @@ -1036,7 +1041,7 @@ PYBIND11_PLUGIN(hnswlib) py::class_>(m, "BFIndex") .def(py::init(), py::arg("space"), py::arg("dim")) .def("init_index", &BFIndex::init_new_index, py::arg("max_elements")) - .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none()) + .def("knn_query", &BFIndex::knnQuery_return_numpy, py::arg("data"), py::arg("k") = 1, py::arg("filter") = py::none(), py::arg("ef_search") = py::none()) .def("add_items", &BFIndex::addItems, py::arg("data"), py::arg("ids") = py::none()) .def("delete_vector", &BFIndex::deleteVector, py::arg("label")) .def("save_index", &BFIndex::saveIndex, py::arg("path_to_index")) diff --git a/setup.py b/setup.py index cbade1bf..0746c2f9 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ from setuptools import Extension, setup from setuptools.command.build_ext import build_ext -__version__ = "0.7.4" +__version__ = "0.7.5" include_dirs = [ pybind11.get_include(), @@ -59,9 +59,11 @@ def has_flag(compiler, flagname): def cpp_flag(compiler): - """Return the -std=c++[11/14] compiler flag. - The c++14 is prefered over c++11 (when it is available). + """Return the -std=c++[11/14/17] compiler flag. + The newest supported standard is preferred: c++17, then c++14, then c++11. 
""" + if has_flag(compiler, "-std=c++17"): + return "-std=c++17" if has_flag(compiler, "-std=c++14"): return "-std=c++14" elif has_flag(compiler, "-std=c++11"): diff --git a/tests/cpp/sift_1b.cpp b/tests/cpp/sift_1b.cpp index 872f0fdd..43a0dfcc 100644 --- a/tests/cpp/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -235,9 +235,8 @@ test_vs_recall( { efs.push_back(i); } - for (size_t ef : efs) - { - appr_alg.setEf(ef); + for (size_t ef : efs) { + appr_alg.setEfSearchDefault(ef); StopW stopw = StopW(); float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k); diff --git a/tests/cpp/sift_test.cpp b/tests/cpp/sift_test.cpp index b81bdb04..326c35d7 100644 --- a/tests/cpp/sift_test.cpp +++ b/tests/cpp/sift_test.cpp @@ -146,9 +146,8 @@ void test_vs_recall( /*for (int i = 300; i <600; i += 20) { efs.push_back(i); }*/ - for (size_t ef : efs) - { - appr_alg.setEf(ef); + for (size_t ef : efs) { + appr_alg.setEfSearchDefault(ef); StopW stopw = StopW(); float recall = test_approx(massQ, vecsize, qsize, appr_alg, vecdim, answers, k); @@ -281,65 +280,65 @@ void sift_test() // appr_alg.opt = true; return; - // test_approx(mass, massQ, vecsize, qsize, appr_alg, vecdim, answers); - // //return; - // - // cout << appr_alg.maxlevel_ << "\n"; - // //CHECK: - // //for (size_t io = 0; io < vecsize; io++) { - // // if (appr_alg.getExternalLabel(io) != io) - // // throw new exception("bad!"); - // //} - // DISTFUNC fstdistfunc_ = l2space.get_dist_func(); - ////#pragma omp parallel for - // for (int i = 0; i < vecsize; i++) { - // int *data = (int *)(appr_alg.linkList0_ + i * appr_alg.size_links_per_element0_); - // //cout << "numconn:" << *data<<"\n"; - // tableint *datal = (tableint *)(data + 1); - // - // std::priority_queue< std::pair< float, tableint >> rez; - // unordered_set g; - // for (int j = 0; j < *data; j++) { - // g.insert(datal[j]); - // } - // appr_alg.setEf(400); - // std::priority_queue< std::pair< float, tableint >> closest_elements = 
appr_alg.searchKnnInternal(appr_alg.getDataByInternalId(i), 17); - // while (closest_elements.size() > 0) { - // if (closest_elements.top().second != i) { - // g.insert(closest_elements.top().second); - // } - // closest_elements.pop(); - // } - // - // for (tableint l : g) { - // float other = fstdistfunc_(appr_alg.getDataByInternalId(l), appr_alg.getDataByInternalId(i), l2space.get_dist_func_param()); - // rez.emplace(other, l); - // } - // while (rez.size() > 32) - // rez.pop(); - // int len = rez.size(); - // *data = len; - // // check there are no loop connections created - // for (int j = 0; j < len; j++) { - // datal[j] = rez.top().second; - // if (datal[j] == i) - // throw new exception(); - // rez.pop(); - // } - // - // } - // - // //get_knn_quality(massA, vecsize, maxn, appr_alg); - // test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k); - // /*test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k); - // test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k); - // test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k);*/ - // - // - // - // - // - // /*for(int i=0;i<1000;i++) - // cout << mass[i] << "\n";*/ - // //("11", std::ios::binary); + //test_approx(mass, massQ, vecsize, qsize, appr_alg, vecdim, answers); +// //return; +// +// cout << appr_alg.maxlevel_ << "\n"; +// //CHECK: +// //for (size_t io = 0; io < vecsize; io++) { +// // if (appr_alg.getExternalLabel(io) != io) +// // throw new exception("bad!"); +// //} +// DISTFUNC fstdistfunc_ = l2space.get_dist_func(); +////#pragma omp parallel for +// for (int i = 0; i < vecsize; i++) { +// int *data = (int *)(appr_alg.linkList0_ + i * appr_alg.size_links_per_element0_); +// //cout << "numconn:" << *data<<"\n"; +// tableint *datal = (tableint *)(data + 1); +// +// std::priority_queue< std::pair< float, tableint >> rez; +// unordered_set g; +// for (int j = 0; j < *data; j++) { +// g.insert(datal[j]); +// } +// appr_alg.setEfSearchDefault(400); 
+// std::priority_queue< std::pair< float, tableint >> closest_elements = appr_alg.searchKnnInternal(appr_alg.getDataByInternalId(i), 17); +// while (closest_elements.size() > 0) { +// if (closest_elements.top().second != i) { +// g.insert(closest_elements.top().second); +// } +// closest_elements.pop(); +// } +// +// for (tableint l : g) { +// float other = fstdistfunc_(appr_alg.getDataByInternalId(l), appr_alg.getDataByInternalId(i), l2space.get_dist_func_param()); +// rez.emplace(other, l); +// } +// while (rez.size() > 32) +// rez.pop(); +// int len = rez.size(); +// *data = len; +// // check there are no loop connections created +// for (int j = 0; j < len; j++) { +// datal[j] = rez.top().second; +// if (datal[j] == i) +// throw new exception(); +// rez.pop(); +// } +// +// } +// +// //get_knn_quality(massA, vecsize, maxn, appr_alg); +// test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k); +// /*test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k); +// test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k); +// test_vs_recall( massQ, vecsize, qsize, appr_alg, vecdim, answers, k);*/ +// +// +// +// +// +// /*for(int i=0;i<1000;i++) +// cout << mass[i] << "\n";*/ +// //("11", std::ios::binary); } diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp index 961dfefd..0b7bccec 100644 --- a/tests/cpp/updates_test.cpp +++ b/tests/cpp/updates_test.cpp @@ -167,9 +167,8 @@ test_vs_recall( std::cout << "ef\trecall\ttime\thops\tdistcomp\n"; bool test_passed = false; - for (size_t ef : efs) - { - appr_alg.setEf(ef); + for (size_t ef : efs) { + appr_alg.setEfSearchDefault(ef); appr_alg.metric_hops = 0; appr_alg.metric_distance_computations = 0; diff --git a/tests/python/bindings_test.py b/tests/python/bindings_test.py index 011af3f2..fcfaf2c2 100644 --- a/tests/python/bindings_test.py +++ b/tests/python/bindings_test.py @@ -29,7 +29,7 @@ def testRandomSelf(self): # Controlling the recall by setting ef: # higher 
ef leads to better accuracy, but slower search - p.set_ef(10) + p.set_ef_search_default(10) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_filter.py b/tests/python/bindings_test_filter.py index a9a1c864..48bf251f 100644 --- a/tests/python/bindings_test_filter.py +++ b/tests/python/bindings_test_filter.py @@ -33,7 +33,7 @@ def testRandomSelf(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - hnsw_index.set_ef(10) + hnsw_index.set_ef_search_default(10) hnsw_index.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_getdata.py b/tests/python/bindings_test_getdata.py index c89c7deb..26493147 100644 --- a/tests/python/bindings_test_getdata.py +++ b/tests/python/bindings_test_getdata.py @@ -34,7 +34,7 @@ def testGettingItems(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - p.set_ef(100) + p.set_ef_search_default(100) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_labels.py b/tests/python/bindings_test_labels.py index 505c2542..30d69ee6 100644 --- a/tests/python/bindings_test_labels.py +++ b/tests/python/bindings_test_labels.py @@ -35,7 +35,7 @@ def testRandomSelf(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - p.set_ef(100) + p.set_ef_search_default(100) p.set_num_threads(4) # by default using all available cores @@ -77,7 +77,7 @@ def testRandomSelf(self): print("\nLoading index from '%s'\n" % index_path) p.load_index(index_path) - p.set_ef(100) + p.set_ef_search_default(100) print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) @@ -126,7 +126,7 @@ def testRandomSelf(self): p.save_index(del_index_path) p = hnswlib.Index(space="l2", dim=dim) p.load_index(del_index_path) - p.set_ef(100) + p.set_ef_search_default(100) labels1_after, _ 
= p.knn_query(data1, k=1) for la in labels1_after: diff --git a/tests/python/bindings_test_metadata.py b/tests/python/bindings_test_metadata.py index c2d494f2..3e234c2a 100644 --- a/tests/python/bindings_test_metadata.py +++ b/tests/python/bindings_test_metadata.py @@ -28,7 +28,7 @@ def testMetadata(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - p.set_ef(100) + p.set_ef_search_default(100) p.set_num_threads(4) # by default using all available cores diff --git a/tests/python/bindings_test_pickle.py b/tests/python/bindings_test_pickle.py index 039de859..a88e12a8 100644 --- a/tests/python/bindings_test_pickle.py +++ b/tests/python/bindings_test_pickle.py @@ -98,8 +98,8 @@ def test_space_main(self, space, dim): max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M ) - p.ef = self.ef - p0.ef = self.ef + p.ef_search_default = self.ef_search_default + p0.ef_search_default = self.ef_search_default p1 = pickle.loads(pickle.dumps(p)) # pickle Index before adding items @@ -171,10 +171,18 @@ def test_space_main(self, space, dim): ) # Check ef parameter value - self.assertEqual(p.ef, self.ef, "incorrect value of p.ef") - self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef") - self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef") - self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef") + self.assertEqual( + p.ef_search_default, self.ef_search_default, "incorrect value of p.ef" + ) + self.assertEqual( + p0.ef_search_default, self.ef_search_default, "incorrect value of p0.ef" + ) + self.assertEqual( + p2.ef_search_default, self.ef_search_default, "incorrect value of p2.ef" + ) + self.assertEqual( + p1.ef_search_default, self.ef_search_default, "incorrect value of p1.ef" + ) # Check M parameter value self.assertEqual(p.M, self.M, "incorrect value of p.M") @@ -207,7 +215,7 @@ class PickleUnitTests(unittest.TestCase): def setUp(self): self.ef_construction = 200 self.M = 32 - self.ef 
= 400 + self.ef_search_default = 400 self.num_elements = 1000 self.num_test_elements = 100 diff --git a/tests/python/bindings_test_recall.py b/tests/python/bindings_test_recall.py index 04ca093c..3b59f8f6 100644 --- a/tests/python/bindings_test_recall.py +++ b/tests/python/bindings_test_recall.py @@ -38,7 +38,7 @@ def testRandomSelf(self): # Controlling the recall for hnsw by setting ef: # higher ef leads to better accuracy, but slower search - hnsw_index.set_ef(200) + hnsw_index.set_ef_search_default(200) # Set number of threads used during batch search/construction in hnsw # By default using all available cores diff --git a/tests/python/bindings_test_replace.py b/tests/python/bindings_test_replace.py index f4bdd9f8..b2619367 100644 --- a/tests/python/bindings_test_replace.py +++ b/tests/python/bindings_test_replace.py @@ -51,7 +51,7 @@ def testRandomSelf(self): allow_replace_deleted=True, ) - hnsw_index.set_ef(100) + hnsw_index.set_ef_search_default(100) hnsw_index.set_num_threads(4) # Add batch 1 and 2 @@ -213,9 +213,9 @@ def test_recall_degradation(self): bf_index = hnswlib.BFIndex(space="l2", dim=dim) bf_index.init_index(max_elements=max_num_elements) - hnsw_index_no_replace.set_ef(100) + hnsw_index_no_replace.set_ef_search_default(100) hnsw_index_no_replace.set_num_threads(50) - hnsw_index_with_replace.set_ef(100) + hnsw_index_with_replace.set_ef_search_default(100) hnsw_index_with_replace.set_num_threads(50) # Add data diff --git a/tests/python/bindings_test_resize.py b/tests/python/bindings_test_resize.py index 2f1fbfbc..75b5ed16 100644 --- a/tests/python/bindings_test_resize.py +++ b/tests/python/bindings_test_resize.py @@ -34,7 +34,7 @@ def testRandomSelf(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - p.set_ef(20) + p.set_ef_search_default(20) p.set_num_threads(idx % 8) # by default using all available cores diff --git a/tests/python/bindings_test_spaces.py b/tests/python/bindings_test_spaces.py 
index 5e296569..f6cd501a 100644 --- a/tests/python/bindings_test_spaces.py +++ b/tests/python/bindings_test_spaces.py @@ -36,7 +36,7 @@ def testRandomSelf(self): p = hnswlib.Index(space=space, dim=dim) p.init_index(max_elements=5, ef_construction=100, M=16) - p.set_ef(10) + p.set_ef_search_default(10) p.add_items(data2) diff --git a/tests/python/bindings_test_stress_mt_replace.py b/tests/python/bindings_test_stress_mt_replace.py index 5fbc4a2b..81e3f688 100644 --- a/tests/python/bindings_test_stress_mt_replace.py +++ b/tests/python/bindings_test_stress_mt_replace.py @@ -38,7 +38,7 @@ def testRandomSelf(self): allow_replace_deleted=True, ) - hnsw_index.set_ef(100) + hnsw_index.set_ef_search_default(100) hnsw_index.set_num_threads(50) # Add batch 1 and 2 diff --git a/tests/python/speedtest.py b/tests/python/speedtest.py index 8d16cfc3..904694a0 100644 --- a/tests/python/speedtest.py +++ b/tests/python/speedtest.py @@ -6,13 +6,13 @@ # Use nargs to specify how many arguments an option should take. 
ap = argparse.ArgumentParser() -ap.add_argument('-d') -ap.add_argument('-n') -ap.add_argument('-t') +ap.add_argument("-d") +ap.add_argument("-n") +ap.add_argument("-t") args = ap.parse_args() dim = int(args.d) name = args.n -threads=int(args.t) +threads = int(args.t) num_elements = 400000 # Generating sample data @@ -22,7 +22,7 @@ # index_path=f'speed_index{dim}.bin' # Declaring index -p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip +p = hnswlib.Index(space="l2", dim=dim) # possible options are l2, cosine or ip # if not os.path.isfile(index_path) : @@ -30,36 +30,35 @@ # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search -p.set_ef(10) +p.set_ef_search_default(10) # Set number of threads used during batch search/construction # By default using all available cores p.set_num_threads(64) -t0=time.time() +t0 = time.time() p.add_items(data) -construction_time=time.time()-t0 +construction_time = time.time() - t0 # Serializing and deleting the index: # print("Saving index to '%s'" % index_path) # p.save_index(index_path) p.set_num_threads(threads) -times=[] +times = [] time.sleep(1) -p.set_ef(15) +p.set_ef_search_default(15) for _ in range(1): # p.load_index(index_path) for _ in range(3): - t0=time.time() - qdata=data[:5000*threads] + t0 = time.time() + qdata = data[: 5000 * threads] labels, distances = p.knn_query(qdata, k=1) - tt=time.time()-t0 + tt = time.time() - t0 times.append(tt) - recall=np.sum(labels.reshape(-1)==np.arange(len(qdata)))/len(qdata) - print(f"{tt} seconds, recall= {recall}") - -str_out=f"{np.mean(times)}, {np.median(times)}, {np.std(times)}, {construction_time}, {recall}, {name}" + recall = np.sum(labels.reshape(-1) == np.arange(len(qdata))) / len(qdata) + print(f"{tt} seconds, recall= {recall}") + +str_out = f"{np.mean(times)}, {np.median(times)}, {np.std(times)}, {construction_time}, {recall}, {name}" print(str_out) -with open (f"log2_{dim}_t{threads}.txt","a") as f: - 
f.write(str_out+"\n") +with open(f"log2_{dim}_t{threads}.txt", "a") as f: + f.write(str_out + "\n") f.flush() - From 6dbfbc06153f3dc88b06bcc19c47724570b4945a Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:07:10 -0700 Subject: [PATCH 02/15] Concurrency --- hnswlib/hnswalg.h | 25 ++++++++++++++++++------- setup.py | 4 ++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 65bb7ba3..adc8d211 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace hnswlib { @@ -44,6 +45,7 @@ namespace hnswlib std::mutex global; std::vector link_list_locks_; + std::shared_mutex ef_search_default_lock_; tableint enterpoint_node_{0}; @@ -209,9 +211,10 @@ namespace hnswlib } }; - void setEfSearchDefault(size_t ef) + void setEfSearchDefault(size_t ef_search_default) { - ef_search_default_ = ef; + std::unique_lock lock(ef_search_default_lock_); + ef_search_default_ = ef_search_default; } inline std::mutex &getLabelOpMutex(labeltype label) const @@ -375,7 +378,7 @@ namespace hnswlib template std::priority_queue, std::vector>, CompareByFirst> - searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, BaseFilterFunctor *isIdAllowed = nullptr) const + searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef_search_default, BaseFilterFunctor *isIdAllowed = nullptr) const { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; @@ -405,7 +408,7 @@ namespace hnswlib std::pair current_node_pair = candidate_set.top(); if ((-current_node_pair.first) > lowerBound && - (top_candidates.size() == ef || (!isIdAllowed && !has_deletions))) + (top_candidates.size() == ef_search_default || (!isIdAllowed && !has_deletions))) { break; } @@ -444,7 +447,7 @@ namespace hnswlib char *currObj1 = (getDataByInternalId(candidate_id)); dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_); - if 
(top_candidates.size() < ef || lowerBound > dist) + if (top_candidates.size() < ef_search_default || lowerBound > dist) { candidate_set.emplace(-dist, candidate_id); #ifdef USE_SSE @@ -456,7 +459,7 @@ namespace hnswlib if ((!has_deletions || !isMarkedDeleted(candidate_id)) && ((!isIdAllowed) || (*isIdAllowed)(getExternalLabel(candidate_id)))) top_candidates.emplace(dist, candidate_id); - if (top_candidates.size() > ef) + if (top_candidates.size() > ef_search_default) top_candidates.pop(); if (!top_candidates.empty()) @@ -1720,7 +1723,15 @@ namespace hnswlib std::priority_queue> searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr, const std::optional ef_search = std::nullopt) const { - std::priority_queue> result; + if !ef_search + .has_value() + { + std::shared_lock lock(ef_search_default_lock_); + ef_search = ef_search_default_; + } + + std::priority_queue> + result; if (cur_element_count == 0) return result; diff --git a/setup.py b/setup.py index 0746c2f9..8ecb30e8 100644 --- a/setup.py +++ b/setup.py @@ -93,8 +93,8 @@ class BuildExt(build_ext): if sys.platform == "darwin": if platform.machine() == "arm64": c_opts["unix"].remove("-march=native") - c_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.7"] - link_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.7"] + c_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.12"] + link_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.12"] else: c_opts["unix"].append("-fopenmp") link_opts["unix"].extend(["-fopenmp", "-pthread"]) From c9170593a015732963a8400ef42531404ba8fa39 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:17:12 -0700 Subject: [PATCH 03/15] No need to check value --- hnswlib/hnswalg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index adc8d211..300add0b 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -1772,12 +1772,12 @@ namespace hnswlib if 
(num_deleted_) { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_search.value_or(ef_search_default_), k), isIdAllowed); + currObj, query_data, std::max(ef_search, k), isIdAllowed); } else { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_search.value_or(ef_search_default_), k), isIdAllowed); + currObj, query_data, std::max(ef_search, k), isIdAllowed); } while (top_candidates.size() > k) From 600b08c03c8ac447c825b4369d9176e3720737aa Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:34:32 -0700 Subject: [PATCH 04/15] More concurrency --- hnswlib/hnswalg.h | 20 +++++++++++--------- setup.py | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 300add0b..4da17098 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace hnswlib { @@ -45,7 +46,7 @@ namespace hnswlib std::mutex global; std::vector link_list_locks_; - std::shared_mutex ef_search_default_lock_; + mutable std::shared_mutex ef_search_default_lock_; tableint enterpoint_node_{0}; @@ -1723,13 +1724,14 @@ namespace hnswlib std::priority_queue> searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr, const std::optional ef_search = std::nullopt) const { - if !ef_search - .has_value() - { - std::shared_lock lock(ef_search_default_lock_); - ef_search = ef_search_default_; - } + size_t get_ef_search_default() + { + std::shared_lock lock(ef_search_default_lock_); + return ef_search_default_; + } + + const std::size_t this_ef_search = ef_search.has_value() ? 
ef_search.value() : get_ef_search_default(); std::priority_queue> result; if (cur_element_count == 0) @@ -1772,12 +1774,12 @@ namespace hnswlib if (num_deleted_) { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_search, k), isIdAllowed); + currObj, query_data, std::max(this_ef_search, k), isIdAllowed); } else { top_candidates = searchBaseLayerST( - currObj, query_data, std::max(ef_search, k), isIdAllowed); + currObj, query_data, std::max(this_ef_search, k), isIdAllowed); } while (top_candidates.size() > k) diff --git a/setup.py b/setup.py index 8ecb30e8..99a9ded2 100644 --- a/setup.py +++ b/setup.py @@ -93,8 +93,8 @@ class BuildExt(build_ext): if sys.platform == "darwin": if platform.machine() == "arm64": c_opts["unix"].remove("-march=native") - c_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.12"] - link_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.12"] + c_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.13"] + link_opts["unix"] += ["-stdlib=libc++", "-mmacosx-version-min=10.13"] else: c_opts["unix"].append("-fopenmp") link_opts["unix"].extend(["-fopenmp", "-pthread"]) From ea3c2efe19721a9814bc204469b8e5d4896f7a2e Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:37:09 -0700 Subject: [PATCH 05/15] lambnda --- hnswlib/hnswalg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 4da17098..7b628fa4 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -1725,11 +1725,11 @@ namespace hnswlib searchKnn(const void *query_data, size_t k, BaseFilterFunctor *isIdAllowed = nullptr, const std::optional ef_search = std::nullopt) const { - size_t get_ef_search_default() + auto get_ef_search_default = [this]() { std::shared_lock lock(ef_search_default_lock_); return ef_search_default_; - } + }; const std::size_t this_ef_search = ef_search.has_value() ? 
ef_search.value() : get_ef_search_default(); std::priority_queue> From 1a6ab0429eea28afba0e49113c283e280afd9c6a Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:40:52 -0700 Subject: [PATCH 06/15] headers --- hnswlib/hnswlib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index e7798294..59e31a91 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -122,7 +122,7 @@ static bool AVX512Capable() #include #include #include - +#include namespace hnswlib { typedef size_t labeltype; From e7fbc6ba3ac4f846b972dabffe22f445dafabca3 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:42:14 -0700 Subject: [PATCH 07/15] example --- examples/python/EXAMPLES.md | 2 +- examples/python/example_search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/python/EXAMPLES.md b/examples/python/EXAMPLES.md index 2b9b7005..f4e42105 100644 --- a/examples/python/EXAMPLES.md +++ b/examples/python/EXAMPLES.md @@ -37,7 +37,7 @@ p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") -print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") +print(f"Search speed/quality trade-off parameter: ef={p_copy_search_default}") ``` An example with updates after serialization/deserialization: diff --git a/examples/python/example_search.py b/examples/python/example_search.py index da48aab3..fe9cc30c 100644 --- a/examples/python/example_search.py +++ b/examples/python/example_search.py @@ -42,4 +42,4 @@ print( f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}" ) -print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") +print(f"Search speed/quality trade-off 
parameter: ef={p_copy.ef_search_default}") From 292798959c449a8bdc190372c5b5c623e9b1253e Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 17:58:07 -0700 Subject: [PATCH 08/15] Tests, headers --- python_bindings/bindings.cpp | 1 + tests/cpp/persistent_test.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 1e037d45..fbbb9399 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace py = pybind11; using namespace pybind11::literals; // needed to bring in _a literal diff --git a/tests/cpp/persistent_test.cpp b/tests/cpp/persistent_test.cpp index 666999b5..7ed569b5 100644 --- a/tests/cpp/persistent_test.cpp +++ b/tests/cpp/persistent_test.cpp @@ -34,7 +34,7 @@ namespace } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 100, false, false, true, "."); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 10, 100, false, false, true, "."); for (size_t i = 0; i < n; i++) { @@ -105,7 +105,7 @@ namespace } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n / 4, 16, 200, 100, false, false, true, "."); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n / 4, 16, 200, 10, 100, false, false, true, "."); // Add a quarter of the data for (size_t i = 0; i < n / 4; i++) @@ -189,7 +189,7 @@ namespace } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n, 16, 200, 100, false, false, true, "."); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n, 16, 200, 10, 100, false, false, true, "."); for (size_t i = 0; i < n; i++) { @@ -256,7 +256,7 @@ namespace } hnswlib::InnerProductSpace space(d); - 
hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n, 16, 200, 100, false, false, true, "."); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n, 16, 200, 10, 100, false, false, true, "."); for (size_t i = 0; i < n; i++) { From dbe6344f844199c60e4578275ba3e4b91e295f08 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 18:05:06 -0700 Subject: [PATCH 09/15] Pass c++ version to windows compiler --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 99a9ded2..931c4902 100644 --- a/setup.py +++ b/setup.py @@ -109,6 +109,7 @@ def build_extensions(self): opts.append("-fvisibility=hidden") elif ct == "msvc": opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) + opts.append(cpp_flag(self.compiler)) for ext in self.extensions: ext.extra_compile_args.extend(opts) From 118528edb23aed00f1439a6a5d90e3c1c5860f4d Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 18:07:27 -0700 Subject: [PATCH 10/15] windows, please --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 931c4902..64171076 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def build_extensions(self): opts.append("-fvisibility=hidden") elif ct == "msvc": opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - opts.append(cpp_flag(self.compiler)) + opts.append("/std::c++17") for ext in self.extensions: ext.extra_compile_args.extend(opts) From 58c3df0a7387bc94bd96a425ef6b6a11b27d6ec3 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 18:10:48 -0700 Subject: [PATCH 11/15] windows... 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 64171076..b89243bb 100644 --- a/setup.py +++ b/setup.py @@ -109,7 +109,7 @@ def build_extensions(self): opts.append("-fvisibility=hidden") elif ct == "msvc": opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version()) - opts.append("/std::c++17") + opts.append("/std:c++latest") for ext in self.extensions: ext.extra_compile_args.extend(opts) From c608297e1b56c3815f1ad7403cb09b7d98fe5266 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 18:19:24 -0700 Subject: [PATCH 12/15] positional args will be the death of me --- examples/cpp/EXAMPLES.md | 3 ++- examples/cpp/example_mt_replace_deleted.cpp | 15 ++++++++------- examples/cpp/example_replace_deleted.cpp | 3 ++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/examples/cpp/EXAMPLES.md b/examples/cpp/EXAMPLES.md index 3af603d4..c92be673 100644 --- a/examples/cpp/EXAMPLES.md +++ b/examples/cpp/EXAMPLES.md @@ -134,10 +134,11 @@ int main() { int M = 16; // Tightly connected with internal dimensionality of the data // strongly affects the memory consumption int ef_construction = 200; // Controls index search speed/build speed tradeoff + int ef_search_default = 10 // Controls index search speed tradeoff // Initing index hnswlib::L2Space space(dim); - hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction, 100, true); + hnswlib::HierarchicalNSW* alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction, ef_search_default, 100, true); // Generate random data std::mt19937 rng; diff --git a/examples/cpp/example_mt_replace_deleted.cpp b/examples/cpp/example_mt_replace_deleted.cpp index 0b2aa2ad..99a146a7 100644 --- a/examples/cpp/example_mt_replace_deleted.cpp +++ b/examples/cpp/example_mt_replace_deleted.cpp @@ -69,17 +69,18 @@ inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn int 
main() { - int dim = 16; // Dimension of the elements - int max_elements = 10000; // Maximum number of elements, should be known beforehand - int M = 16; // Tightly connected with internal dimensionality of the data - // strongly affects the memory consumption - int ef_construction = 200; // Controls index search speed/build speed tradeoff - int num_threads = 20; // Number of threads for operations with index + int dim = 16; // Dimension of the elements + int max_elements = 10000; // Maximum number of elements, should be known beforehand + int M = 16; // Tightly connected with internal dimensionality of the data + // strongly affects the memory consumption + int ef_construction = 200; // Controls index search speed/build speed tradeoff + int ef_search_default = 10; // Controls index search speed/recall tradeoff + int num_threads = 20; // Number of threads for operations with index // Initing index with allow_replace_deleted=true int seed = 100; hnswlib::L2Space space(dim); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction, seed, true); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction, ef_search_default, seed, true); // Generate random data std::mt19937 rng; diff --git a/examples/cpp/example_replace_deleted.cpp b/examples/cpp/example_replace_deleted.cpp index 23374d17..781238ab 100644 --- a/examples/cpp/example_replace_deleted.cpp +++ b/examples/cpp/example_replace_deleted.cpp @@ -7,11 +7,12 @@ int main() int M = 16; // Tightly connected with internal dimensionality of the data // strongly affects the memory consumption int ef_construction = 200; // Controls index search speed/build speed tradeoff + int ef_search_default = 10; // Initing index with allow_replace_deleted=true int seed = 100; hnswlib::L2Space space(dim); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction, seed, true); + 
hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, M, ef_construction, ef_search_default, seed, true); // Generate random data std::mt19937 rng; From 2662d5f187ecd20221b77a76817ef78ad697f786 Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 18:35:15 -0700 Subject: [PATCH 13/15] Windows... --- python_bindings/bindings.cpp | 2 +- tests/cpp/sift_1b.cpp | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index fbbb9399..d69be13a 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -579,7 +579,7 @@ class Index assert_true(appr_alg->mult_ == d["mult"].cast(), "Invalid value of mult_ "); assert_true(appr_alg->ef_construction_ == d["ef_construction"].cast(), "Invalid value of ef_construction_ "); - appr_alg->ef_search_default_ = d["ef_search_default"].cast(); + assert_true(appr_alg->ef_search_default_ = d["ef_search_default"].cast(), "Invalid value of ef_search_default_ "); assert_true(appr_alg->size_links_per_element_ == d["size_links_per_element"].cast(), "Invalid value of size_links_per_element_ "); diff --git a/tests/cpp/sift_1b.cpp b/tests/cpp/sift_1b.cpp index 43a0dfcc..7a15c15b 100644 --- a/tests/cpp/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -39,6 +39,9 @@ class StopW */ #if defined(_WIN32) +#define NOMINMAX +#define WIN32_LEAN_AND_MEAN +#define _ENABLE_EXTENDED_ALIGNED_STORAGE #include #include @@ -235,7 +238,8 @@ test_vs_recall( { efs.push_back(i); } - for (size_t ef : efs) { + for (size_t ef : efs) + { appr_alg.setEfSearchDefault(ef); StopW stopw = StopW(); From 6a3b38bc47735166b37297208bcc8206b82076fd Mon Sep 17 00:00:00 2001 From: atroyn Date: Thu, 27 Jun 2024 18:45:31 -0700 Subject: [PATCH 14/15] Tests --- tests/cpp/getUnormalized_test.cpp | 8 ++++---- tests/cpp/multiThread_replace_test.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/cpp/getUnormalized_test.cpp 
b/tests/cpp/getUnormalized_test.cpp index bdca679c..6f925006 100644 --- a/tests/cpp/getUnormalized_test.cpp +++ b/tests/cpp/getUnormalized_test.cpp @@ -29,7 +29,7 @@ namespace } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 100, false, true); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 10, 100, false, true); for (size_t i = 0; i < n; i++) { @@ -69,7 +69,7 @@ namespace } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 100, false, true); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 10, 100, false, true); for (size_t i = 0; i < n; i++) { @@ -113,7 +113,7 @@ namespace } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 100, false, true); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, 2 * n, 16, 200, 10, 100, false, true); for (size_t i = 0; i < n; i++) { @@ -179,7 +179,7 @@ void testResizeUnormalizedData() } hnswlib::InnerProductSpace space(d); - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n, 16, 200, 100, false, true); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, n, 16, 200, 10, 100, false, true); for (size_t i = 0; i < n; i++) { diff --git a/tests/cpp/multiThread_replace_test.cpp b/tests/cpp/multiThread_replace_test.cpp index c935f758..dbb0d1f3 100644 --- a/tests/cpp/multiThread_replace_test.cpp +++ b/tests/cpp/multiThread_replace_test.cpp @@ -102,7 +102,7 @@ int main() int iter = 0; while (iter < 200) { - hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, 16, 200, 123, true); + hnswlib::HierarchicalNSW *alg_hnsw = new hnswlib::HierarchicalNSW(&space, max_elements, 16, 200, 100, 123, true); // add batch1 data 
ParallelFor(0, max_elements, num_threads, [&](size_t row, size_t threadId) From 0bc3017f9708c3c9850c21f99ab6527aaae95509 Mon Sep 17 00:00:00 2001 From: atroyn Date: Fri, 28 Jun 2024 12:05:35 -0700 Subject: [PATCH 15/15] Typo --- examples/python/EXAMPLES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/python/EXAMPLES.md b/examples/python/EXAMPLES.md index f4e42105..5643f1f9 100644 --- a/examples/python/EXAMPLES.md +++ b/examples/python/EXAMPLES.md @@ -37,7 +37,7 @@ p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") -print(f"Search speed/quality trade-off parameter: ef={p_copy_search_default}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef_search_default}") ``` An example with updates after serialization/deserialization: