From 38c4a1783e20149aefd293c3dc438898f10ab4a0 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Mon, 6 Nov 2023 10:46:44 -0500 Subject: [PATCH] The final candidate of the version that uses Faiss 1.7.4 Signed-off-by: Alexandr Guzhva --- .github/workflows/ut.yaml | 2 +- CMakeLists.txt | 12 + cmake/libs/libfaiss.cmake | 4 + conanfile.py | 5 + include/knowhere/bitsetview_idselector.h | 33 + include/knowhere/comp/index_param.h | 1 + src/common/comp/brute_force.cc | 50 +- src/common/config.cc | 1 + src/index/flat/flat.cc | 33 +- src/index/ivf/ivf.cc | 223 +- src/index/ivf/ivf_config.h | 5 + src/simd/distances_avx.cc | 150 +- src/simd/distances_avx.h | 11 + src/simd/distances_avx512.cc | 179 +- src/simd/distances_avx512.h | 11 + src/simd/distances_ref.cc | 122 +- src/simd/distances_ref.h | 32 + src/simd/hook.cc | 24 +- src/simd/hook.h | 45 + tests/faiss/CMakeLists.txt | 67 + .../cmake/utils/platform_check.cmake | 12 + tests/faiss_isolated/cmake/utils/utils.cmake | 60 + tests/ut/test_ivfflat_cc.cc | 4 +- thirdparty/faiss/.circleci/Dockerfile.cpu | 11 - .../faiss/.circleci/Dockerfile.faiss_gpu | 28 - thirdparty/faiss/.circleci/config.yml | 647 ++--- thirdparty/faiss/CHANGELOG.md | 56 +- thirdparty/faiss/CMakeLists.txt | 47 +- thirdparty/faiss/CONTRIBUTING.md | 2 +- thirdparty/faiss/Doxyfile | 2 +- thirdparty/faiss/INSTALL.md | 40 +- thirdparty/faiss/README.md | 35 +- thirdparty/faiss/benchs/CMakeLists.txt | 11 + thirdparty/faiss/benchs/README.md | 39 +- thirdparty/faiss/benchs/bench_6bit_codec.cpp | 39 +- .../faiss/benchs/bench_all_ivf/README.md | 2 +- .../benchs/bench_all_ivf/bench_all_ivf.py | 20 +- .../benchs/bench_all_ivf/cmp_with_scann.py | 66 +- .../{datasets.py => datasets_oss.py} | 1 - .../faiss/benchs/bench_big_batch_ivf.py | 109 + .../benchs/bench_cppcontrib_sa_decode.cpp | 1734 +++++++++++++ thirdparty/faiss/benchs/bench_gpu_1bn.py | 2 +- thirdparty/faiss/benchs/bench_gpu_sift1m.py | 3 +- .../faiss/benchs/bench_hamming_computer.cpp | 222 ++ thirdparty/faiss/benchs/bench_hamming_knn.py | 29 + thirdparty/faiss/benchs/bench_hnsw.py | 2 +- .../faiss/benchs/bench_hybrid_cpu_gpu.py | 606 +++++ thirdparty/faiss/benchs/bench_ivf_fastscan.py | 112 + .../benchs/bench_ivf_fastscan_single_query.py | 122 + .../faiss/benchs/bench_ivf_selector.cpp | 145 ++ .../faiss/benchs/bench_polysemous_1bn.py | 2 +- .../bench_pq_transposed_centroid_table.py | 136 ++ thirdparty/faiss/benchs/bench_quantizer.py | 35 +- .../faiss/benchs/distributed_ondisk/README.md | 147 +- .../distributed_ondisk/distributed_kmeans.py | 201 +- .../distributed_ondisk/make_index_vslice.py | 2 +- .../distributed_ondisk/merge_to_ondisk.py | 2 +- .../faiss/benchs/distributed_ondisk/rpc.py | 252 -- .../distributed_ondisk/search_server.py | 9 +- .../faiss/benchs/link_and_code/README.md | 16 +- .../link_and_code/bench_link_and_code.py | 3 - thirdparty/faiss/conda/Dockerfile.cpu | 19 - thirdparty/faiss/conda/Dockerfile.cuda10.2 | 18 - thirdparty/faiss/conda/Dockerfile.cuda11.3 | 18 - .../faiss/conda/conda_build_config.yaml | 8 +- .../faiss/conda/faiss-gpu-raft/build-lib.sh | 26 + .../faiss/conda/faiss-gpu-raft/build-pkg.sh | 24 + .../faiss/conda/faiss-gpu-raft/meta.yaml | 104 + .../test_cpu_dispatch.sh} | 4 +- thirdparty/faiss/conda/faiss-gpu/build-lib.sh | 3 +- thirdparty/faiss/conda/faiss-gpu/build-pkg.sh | 3 +- thirdparty/faiss/conda/faiss-gpu/meta.yaml | 44 +- .../faiss/conda/faiss/build-lib-arm64.sh | 22 + thirdparty/faiss/conda/faiss/build-lib-osx.sh | 27 + thirdparty/faiss/conda/faiss/build-lib.sh | 2 +- 
.../faiss/conda/faiss/build-pkg-arm64.sh | 22 + thirdparty/faiss/conda/faiss/build-pkg-osx.sh | 26 + thirdparty/faiss/conda/faiss/build-pkg.sh | 2 +- thirdparty/faiss/conda/faiss/install-cmake.sh | 10 - thirdparty/faiss/conda/faiss/meta.yaml | 38 +- thirdparty/faiss/contrib/README.md | 17 +- thirdparty/faiss/contrib/big_batch_search.py | 508 ++++ thirdparty/faiss/contrib/client_server.py | 2 +- thirdparty/faiss/contrib/clustering.py | 399 +++ thirdparty/faiss/contrib/datasets.py | 66 + thirdparty/faiss/contrib/evaluation.py | 239 +- thirdparty/faiss/contrib/exhaustive_search.py | 130 +- thirdparty/faiss/contrib/inspect_tools.py | 27 + thirdparty/faiss/contrib/ivf_tools.py | 82 +- thirdparty/faiss/contrib/ondisk.py | 18 +- thirdparty/faiss/contrib/rpc.py | 23 +- thirdparty/faiss/contrib/torch_utils.py | 21 +- thirdparty/faiss/demos/CMakeLists.txt | 3 + thirdparty/faiss/demos/demo_imi_flat.cpp | 4 +- thirdparty/faiss/demos/demo_imi_pq.cpp | 8 +- .../faiss/demos/demo_ivfpq_indexing.cpp | 4 +- thirdparty/faiss/demos/demo_nndescent.cpp | 4 +- .../faiss/demos/demo_residual_quantizer.cpp | 297 +++ thirdparty/faiss/demos/demo_sift1M.cpp | 8 +- .../faiss/demos/demo_weighted_kmeans.cpp | 9 +- thirdparty/faiss/faiss/AutoTune.cpp | 46 +- thirdparty/faiss/faiss/AutoTune.h | 2 - thirdparty/faiss/faiss/Clustering.cpp | 32 +- thirdparty/faiss/faiss/Clustering.h | 54 +- thirdparty/faiss/faiss/FaissHook.h | 14 +- thirdparty/faiss/faiss/IVFlib.cpp | 109 +- thirdparty/faiss/faiss/IVFlib.h | 30 +- thirdparty/faiss/faiss/Index.cpp | 47 +- thirdparty/faiss/faiss/Index.h | 67 +- thirdparty/faiss/faiss/Index2Layer.cpp | 24 +- thirdparty/faiss/faiss/Index2Layer.h | 10 +- .../faiss/faiss/IndexAdditiveQuantizer.cpp | 385 ++- .../faiss/faiss/IndexAdditiveQuantizer.h | 76 +- .../faiss/IndexAdditiveQuantizerFastScan.cpp | 299 +++ .../faiss/IndexAdditiveQuantizerFastScan.h | 199 ++ thirdparty/faiss/faiss/IndexBinary.cpp | 31 +- thirdparty/faiss/faiss/IndexBinary.h | 46 +- thirdparty/faiss/faiss/IndexBinaryFlat.cpp | 53 +- thirdparty/faiss/faiss/IndexBinaryFlat.h | 10 +- .../faiss/faiss/IndexBinaryFromFloat.cpp | 7 +- thirdparty/faiss/faiss/IndexBinaryFromFloat.h | 2 +- thirdparty/faiss/faiss/IndexBinaryHNSW.cpp | 37 +- thirdparty/faiss/faiss/IndexBinaryHNSW.h | 2 +- thirdparty/faiss/faiss/IndexBinaryHash.cpp | 99 +- thirdparty/faiss/faiss/IndexBinaryHash.h | 8 +- thirdparty/faiss/faiss/IndexBinaryIVF.cpp | 772 +++--- thirdparty/faiss/faiss/IndexBinaryIVF.h | 116 +- .../faiss/faiss/IndexBinaryIVFThreadSafe.cpp | 819 ------- thirdparty/faiss/faiss/IndexFastScan.cpp | 629 +++++ thirdparty/faiss/faiss/IndexFastScan.h | 152 ++ thirdparty/faiss/faiss/IndexFlat.cpp | 311 ++- thirdparty/faiss/faiss/IndexFlat.h | 35 +- thirdparty/faiss/faiss/IndexFlatCodes.cpp | 52 +- thirdparty/faiss/faiss/IndexFlatCodes.h | 24 +- thirdparty/faiss/faiss/IndexFlatElkan.cpp | 79 + thirdparty/faiss/faiss/IndexFlatElkan.h | 45 + thirdparty/faiss/faiss/IndexHNSW.cpp | 253 +- thirdparty/faiss/faiss/IndexHNSW.h | 17 +- thirdparty/faiss/faiss/IndexIDMap.cpp | 281 +++ thirdparty/faiss/faiss/IndexIDMap.h | 129 + thirdparty/faiss/faiss/IndexIVF.cpp | 615 +++-- thirdparty/faiss/faiss/IndexIVF.h | 273 ++- .../faiss/faiss/IndexIVFAdditiveQuantizer.cpp | 128 +- .../faiss/faiss/IndexIVFAdditiveQuantizer.h | 69 +- .../IndexIVFAdditiveQuantizerFastScan.cpp | 574 +++++ .../faiss/IndexIVFAdditiveQuantizerFastScan.h | 173 ++ thirdparty/faiss/faiss/IndexIVFFastScan.cpp | 1618 ++++++++++++ thirdparty/faiss/faiss/IndexIVFFastScan.h | 263 ++ 
thirdparty/faiss/faiss/IndexIVFFlat.cpp | 328 ++- thirdparty/faiss/faiss/IndexIVFFlat.h | 15 +- .../faiss/IndexIVFIndependentQuantizer.cpp | 172 ++ .../faiss/IndexIVFIndependentQuantizer.h | 56 + thirdparty/faiss/faiss/IndexIVFPQ.cpp | 589 +++-- thirdparty/faiss/faiss/IndexIVFPQ.h | 14 +- thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp | 1216 +--------- thirdparty/faiss/faiss/IndexIVFPQFastScan.h | 156 +- thirdparty/faiss/faiss/IndexIVFPQR.cpp | 50 +- thirdparty/faiss/faiss/IndexIVFPQR.h | 9 +- .../faiss/faiss/IndexIVFSpectralHash.cpp | 68 +- thirdparty/faiss/faiss/IndexIVFSpectralHash.h | 15 +- thirdparty/faiss/faiss/IndexIVFThreadSafe.cpp | 162 -- thirdparty/faiss/faiss/IndexLSH.cpp | 17 +- thirdparty/faiss/faiss/IndexLSH.h | 2 +- thirdparty/faiss/faiss/IndexLattice.cpp | 10 +- thirdparty/faiss/faiss/IndexLattice.h | 3 +- thirdparty/faiss/faiss/IndexNNDescent.cpp | 9 +- thirdparty/faiss/faiss/IndexNNDescent.h | 3 +- thirdparty/faiss/faiss/IndexNSG.cpp | 62 +- thirdparty/faiss/faiss/IndexNSG.h | 43 +- thirdparty/faiss/faiss/IndexPQ.cpp | 247 +- thirdparty/faiss/faiss/IndexPQ.h | 25 +- thirdparty/faiss/faiss/IndexPQFastScan.cpp | 461 +--- thirdparty/faiss/faiss/IndexPQFastScan.h | 82 +- thirdparty/faiss/faiss/IndexPreTransform.cpp | 78 +- thirdparty/faiss/faiss/IndexPreTransform.h | 16 +- thirdparty/faiss/faiss/IndexRefine.cpp | 56 +- thirdparty/faiss/faiss/IndexRefine.h | 4 +- thirdparty/faiss/faiss/IndexReplicas.cpp | 61 +- thirdparty/faiss/faiss/IndexReplicas.h | 3 +- thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp | 445 ++++ thirdparty/faiss/faiss/IndexRowwiseMinMax.h | 99 + thirdparty/faiss/faiss/IndexScaNN.cpp | 58 +- thirdparty/faiss/faiss/IndexScaNN.h | 17 +- .../faiss/faiss/IndexScalarQuantizer.cpp | 87 +- thirdparty/faiss/faiss/IndexScalarQuantizer.h | 27 +- thirdparty/faiss/faiss/IndexShards.cpp | 187 +- thirdparty/faiss/faiss/IndexShards.h | 7 +- thirdparty/faiss/faiss/IndexShardsIVF.cpp | 246 ++ thirdparty/faiss/faiss/IndexShardsIVF.h | 42 + thirdparty/faiss/faiss/MatrixStats.cpp | 41 +- thirdparty/faiss/faiss/MatrixStats.h | 30 +- thirdparty/faiss/faiss/MetaIndexes.cpp | 272 +-- thirdparty/faiss/faiss/MetaIndexes.h | 112 +- thirdparty/faiss/faiss/MetricType.h | 17 +- thirdparty/faiss/faiss/VectorTransform.cpp | 96 +- thirdparty/faiss/faiss/VectorTransform.h | 44 +- thirdparty/faiss/faiss/clone_index.cpp | 275 ++- thirdparty/faiss/faiss/clone_index.h | 10 + .../faiss/faiss/cppcontrib/SaDecodeKernels.h | 322 +++ .../faiss/cppcontrib/detail/CoarseBitType.h | 31 + .../faiss/cppcontrib/detail/UintReader.h | 273 +++ .../cppcontrib/sa_decode/Level2-avx2-inl.h | 2072 ++++++++++++++++ .../faiss/cppcontrib/sa_decode/Level2-inl.h | 414 ++++ .../cppcontrib/sa_decode/Level2-neon-inl.h | 2161 +++++++++++++++++ .../faiss/cppcontrib/sa_decode/MinMax-inl.h | 467 ++++ .../cppcontrib/sa_decode/MinMaxFP16-inl.h | 472 ++++ .../faiss/cppcontrib/sa_decode/PQ-avx2-inl.h | 1625 +++++++++++++ .../faiss/faiss/cppcontrib/sa_decode/PQ-inl.h | 257 ++ .../faiss/cppcontrib/sa_decode/PQ-neon-inl.h | 1460 +++++++++++ .../faiss/faiss/impl/AdditiveQuantizer.cpp | 171 +- .../faiss/faiss/impl/AdditiveQuantizer.h | 77 +- .../faiss/faiss/impl/AuxIndexStructures.cpp | 78 +- .../faiss/faiss/impl/AuxIndexStructures.h | 87 +- thirdparty/faiss/faiss/impl/CodePacker.cpp | 67 + thirdparty/faiss/faiss/impl/CodePacker.h | 71 + .../faiss/faiss/impl/DistanceComputer.h | 85 + thirdparty/faiss/faiss/impl/FaissException.h | 18 + thirdparty/faiss/faiss/impl/HNSW.cpp | 463 +++- thirdparty/faiss/faiss/impl/HNSW.h | 52 +- 
thirdparty/faiss/faiss/impl/IDSelector.cpp | 125 + thirdparty/faiss/faiss/impl/IDSelector.h | 173 ++ .../faiss/faiss/impl/LocalSearchQuantizer.cpp | 151 +- .../faiss/faiss/impl/LocalSearchQuantizer.h | 30 +- .../faiss/faiss/impl/LookupTableScaler.h | 77 + thirdparty/faiss/faiss/impl/NNDescent.cpp | 19 +- thirdparty/faiss/faiss/impl/NNDescent.h | 20 +- thirdparty/faiss/faiss/impl/NSG.cpp | 9 +- thirdparty/faiss/faiss/impl/NSG.h | 13 +- .../faiss/faiss/impl/PolysemousTraining.cpp | 19 +- .../faiss/faiss/impl/PolysemousTraining.h | 21 +- .../faiss/impl/ProductAdditiveQuantizer.cpp | 376 +++ .../faiss/impl/ProductAdditiveQuantizer.h | 154 ++ .../faiss/faiss/impl/ProductQuantizer.cpp | 413 ++-- .../faiss/faiss/impl/ProductQuantizer.h | 51 +- thirdparty/faiss/faiss/impl/Quantizer.h | 46 + .../faiss/faiss/impl/ResidualQuantizer.cpp | 740 ++---- .../faiss/faiss/impl/ResidualQuantizer.h | 140 +- thirdparty/faiss/faiss/impl/ResultHandler.h | 142 +- .../faiss/faiss/impl/ScalarQuantizer.cpp | 118 +- thirdparty/faiss/faiss/impl/ScalarQuantizer.h | 273 +-- .../faiss/faiss/impl/ScalarQuantizerCodec.h | 141 +- .../faiss/impl/ScalarQuantizerCodec_avx.h | 110 +- .../faiss/impl/ScalarQuantizerCodec_avx512.h | 133 +- .../faiss/faiss/impl/ScalarQuantizerDC.cpp | 11 +- .../faiss/faiss/impl/ScalarQuantizerDC.h | 9 +- .../faiss/impl/ScalarQuantizerDC_avx.cpp | 13 +- .../faiss/faiss/impl/ScalarQuantizerDC_avx.h | 9 +- .../faiss/impl/ScalarQuantizerDC_avx512.cpp | 15 +- .../faiss/impl/ScalarQuantizerDC_avx512.h | 9 +- .../faiss/faiss/impl/ScalarQuantizerOp.cpp | 116 +- .../faiss/faiss/impl/ScalarQuantizerOp.h | 56 +- .../faiss/faiss/impl/ScalarQuantizerScanner.h | 278 +++ .../faiss/faiss/impl/ThreadedIndex-inl.h | 6 +- thirdparty/faiss/faiss/impl/ThreadedIndex.h | 8 +- .../impl/code_distance/code_distance-avx2.h | 529 ++++ .../code_distance/code_distance-generic.h | 81 + .../faiss/impl/code_distance/code_distance.h | 133 + .../impl/code_distance/code_distance_avx512.h | 102 + thirdparty/faiss/faiss/impl/index_read.cpp | 328 ++- thirdparty/faiss/faiss/impl/index_write.cpp | 275 ++- thirdparty/faiss/faiss/impl/kmeans1d.cpp | 11 +- thirdparty/faiss/faiss/impl/kmeans1d.h | 6 +- thirdparty/faiss/faiss/impl/lattice_Zn.cpp | 7 +- thirdparty/faiss/faiss/impl/platform_macros.h | 75 +- thirdparty/faiss/faiss/impl/pq4_fast_scan.cpp | 120 +- thirdparty/faiss/faiss/impl/pq4_fast_scan.h | 58 +- .../faiss/impl/pq4_fast_scan_search_1.cpp | 91 +- .../faiss/impl/pq4_fast_scan_search_qbs.cpp | 138 +- .../impl/residual_quantizer_encode_steps.cpp | 962 ++++++++ .../impl/residual_quantizer_encode_steps.h | 176 ++ .../faiss/faiss/impl/simd_result_handlers.h | 156 +- thirdparty/faiss/faiss/index_factory.cpp | 238 +- thirdparty/faiss/faiss/index_io.h | 5 + .../faiss/invlists/BlockInvertedLists.cpp | 41 +- .../faiss/faiss/invlists/BlockInvertedLists.h | 10 +- thirdparty/faiss/faiss/invlists/DirectMap.cpp | 2 +- thirdparty/faiss/faiss/invlists/DirectMap.h | 5 +- .../faiss/faiss/invlists/InvertedLists.cpp | 189 +- .../faiss/faiss/invlists/InvertedLists.h | 67 +- .../faiss/invlists/OnDiskInvertedLists.cpp | 13 +- .../faiss/invlists/OnDiskInvertedLists.h | 2 +- thirdparty/faiss/faiss/utils/AlignedTable.h | 4 +- thirdparty/faiss/faiss/utils/Heap.cpp | 144 +- thirdparty/faiss/faiss/utils/Heap.h | 82 +- .../faiss/utils/approx_topk/approx_topk.h | 84 + .../faiss/faiss/utils/approx_topk/avx2-inl.h | 196 ++ .../faiss/faiss/utils/approx_topk/generic.h | 138 ++ .../faiss/faiss/utils/approx_topk/mode.h | 34 + 
.../approx_topk_hamming/approx_topk_hamming.h | 367 +++ .../faiss/faiss/utils/binary_distances.cpp | 65 +- .../faiss/faiss/utils/binary_distances.h | 10 +- thirdparty/faiss/faiss/utils/bit_table.cpp | 1 + thirdparty/faiss/faiss/utils/distances.cpp | 348 ++- thirdparty/faiss/faiss/utils/distances.h | 119 +- .../faiss/utils/distances_fused/avx512.cpp | 346 +++ .../faiss/utils/distances_fused/avx512.h | 36 + .../utils/distances_fused/distances_fused.cpp | 42 + .../utils/distances_fused/distances_fused.h | 40 + .../utils/distances_fused/simdlib_based.cpp | 352 +++ .../utils/distances_fused/simdlib_based.h | 32 + thirdparty/faiss/faiss/utils/distances_if.h | 573 +++++ .../faiss/faiss/utils/extra_distances-inl.h | 22 +- .../faiss/faiss/utils/extra_distances.cpp | 72 +- .../faiss/faiss/utils/extra_distances.h | 12 +- thirdparty/faiss/faiss/utils/fp16-fp16c.h | 28 + thirdparty/faiss/faiss/utils/fp16-inl.h | 108 + thirdparty/faiss/faiss/utils/fp16.h | 18 + thirdparty/faiss/faiss/utils/hamming-inl.h | 293 +-- thirdparty/faiss/faiss/utils/hamming.cpp | 409 ++-- thirdparty/faiss/faiss/utils/hamming.h | 42 +- .../faiss/utils/hamming_distance/avx2-inl.h | 462 ++++ .../faiss/utils/hamming_distance/common.h | 49 + .../utils/hamming_distance/generic-inl.h | 432 ++++ .../faiss/utils/hamming_distance/hamdis-inl.h | 83 + .../faiss/utils/hamming_distance/neon-inl.h | 511 ++++ thirdparty/faiss/faiss/utils/jaccard-inl.h | 4 + .../faiss/faiss/utils/ordered_key_value.h | 10 + thirdparty/faiss/faiss/utils/partitioning.cpp | 48 +- thirdparty/faiss/faiss/utils/prefetch.h | 77 + thirdparty/faiss/faiss/utils/quantize_lut.cpp | 62 + thirdparty/faiss/faiss/utils/quantize_lut.h | 20 + thirdparty/faiss/faiss/utils/random.cpp | 53 + thirdparty/faiss/faiss/utils/random.h | 5 + thirdparty/faiss/faiss/utils/simdlib.h | 1 + thirdparty/faiss/faiss/utils/simdlib_avx2.h | 347 ++- .../faiss/faiss/utils/simdlib_emulated.h | 397 ++- thirdparty/faiss/faiss/utils/simdlib_neon.h | 787 +++++- thirdparty/faiss/faiss/utils/sorting.cpp | 832 +++++++ thirdparty/faiss/faiss/utils/sorting.h | 101 + .../utils/transpose/transpose-avx2-inl.h | 165 ++ thirdparty/faiss/faiss/utils/utils.cpp | 283 +-- thirdparty/faiss/faiss/utils/utils.h | 85 +- thirdparty/faiss/tests/CMakeLists.txt | 20 +- thirdparty/faiss/tests/common_faiss_tests.py | 2 +- thirdparty/faiss/tests/test_RCQ_cropping.cpp | 131 + thirdparty/faiss/tests/test_approx_topk.cpp | 225 ++ thirdparty/faiss/tests/test_binary_flat.cpp | 2 +- thirdparty/faiss/tests/test_build_blocks.py | 221 ++ thirdparty/faiss/tests/test_clone.py | 88 + thirdparty/faiss/tests/test_code_distance.cpp | 240 ++ thirdparty/faiss/tests/test_contrib.py | 280 ++- .../faiss/tests/test_contrib_with_scipy.py | 89 + .../faiss/tests/test_cppcontrib_sa_decode.cpp | 1306 ++++++++++ .../tests/test_cppcontrib_uintreader.cpp | 114 + .../faiss/tests/test_dealloc_invlists.cpp | 4 +- thirdparty/faiss/tests/test_distances_if.cpp | 141 ++ .../faiss/tests/test_distances_simd.cpp | 110 + .../faiss/tests/test_extra_distances.py | 12 + thirdparty/faiss/tests/test_factory.py | 57 +- thirdparty/faiss/tests/test_fast_scan.py | 321 ++- thirdparty/faiss/tests/test_fast_scan_ivf.py | 348 ++- thirdparty/faiss/tests/test_heap.cpp | 53 + thirdparty/faiss/tests/test_hnsw.cpp | 192 ++ thirdparty/faiss/tests/test_index.py | 187 +- thirdparty/faiss/tests/test_index_accuracy.py | 305 +-- thirdparty/faiss/tests/test_index_binary.py | 18 +- .../faiss/tests/test_index_composite.py | 244 +- thirdparty/faiss/tests/test_io.py | 95 +- 
thirdparty/faiss/tests/test_ivflib.py | 6 +- thirdparty/faiss/tests/test_ivfpq_codec.cpp | 5 +- .../faiss/tests/test_ivfpq_indexing.cpp | 4 +- ..._lsq.py => test_local_search_quantizer.py} | 210 +- thirdparty/faiss/tests/test_lowlevel_ivf.cpp | 4 +- thirdparty/faiss/tests/test_mem_leak.cpp | 4 +- thirdparty/faiss/tests/test_merge.cpp | 14 +- thirdparty/faiss/tests/test_merge_index.py | 264 ++ thirdparty/faiss/tests/test_meta_index.py | 190 +- thirdparty/faiss/tests/test_ondisk_ivf.cpp | 16 +- .../faiss/tests/test_pairs_decoding.cpp | 10 +- .../faiss/tests/test_params_override.cpp | 66 +- thirdparty/faiss/tests/test_partition.py | 10 +- thirdparty/faiss/tests/test_partitioning.cpp | 33 + thirdparty/faiss/tests/test_pq_encoding.cpp | 52 + .../faiss/tests/test_product_quantizer.py | 38 + thirdparty/faiss/tests/test_refine.py | 31 +- .../faiss/tests/test_residual_quantizer.py | 334 ++- thirdparty/faiss/tests/test_rowwise_minmax.py | 56 + thirdparty/faiss/tests/test_search_params.py | 468 ++++ thirdparty/faiss/tests/test_simdlib.cpp | 264 ++ thirdparty/faiss/tests/test_sliding_ivf.cpp | 4 +- .../faiss/tests/test_standalone_codec.py | 24 +- .../faiss/tests/test_threaded_index.cpp | 20 +- .../faiss/tests/test_transfer_invlists.cpp | 2 +- thirdparty/faiss/tests/torch_test_contrib.py | 5 +- thirdparty/faiss/tutorial/cpp/1-Flat.cpp | 2 +- thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp | 2 +- thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp | 2 +- thirdparty/faiss/tutorial/cpp/5-GPU.cpp | 234 -- thirdparty/faiss/tutorial/cpp/6-GPU.cpp | 255 -- thirdparty/faiss/tutorial/cpp/6-RUN.cpp | 247 -- thirdparty/faiss/tutorial/cpp/7-GPU.cpp | 347 --- thirdparty/faiss/tutorial/cpp/8-GPU.cpp | 479 ---- .../faiss/tutorial/cpp/9-BinaryFlat.cpp | 115 - thirdparty/faiss/tutorial/cpp/CMakeLists.txt | 21 - .../tutorial/cpp/tutorial_faiss_test.cpp | 378 --- 395 files changed, 47637 insertions(+), 12354 deletions(-) create mode 100644 include/knowhere/bitsetview_idselector.h create mode 100644 tests/faiss/CMakeLists.txt create mode 100644 tests/faiss_isolated/cmake/utils/platform_check.cmake create mode 100644 tests/faiss_isolated/cmake/utils/utils.cmake delete mode 100644 thirdparty/faiss/.circleci/Dockerfile.cpu delete mode 100644 thirdparty/faiss/.circleci/Dockerfile.faiss_gpu create mode 100644 thirdparty/faiss/benchs/CMakeLists.txt rename thirdparty/faiss/benchs/bench_all_ivf/{datasets.py => datasets_oss.py} (99%) create mode 100644 thirdparty/faiss/benchs/bench_big_batch_ivf.py create mode 100644 thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp create mode 100644 thirdparty/faiss/benchs/bench_hamming_knn.py create mode 100644 thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py create mode 100644 thirdparty/faiss/benchs/bench_ivf_fastscan.py create mode 100644 thirdparty/faiss/benchs/bench_ivf_fastscan_single_query.py create mode 100644 thirdparty/faiss/benchs/bench_ivf_selector.cpp create mode 100644 thirdparty/faiss/benchs/bench_pq_transposed_centroid_table.py delete mode 100755 thirdparty/faiss/benchs/distributed_ondisk/rpc.py delete mode 100644 thirdparty/faiss/conda/Dockerfile.cpu delete mode 100644 thirdparty/faiss/conda/Dockerfile.cuda10.2 delete mode 100644 thirdparty/faiss/conda/Dockerfile.cuda11.3 create mode 100644 thirdparty/faiss/conda/faiss-gpu-raft/build-lib.sh create mode 100644 thirdparty/faiss/conda/faiss-gpu-raft/build-pkg.sh create mode 100644 thirdparty/faiss/conda/faiss-gpu-raft/meta.yaml rename thirdparty/faiss/conda/{faiss-gpu/install-cmake.sh => faiss-gpu-raft/test_cpu_dispatch.sh} (54%) create 
mode 100755 thirdparty/faiss/conda/faiss/build-lib-arm64.sh create mode 100755 thirdparty/faiss/conda/faiss/build-lib-osx.sh create mode 100755 thirdparty/faiss/conda/faiss/build-pkg-arm64.sh create mode 100755 thirdparty/faiss/conda/faiss/build-pkg-osx.sh delete mode 100755 thirdparty/faiss/conda/faiss/install-cmake.sh create mode 100644 thirdparty/faiss/contrib/big_batch_search.py create mode 100644 thirdparty/faiss/contrib/clustering.py create mode 100644 thirdparty/faiss/demos/demo_residual_quantizer.cpp create mode 100644 thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp create mode 100644 thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h delete mode 100644 thirdparty/faiss/faiss/IndexBinaryIVFThreadSafe.cpp create mode 100644 thirdparty/faiss/faiss/IndexFastScan.cpp create mode 100644 thirdparty/faiss/faiss/IndexFastScan.h create mode 100644 thirdparty/faiss/faiss/IndexFlatElkan.cpp create mode 100644 thirdparty/faiss/faiss/IndexFlatElkan.h create mode 100644 thirdparty/faiss/faiss/IndexIDMap.cpp create mode 100644 thirdparty/faiss/faiss/IndexIDMap.h create mode 100644 thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp create mode 100644 thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h create mode 100644 thirdparty/faiss/faiss/IndexIVFFastScan.cpp create mode 100644 thirdparty/faiss/faiss/IndexIVFFastScan.h create mode 100644 thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.cpp create mode 100644 thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.h delete mode 100644 thirdparty/faiss/faiss/IndexIVFThreadSafe.cpp create mode 100644 thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp create mode 100644 thirdparty/faiss/faiss/IndexRowwiseMinMax.h create mode 100644 thirdparty/faiss/faiss/IndexShardsIVF.cpp create mode 100644 thirdparty/faiss/faiss/IndexShardsIVF.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/SaDecodeKernels.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/detail/CoarseBitType.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/detail/UintReader.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h create mode 100644 thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h create mode 100644 thirdparty/faiss/faiss/impl/CodePacker.cpp create mode 100644 thirdparty/faiss/faiss/impl/CodePacker.h create mode 100644 thirdparty/faiss/faiss/impl/DistanceComputer.h create mode 100644 thirdparty/faiss/faiss/impl/IDSelector.cpp create mode 100644 thirdparty/faiss/faiss/impl/IDSelector.h create mode 100644 thirdparty/faiss/faiss/impl/LookupTableScaler.h create mode 100644 thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.cpp create mode 100644 thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.h create mode 100644 thirdparty/faiss/faiss/impl/Quantizer.h create mode 100644 thirdparty/faiss/faiss/impl/ScalarQuantizerScanner.h create mode 100644 thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h create mode 100644 thirdparty/faiss/faiss/impl/code_distance/code_distance-generic.h create mode 100644 
thirdparty/faiss/faiss/impl/code_distance/code_distance.h create mode 100644 thirdparty/faiss/faiss/impl/code_distance/code_distance_avx512.h create mode 100644 thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp create mode 100644 thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.h create mode 100644 thirdparty/faiss/faiss/utils/approx_topk/approx_topk.h create mode 100644 thirdparty/faiss/faiss/utils/approx_topk/avx2-inl.h create mode 100644 thirdparty/faiss/faiss/utils/approx_topk/generic.h create mode 100644 thirdparty/faiss/faiss/utils/approx_topk/mode.h create mode 100644 thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h create mode 100644 thirdparty/faiss/faiss/utils/distances_fused/avx512.cpp create mode 100644 thirdparty/faiss/faiss/utils/distances_fused/avx512.h create mode 100644 thirdparty/faiss/faiss/utils/distances_fused/distances_fused.cpp create mode 100644 thirdparty/faiss/faiss/utils/distances_fused/distances_fused.h create mode 100644 thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.cpp create mode 100644 thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.h create mode 100644 thirdparty/faiss/faiss/utils/distances_if.h create mode 100644 thirdparty/faiss/faiss/utils/fp16-fp16c.h create mode 100644 thirdparty/faiss/faiss/utils/fp16-inl.h create mode 100644 thirdparty/faiss/faiss/utils/fp16.h create mode 100644 thirdparty/faiss/faiss/utils/hamming_distance/avx2-inl.h create mode 100644 thirdparty/faiss/faiss/utils/hamming_distance/common.h create mode 100644 thirdparty/faiss/faiss/utils/hamming_distance/generic-inl.h create mode 100644 thirdparty/faiss/faiss/utils/hamming_distance/hamdis-inl.h create mode 100644 thirdparty/faiss/faiss/utils/hamming_distance/neon-inl.h create mode 100644 thirdparty/faiss/faiss/utils/prefetch.h create mode 100644 thirdparty/faiss/faiss/utils/sorting.cpp create mode 100644 thirdparty/faiss/faiss/utils/sorting.h create mode 100644 thirdparty/faiss/faiss/utils/transpose/transpose-avx2-inl.h create mode 100644 thirdparty/faiss/tests/test_RCQ_cropping.cpp create mode 100644 thirdparty/faiss/tests/test_approx_topk.cpp create mode 100644 thirdparty/faiss/tests/test_clone.py create mode 100644 thirdparty/faiss/tests/test_code_distance.cpp create mode 100644 thirdparty/faiss/tests/test_contrib_with_scipy.py create mode 100644 thirdparty/faiss/tests/test_cppcontrib_sa_decode.cpp create mode 100644 thirdparty/faiss/tests/test_cppcontrib_uintreader.cpp create mode 100644 thirdparty/faiss/tests/test_distances_if.cpp create mode 100644 thirdparty/faiss/tests/test_distances_simd.cpp create mode 100644 thirdparty/faiss/tests/test_heap.cpp create mode 100644 thirdparty/faiss/tests/test_hnsw.cpp rename thirdparty/faiss/tests/{test_lsq.py => test_local_search_quantizer.py} (65%) create mode 100644 thirdparty/faiss/tests/test_merge_index.py create mode 100644 thirdparty/faiss/tests/test_partitioning.cpp create mode 100644 thirdparty/faiss/tests/test_rowwise_minmax.py create mode 100644 thirdparty/faiss/tests/test_search_params.py create mode 100644 thirdparty/faiss/tests/test_simdlib.cpp delete mode 100644 thirdparty/faiss/tutorial/cpp/5-GPU.cpp delete mode 100644 thirdparty/faiss/tutorial/cpp/6-GPU.cpp delete mode 100644 thirdparty/faiss/tutorial/cpp/6-RUN.cpp delete mode 100644 thirdparty/faiss/tutorial/cpp/7-GPU.cpp delete mode 100644 thirdparty/faiss/tutorial/cpp/8-GPU.cpp delete mode 100644 thirdparty/faiss/tutorial/cpp/9-BinaryFlat.cpp delete mode 100644 
thirdparty/faiss/tutorial/cpp/tutorial_faiss_test.cpp diff --git a/.github/workflows/ut.yaml b/.github/workflows/ut.yaml index db33aac01..f0d9fa170 100644 --- a/.github/workflows/ut.yaml +++ b/.github/workflows/ut.yaml @@ -23,7 +23,7 @@ jobs: ut: name: ut on ubuntu-20.04 runs-on: ubuntu-20.04 - timeout-minutes: 60 + timeout-minutes: 90 strategy: fail-fast: false steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 2464e4978..2c50d5ef4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,14 @@ knowhere_option(WITH_BENCHMARK "Build with benchmark" OFF) knowhere_option(WITH_COVERAGE "Build with coverage" OFF) knowhere_option(WITH_CCACHE "Build with ccache" ON) knowhere_option(WITH_PROFILER "Build with profiler" OFF) +knowhere_option(WITH_FAISS_TESTS "Build with Faiss unit tests" OFF) + +# this is needed for clang on ubuntu:20.04, otherwise +# the linked fails with 'undefined reference' error. +# fmt v9 was used by the time the error was encountered. +# clang on ubuntu:22.04 seems to be unaffected. +# gcc seems to be unaffected. +add_definitions(-DFMT_HEADER_ONLY) # this is needed for clang on ubuntu:20.04, otherwise # the linked fails with 'undefined reference' error. @@ -156,6 +164,10 @@ if(WITH_BENCHMARK) add_subdirectory(benchmark) endif() +if(WITH_FAISS_TESTS) + add_subdirectory(tests/faiss) +endif() + install(TARGETS knowhere DESTINATION ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/knowhere" diff --git a/cmake/libs/libfaiss.cmake b/cmake/libs/libfaiss.cmake index 6f44c0899..c40e84ca0 100644 --- a/cmake/libs/libfaiss.cmake +++ b/cmake/libs/libfaiss.cmake @@ -16,6 +16,10 @@ knowhere_file_glob(GLOB FAISS_AVX2_SRCS list(REMOVE_ITEM FAISS_SRCS ${FAISS_AVX512_SRCS}) +# disable RHNSW +knowhere_file_glob(GLOB FAISS_RHNSW_SRCS thirdparty/faiss/faiss/impl/RHNSW.cpp) +list(REMOVE_ITEM FAISS_SRCS ${FAISS_RHNSW_SRCS}) + if(__X86_64) set(UTILS_SRC src/simd/distances_ref.cc src/simd/hook.cc) set(UTILS_SSE_SRC src/simd/distances_sse.cc) diff --git a/conanfile.py b/conanfile.py index 920871819..2e096af89 100644 --- a/conanfile.py +++ b/conanfile.py @@ -32,6 +32,7 @@ class KnowhereConan(ConanFile): "with_ut": [True, False], "with_benchmark": [True, False], "with_coverage": [True, False], + "with_faiss_tests": [True, False], } default_options = { "shared": True, @@ -47,6 +48,7 @@ class KnowhereConan(ConanFile): "with_coverage": False, "boost:without_test": True, "fmt:header_only": True, + "with_faiss_tests": False, } exports_sources = ( @@ -96,6 +98,8 @@ def requirements(self): if self.options.with_benchmark: self.requires("gtest/1.13.0") self.requires("hdf5/1.14.0") + if self.options.with_faiss_tests: + self.requires("gtest/1.13.0") @property def _required_boost_components(self): @@ -156,6 +160,7 @@ def generate(self): tc.variables["WITH_UT"] = self.options.with_ut tc.variables["WITH_BENCHMARK"] = self.options.with_benchmark tc.variables["WITH_COVERAGE"] = self.options.with_coverage + tc.variables["WITH_FAISS_TESTS"] = self.options.with_faiss_tests tc.generate() deps = CMakeDeps(self) deps.generate() diff --git a/include/knowhere/bitsetview_idselector.h b/include/knowhere/bitsetview_idselector.h new file mode 100644 index 000000000..163e58cca --- /dev/null +++ b/include/knowhere/bitsetview_idselector.h @@ -0,0 +1,33 @@ +// Copyright (C) 2019-2023 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#pragma once + +#include + +#include "knowhere/bitsetview.h" + +namespace knowhere { + +struct BitsetViewIDSelector : faiss::IDSelector { + BitsetView bitset_view; + + inline BitsetViewIDSelector(BitsetView bitset_view) : bitset_view{bitset_view} { + } + + inline bool + is_member(faiss::idx_t id) const override final { + // it is by design that bitset_view.empty() is not tested here + return (!bitset_view.test(id)); + } +}; + +} // namespace knowhere diff --git a/include/knowhere/comp/index_param.h b/include/knowhere/comp/index_param.h index 986d07241..675ccc35c 100644 --- a/include/knowhere/comp/index_param.h +++ b/include/knowhere/comp/index_param.h @@ -75,6 +75,7 @@ namespace indexparam { // IVF Params constexpr const char* NPROBE = "nprobe"; constexpr const char* NLIST = "nlist"; +constexpr const char* USE_ELKAN = "use_elkan"; constexpr const char* NBITS = "nbits"; // PQ/SQ constexpr const char* M = "m"; // PQ param for IVFPQ constexpr const char* SSIZE = "ssize"; diff --git a/src/common/comp/brute_force.cc b/src/common/comp/brute_force.cc index ebd452020..042581c8f 100644 --- a/src/common/comp/brute_force.cc +++ b/src/common/comp/brute_force.cc @@ -18,6 +18,7 @@ #include "faiss/MetricType.h" #include "faiss/utils/binary_distances.h" #include "faiss/utils/distances.h" +#include "knowhere/bitsetview_idselector.h" #include "knowhere/comp/thread_pool.h" #include "knowhere/config.h" #include "knowhere/expected.h" @@ -67,11 +68,15 @@ BruteForce::Search(const DataSetPtr base_dataset, const DataSetPtr query_dataset ThreadPool::ScopedOmpSetter setter(1); auto cur_labels = labels + topk * index; auto cur_distances = distances + topk * index; + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? 
nullptr : &bw_idselector; + switch (faiss_metric_type) { case faiss::METRIC_L2: { auto cur_query = (const float*)xq + dim * index; faiss::float_maxheap_array_t buf{(size_t)1, (size_t)topk, cur_labels, cur_distances}; - faiss::knn_L2sqr(cur_query, (const float*)xb, dim, 1, nb, &buf, nullptr, bitset); + faiss::knn_L2sqr(cur_query, (const float*)xb, dim, 1, nb, &buf, nullptr, id_selector); break; } case faiss::METRIC_INNER_PRODUCT: { @@ -79,16 +84,16 @@ BruteForce::Search(const DataSetPtr base_dataset, const DataSetPtr query_dataset faiss::float_minheap_array_t buf{(size_t)1, (size_t)topk, cur_labels, cur_distances}; if (is_cosine) { auto copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); - faiss::knn_cosine(copied_query.get(), (const float*)xb, nullptr, dim, 1, nb, &buf, bitset); + faiss::knn_cosine(copied_query.get(), (const float*)xb, nullptr, dim, 1, nb, &buf, id_selector); } else { - faiss::knn_inner_product(cur_query, (const float*)xb, dim, 1, nb, &buf, bitset); + faiss::knn_inner_product(cur_query, (const float*)xb, dim, 1, nb, &buf, id_selector); } break; } case faiss::METRIC_Jaccard: { auto cur_query = (const uint8_t*)xq + (dim / 8) * index; faiss::float_maxheap_array_t res = {size_t(1), size_t(topk), cur_labels, cur_distances}; - binary_knn_hc(faiss::METRIC_Jaccard, &res, cur_query, (const uint8_t*)xb, nb, dim / 8, bitset); + binary_knn_hc(faiss::METRIC_Jaccard, &res, cur_query, (const uint8_t*)xb, nb, dim / 8, id_selector); break; } case faiss::METRIC_Hamming: { @@ -96,7 +101,7 @@ BruteForce::Search(const DataSetPtr base_dataset, const DataSetPtr query_dataset std::vector int_distances(topk); faiss::int_maxheap_array_t res = {size_t(1), size_t(topk), cur_labels, int_distances.data()}; binary_knn_hc(faiss::METRIC_Hamming, &res, (const uint8_t*)cur_query, (const uint8_t*)xb, nb, - dim / 8, bitset); + dim / 8, id_selector); for (int i = 0; i < topk; ++i) { cur_distances[i] = int_distances[i]; } @@ -107,7 +112,7 @@ BruteForce::Search(const DataSetPtr base_dataset, const DataSetPtr query_dataset // only matched ids will be chosen, not to use heap auto cur_query = (const uint8_t*)xq + (dim / 8) * index; binary_knn_mc(faiss_metric_type, cur_query, (const uint8_t*)xb, 1, nb, topk, dim / 8, cur_distances, - cur_labels, bitset); + cur_labels, id_selector); break; } default: { @@ -161,11 +166,15 @@ BruteForce::SearchWithBuf(const DataSetPtr base_dataset, const DataSetPtr query_ ThreadPool::ScopedOmpSetter setter(1); auto cur_labels = labels + topk * index; auto cur_distances = distances + topk * index; + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? 
nullptr : &bw_idselector; + switch (faiss_metric_type) { case faiss::METRIC_L2: { auto cur_query = (const float*)xq + dim * index; faiss::float_maxheap_array_t buf{(size_t)1, (size_t)topk, cur_labels, cur_distances}; - faiss::knn_L2sqr(cur_query, (const float*)xb, dim, 1, nb, &buf, nullptr, bitset); + faiss::knn_L2sqr(cur_query, (const float*)xb, dim, 1, nb, &buf, nullptr, id_selector); break; } case faiss::METRIC_INNER_PRODUCT: { @@ -173,16 +182,16 @@ BruteForce::SearchWithBuf(const DataSetPtr base_dataset, const DataSetPtr query_ faiss::float_minheap_array_t buf{(size_t)1, (size_t)topk, cur_labels, cur_distances}; if (is_cosine) { auto copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); - faiss::knn_cosine(copied_query.get(), (const float*)xb, nullptr, dim, 1, nb, &buf, bitset); + faiss::knn_cosine(copied_query.get(), (const float*)xb, nullptr, dim, 1, nb, &buf, id_selector); } else { - faiss::knn_inner_product(cur_query, (const float*)xb, dim, 1, nb, &buf, bitset); + faiss::knn_inner_product(cur_query, (const float*)xb, dim, 1, nb, &buf, id_selector); } break; } case faiss::METRIC_Jaccard: { auto cur_query = (const uint8_t*)xq + (dim / 8) * index; faiss::float_maxheap_array_t res = {size_t(1), size_t(topk), cur_labels, cur_distances}; - binary_knn_hc(faiss::METRIC_Jaccard, &res, cur_query, (const uint8_t*)xb, nb, dim / 8, bitset); + binary_knn_hc(faiss::METRIC_Jaccard, &res, cur_query, (const uint8_t*)xb, nb, dim / 8, id_selector); break; } case faiss::METRIC_Hamming: { @@ -190,7 +199,7 @@ BruteForce::SearchWithBuf(const DataSetPtr base_dataset, const DataSetPtr query_ std::vector int_distances(topk); faiss::int_maxheap_array_t res = {size_t(1), size_t(topk), cur_labels, int_distances.data()}; binary_knn_hc(faiss::METRIC_Hamming, &res, (const uint8_t*)cur_query, (const uint8_t*)xb, nb, - dim / 8, bitset); + dim / 8, id_selector); for (int i = 0; i < topk; ++i) { cur_distances[i] = int_distances[i]; } @@ -201,7 +210,7 @@ BruteForce::SearchWithBuf(const DataSetPtr base_dataset, const DataSetPtr query_ // only matched ids will be chosen, not to use heap auto cur_query = (const uint8_t*)xq + (dim / 8) * index; binary_knn_mc(faiss_metric_type, cur_query, (const uint8_t*)xb, 1, nb, topk, dim / 8, cur_distances, - cur_labels, bitset); + cur_labels, id_selector); break; } default: { @@ -263,10 +272,14 @@ BruteForce::RangeSearch(const DataSetPtr base_dataset, const DataSetPtr query_da futs.emplace_back(pool->push([&, index = i] { ThreadPool::ScopedOmpSetter setter(1); faiss::RangeSearchResult res(1); + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? 
nullptr : &bw_idselector; + switch (faiss_metric_type) { case faiss::METRIC_L2: { auto cur_query = (const float*)xq + dim * index; - faiss::range_search_L2sqr(cur_query, (const float*)xb, dim, 1, nb, radius, &res, bitset); + faiss::range_search_L2sqr(cur_query, (const float*)xb, dim, 1, nb, radius, &res, id_selector); break; } case faiss::METRIC_INNER_PRODUCT: { @@ -275,24 +288,25 @@ BruteForce::RangeSearch(const DataSetPtr base_dataset, const DataSetPtr query_da if (is_cosine) { auto copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); faiss::range_search_cosine(copied_query.get(), (const float*)xb, nullptr, dim, 1, nb, radius, - &res, bitset); + &res, id_selector); } else { faiss::range_search_inner_product(cur_query, (const float*)xb, dim, 1, nb, radius, &res, - bitset); + id_selector); } break; } case faiss::METRIC_Jaccard: { auto cur_query = (const uint8_t*)xq + (dim / 8) * index; - faiss::binary_range_search, float>( - faiss::METRIC_Jaccard, cur_query, (const uint8_t*)xb, 1, nb, radius, dim / 8, &res, bitset); + faiss::binary_range_search, float>(faiss::METRIC_Jaccard, cur_query, + (const uint8_t*)xb, 1, nb, radius, + dim / 8, &res, id_selector); break; } case faiss::METRIC_Hamming: { auto cur_query = (const uint8_t*)xq + (dim / 8) * index; faiss::binary_range_search, int>(faiss::METRIC_Hamming, cur_query, (const uint8_t*)xb, 1, nb, (int)radius, - dim / 8, &res, bitset); + dim / 8, &res, id_selector); break; } default: { diff --git a/src/common/config.cc b/src/common/config.cc index f54c705a3..2e238e911 100644 --- a/src/common/config.cc +++ b/src/common/config.cc @@ -18,6 +18,7 @@ static const std::unordered_set ext_legal_json_keys = {"metric_type "dim", "nlist", // IVF param "nprobe", // IVF param + "use_elkan", // IVF param "ssize", // IVF_FLAT_CC param "nbits", // IVF_PQ param "m", // IVF_PQ param diff --git a/src/index/flat/flat.cc b/src/index/flat/flat.cc index eae477af4..e7ef2af77 100644 --- a/src/index/flat/flat.cc +++ b/src/index/flat/flat.cc @@ -16,6 +16,7 @@ #include "faiss/index_io.h" #include "index/flat/flat_config.h" #include "io/memory_io.h" +#include "knowhere/bitsetview_idselector.h" #include "knowhere/comp/thread_pool.h" #include "knowhere/factory.h" #include "knowhere/log.h" @@ -93,6 +94,10 @@ class FlatIndexNode : public IndexNode { ThreadPool::ScopedOmpSetter setter(1); auto cur_ids = ids + k * index; auto cur_dis = distances + k * index; + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? 
nullptr : &bw_idselector; + if constexpr (std::is_same::value) { auto cur_query = (const float*)x + dim * index; std::unique_ptr copied_query = nullptr; @@ -100,11 +105,20 @@ class FlatIndexNode : public IndexNode { copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->search(1, cur_query, k, cur_dis, cur_ids, bitset); + + faiss::SearchParameters search_params; + search_params.sel = id_selector; + + index_->search(1, cur_query, k, cur_dis, cur_ids, &search_params); } if constexpr (std::is_same::value) { auto cur_i_dis = reinterpret_cast(cur_dis); - index_->search(1, (const uint8_t*)x + index * dim / 8, k, cur_i_dis, cur_ids, bitset); + + faiss::SearchParameters search_params; + search_params.sel = id_selector; + + index_->search(1, (const uint8_t*)x + index * dim / 8, k, cur_i_dis, cur_ids, &search_params); + if (index_->metric_type == faiss::METRIC_Hamming) { for (int64_t j = 0; j < k; j++) { cur_dis[j] = static_cast(cur_i_dis[j]); @@ -166,6 +180,10 @@ class FlatIndexNode : public IndexNode { futs.emplace_back(search_pool_->push([&, index = i] { ThreadPool::ScopedOmpSetter setter(1); faiss::RangeSearchResult res(1); + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? nullptr : &bw_idselector; + if constexpr (std::is_same::value) { auto cur_query = (const float*)xq + dim * index; std::unique_ptr copied_query = nullptr; @@ -173,10 +191,17 @@ class FlatIndexNode : public IndexNode { copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->range_search(1, cur_query, radius, &res, bitset); + + faiss::SearchParameters search_params; + search_params.sel = id_selector; + + index_->range_search(1, cur_query, radius, &res, &search_params); } if constexpr (std::is_same::value) { - index_->range_search(1, (const uint8_t*)xq + index * dim / 8, radius, &res, bitset); + faiss::SearchParameters search_params; + search_params.sel = id_selector; + + index_->range_search(1, (const uint8_t*)xq + index * dim / 8, radius, &res, &search_params); } auto elem_cnt = res.lims[1]; result_dist_array[index].resize(elem_cnt); diff --git a/src/index/ivf/ivf.cc b/src/index/ivf/ivf.cc index 27ed0cf28..53c8276de 100644 --- a/src/index/ivf/ivf.cc +++ b/src/index/ivf/ivf.cc @@ -14,6 +14,7 @@ #include "faiss/IndexBinaryFlat.h" #include "faiss/IndexBinaryIVF.h" #include "faiss/IndexFlat.h" +#include "faiss/IndexFlatElkan.h" #include "faiss/IndexIVFFlat.h" #include "faiss/IndexIVFPQ.h" #include "faiss/IndexIVFPQFastScan.h" @@ -22,6 +23,7 @@ #include "faiss/index_io.h" #include "index/ivf/ivf_config.h" #include "io/memory_io.h" +#include "knowhere/bitsetview_idselector.h" #include "knowhere/comp/thread_pool.h" #include "knowhere/dataset.h" #include "knowhere/expected.h" @@ -32,16 +34,6 @@ namespace knowhere { -template -struct QuantizerT { - typedef faiss::IndexFlat type; -}; - -template <> -struct QuantizerT { - using type = faiss::IndexBinaryFlat; -}; - template class IvfIndexNode : public IndexNode { public: @@ -240,6 +232,17 @@ MatchNbits(int64_t size, int64_t nbits) { return nbits; } +namespace { + +// turn IndexFlatElkan into IndexFlat +std::unique_ptr +to_index_flat(std::unique_ptr&& index) { + // C++ slicing here + return std::make_unique(std::move(*index)); +} + +} // namespace + template Status IvfIndexNode::Train(const DataSet& dataset, const Config& cfg) { @@ -279,24 +282,50 @@ IvfIndexNode::Train(const DataSet& dataset, const Config& cfg) { } } - typename QuantizerT::type* qzr = 
nullptr; - faiss::IndexIVFPQFastScan* base_index = nullptr; std::unique_ptr index; + // if cfg.use_elkan is used, then we'll use a temporary instance of + // IndexFlatElkan for the training. try { if constexpr (std::is_same::value) { const IvfFlatConfig& ivf_flat_cfg = static_cast(cfg); auto nlist = MatchNlist(rows, ivf_flat_cfg.nlist.value()); - qzr = new (std::nothrow) typename QuantizerT::type(dim, metric.value()); - index = std::make_unique(qzr, dim, nlist, metric.value(), is_cosine); + + const bool use_elkan = ivf_flat_cfg.use_elkan.value_or(true); + + // create quantizer for the training + std::unique_ptr qzr = + std::make_unique(dim, metric.value(), false, use_elkan); + // create index. Index does not own qzr + index = std::make_unique(qzr.get(), dim, nlist, metric.value(), is_cosine); + // train index->train(rows, (const float*)data); + // replace quantizer with a regular IndexFlat + qzr = to_index_flat(std::move(qzr)); + index->quantizer = qzr.get(); + // transfer ownership of qzr to index + qzr.release(); + index->own_fields = true; } if constexpr (std::is_same::value) { const IvfFlatCcConfig& ivf_flat_cc_cfg = static_cast(cfg); auto nlist = MatchNlist(rows, ivf_flat_cc_cfg.nlist.value()); - qzr = new (std::nothrow) typename QuantizerT::type(dim, metric.value()); - index = std::make_unique(qzr, dim, nlist, ivf_flat_cc_cfg.ssize.value(), + + const bool use_elkan = ivf_flat_cc_cfg.use_elkan.value_or(true); + + // create quantizer for the training + std::unique_ptr qzr = + std::make_unique(dim, metric.value(), false, use_elkan); + // create index. Index does not own qzr + index = std::make_unique(qzr.get(), dim, nlist, ivf_flat_cc_cfg.ssize.value(), metric.value(), is_cosine); + // train index->train(rows, (const float*)data); + // replace quantizer with a regular IndexFlat + qzr = to_index_flat(std::move(qzr)); + index->quantizer = qzr.get(); + // transfer ownership of qzr to index + qzr.release(); + index->own_fields = true; // ivfflat_cc has no serialize stage, make map at build stage index->make_direct_map(true, faiss::DirectMap::ConcurrentArray); } @@ -304,48 +333,93 @@ IvfIndexNode::Train(const DataSet& dataset, const Config& cfg) { const IvfPqConfig& ivf_pq_cfg = static_cast(cfg); auto nlist = MatchNlist(rows, ivf_pq_cfg.nlist.value()); auto nbits = MatchNbits(rows, ivf_pq_cfg.nbits.value()); - qzr = new (std::nothrow) typename QuantizerT::type(dim, metric.value()); - index = std::make_unique(qzr, dim, nlist, ivf_pq_cfg.m.value(), nbits, metric.value()); + + const bool use_elkan = ivf_pq_cfg.use_elkan.value_or(true); + + // create quantizer for the training + std::unique_ptr qzr = + std::make_unique(dim, metric.value(), false, use_elkan); + // create index. 
Index does not own qzr + index = + std::make_unique(qzr.get(), dim, nlist, ivf_pq_cfg.m.value(), nbits, metric.value()); + // train index->train(rows, (const float*)data); + // replace quantizer with a regular IndexFlat + qzr = to_index_flat(std::move(qzr)); + index->quantizer = qzr.get(); + // transfer ownership of qzr to index + qzr.release(); + index->own_fields = true; } if constexpr (std::is_same::value) { const ScannConfig& scann_cfg = static_cast(cfg); auto nlist = MatchNlist(rows, scann_cfg.nlist.value()); bool is_cosine = base_cfg.metric_type.value() == metric::COSINE; - qzr = new (std::nothrow) typename QuantizerT::type(dim, metric.value()); - base_index = new (std::nothrow) - faiss::IndexIVFPQFastScan(qzr, dim, nlist, (dim + 1) / 2, 4, is_cosine, metric.value()); - base_index->own_fields = true; + + const bool use_elkan = scann_cfg.use_elkan.value_or(true); + + // create quantizer for the training + std::unique_ptr qzr = + std::make_unique(dim, metric.value(), false, use_elkan); + // create base index. it does not own qzr + auto base_index = std::make_unique(qzr.get(), dim, nlist, (dim + 1) / 2, 4, + is_cosine, metric.value()); + // create scann index, which does not base_index by default, + // but owns the refine index by default omg if (scann_cfg.with_raw_data.value()) { - index = std::make_unique(base_index, (const float*)data); + index = std::make_unique(base_index.get(), (const float*)data); } else { - index = std::make_unique(base_index, nullptr); + index = std::make_unique(base_index.get(), nullptr); } + // train index->train(rows, (const float*)data); + // at this moment, we still own qzr. + // replace quantizer with a regular IndexFlat + qzr = to_index_flat(std::move(qzr)); + base_index->quantizer = qzr.get(); + // release qzr + qzr.release(); + base_index->own_fields = true; + // transfer ownership of the base index + base_index.release(); + index->own_fields = true; } if constexpr (std::is_same::value) { const IvfSqConfig& ivf_sq_cfg = static_cast(cfg); auto nlist = MatchNlist(rows, ivf_sq_cfg.nlist.value()); - qzr = new (std::nothrow) typename QuantizerT::type(dim, metric.value()); - index = std::make_unique(qzr, dim, nlist, faiss::QuantizerType::QT_8bit, - metric.value()); + + const bool use_elkan = ivf_sq_cfg.use_elkan.value_or(true); + + // create quantizer for the training + std::unique_ptr qzr = + std::make_unique(dim, metric.value(), false, use_elkan); + // create index. Index does not own qzr + index = std::make_unique( + qzr.get(), dim, nlist, faiss::ScalarQuantizer::QuantizerType::QT_8bit, metric.value()); + // train index->train(rows, (const float*)data); + // replace quantizer with a regular IndexFlat + qzr = to_index_flat(std::move(qzr)); + index->quantizer = qzr.get(); + // transfer ownership of qzr to index + qzr.release(); + index->own_fields = true; } if constexpr (std::is_same::value) { const IvfBinConfig& ivf_bin_cfg = static_cast(cfg); auto nlist = MatchNlist(rows, ivf_bin_cfg.nlist.value()); - qzr = new (std::nothrow) typename QuantizerT::type(dim, metric.value()); - index = std::make_unique(qzr, dim, nlist, metric.value()); + + // create quantizer + auto qzr = std::make_unique(dim, metric.value()); + // create index. 
Index does not own qzr + index = std::make_unique(qzr.get(), dim, nlist, metric.value()); + // train index->train(rows, (const uint8_t*)data); + // transfer ownership of qzr to index + qzr.release(); + index->own_fields = true; } - index->own_fields = true; } catch (std::exception& e) { - if (qzr) { - delete qzr; - } - if (base_index) { - delete base_index; - } LOG_KNOWHERE_WARNING_ << "faiss inner error: " << e.what(); return Status::faiss_inner_error; } @@ -416,9 +490,18 @@ IvfIndexNode::Search(const DataSet& dataset, const Config& cfg, const BitsetV ThreadPool::ScopedOmpSetter setter(1); auto offset = k * index; std::unique_ptr copied_query = nullptr; + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? nullptr : &bw_idselector; + if constexpr (std::is_same::value) { auto cur_data = (const uint8_t*)data + index * dim / 8; - index_->search_thread_safe(1, cur_data, k, i_distances + offset, ids + offset, nprobe, bitset); + + faiss::IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = nprobe; + ivf_search_params.sel = id_selector; + index_->search(1, cur_data, k, i_distances + offset, ids + offset, &ivf_search_params); + if (index_->metric_type == faiss::METRIC_Hamming) { for (int64_t i = 0; i < k; i++) { distances[i + offset] = static_cast(i_distances[i + offset]); @@ -430,7 +513,13 @@ IvfIndexNode::Search(const DataSet& dataset, const Config& cfg, const BitsetV copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->search_thread_safe(1, cur_query, k, distances + offset, ids + offset, nprobe, 0, bitset); + + faiss::IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = nprobe; + ivf_search_params.max_codes = 0; + ivf_search_params.sel = id_selector; + + index_->search(1, cur_query, k, distances + offset, ids + offset, &ivf_search_params); } else if constexpr (std::is_same::value) { auto cur_query = (const float*)data + index * dim; const ScannConfig& scann_cfg = static_cast(cfg); @@ -438,15 +527,30 @@ IvfIndexNode::Search(const DataSet& dataset, const Config& cfg, const BitsetV copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->search_thread_safe(1, cur_query, k, distances + offset, ids + offset, nprobe, - scann_cfg.reorder_k.value(), bitset); + + // todo aguzhva: this is somewhat alogical. Refactor? 
+ faiss::IVFSearchParameters base_search_params; + base_search_params.sel = id_selector; + base_search_params.nprobe = nprobe; + + faiss::IndexScaNNSearchParameters scann_search_params; + scann_search_params.base_index_params = &base_search_params; + scann_search_params.reorder_k = scann_cfg.reorder_k.value(); + + index_->search(1, cur_query, k, distances + offset, ids + offset, &scann_search_params); } else { auto cur_query = (const float*)data + index * dim; if (is_cosine) { copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->search_thread_safe(1, cur_query, k, distances + offset, ids + offset, nprobe, 0, bitset); + + faiss::IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = nprobe; + ivf_search_params.max_codes = 0; + ivf_search_params.sel = id_selector; + + index_->search(1, cur_query, k, distances + offset, ids + offset, &ivf_search_params); } })); } @@ -510,30 +614,59 @@ IvfIndexNode::RangeSearch(const DataSet& dataset, const Config& cfg, const Bi ThreadPool::ScopedOmpSetter setter(1); faiss::RangeSearchResult res(1); std::unique_ptr copied_query = nullptr; + + BitsetViewIDSelector bw_idselector(bitset); + faiss::IDSelector* id_selector = (bitset.empty()) ? nullptr : &bw_idselector; + if constexpr (std::is_same::value) { auto cur_data = (const uint8_t*)xq + index * dim / 8; - index_->range_search_thread_safe(1, cur_data, radius, &res, index_->nlist, bitset); + + faiss::IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = index_->nlist; + ivf_search_params.sel = id_selector; + + index_->range_search(1, cur_data, radius, &res, &ivf_search_params); } else if constexpr (std::is_same::value) { auto cur_query = (const float*)xq + index * dim; if (is_cosine) { copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->range_search_thread_safe(1, cur_query, radius, &res, index_->nlist, 0, bitset); + + faiss::IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = index_->nlist; + ivf_search_params.max_codes = 0; + ivf_search_params.sel = id_selector; + + index_->range_search(1, cur_query, radius, &res, &ivf_search_params); } else if constexpr (std::is_same::value) { auto cur_query = (const float*)xq + index * dim; if (is_cosine) { copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->range_search_thread_safe(1, cur_query, radius, &res, bitset); + + // todo aguzhva: this is somewhat alogical. Refactor? 
+ faiss::IVFSearchParameters base_search_params; + base_search_params.sel = id_selector; + + faiss::IndexScaNNSearchParameters scann_search_params; + scann_search_params.base_index_params = &base_search_params; + + index_->range_search(1, cur_query, radius, &res, &scann_search_params); } else { auto cur_query = (const float*)xq + index * dim; if (is_cosine) { copied_query = CopyAndNormalizeVecs(cur_query, 1, dim); cur_query = copied_query.get(); } - index_->range_search_thread_safe(1, cur_query, radius, &res, index_->nlist, 0, bitset); + + faiss::IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = index_->nlist; + ivf_search_params.max_codes = 0; + ivf_search_params.sel = id_selector; + + index_->range_search(1, cur_query, radius, &res, &ivf_search_params); } auto elem_cnt = res.lims[1]; result_dist_array[index].resize(elem_cnt); diff --git a/src/index/ivf/ivf_config.h b/src/index/ivf/ivf_config.h index ee900069e..fabd1d6a2 100644 --- a/src/index/ivf/ivf_config.h +++ b/src/index/ivf/ivf_config.h @@ -20,6 +20,7 @@ class IvfConfig : public BaseConfig { public: CFG_INT nlist; CFG_INT nprobe; + CFG_BOOL use_elkan; KNOHWERE_DECLARE_CONFIG(IvfConfig) { KNOWHERE_CONFIG_DECLARE_FIELD(nlist) .set_default(128) @@ -31,6 +32,10 @@ class IvfConfig : public BaseConfig { .description("number of probes at query time.") .for_search() .set_range(1, 65536); + KNOWHERE_CONFIG_DECLARE_FIELD(use_elkan) + .set_default(true) + .description("whether to use elkan algorithm") + .for_train(); } }; diff --git a/src/simd/distances_avx.cc b/src/simd/distances_avx.cc index b92068143..1e3ec0f04 100644 --- a/src/simd/distances_avx.cc +++ b/src/simd/distances_avx.cc @@ -17,6 +17,8 @@ #include +#include "faiss/impl/platform_macros.h" + namespace faiss { #define ALIGNED(x) __attribute__((aligned(x))) @@ -38,80 +40,34 @@ masked_read(int d, const float* x) { // cannot use AVX2 _mm_mask_set1_epi32 } +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_inner_product_avx(const float* x, const float* y, size_t d) { - __m256 msum1 = _mm256_setzero_ps(); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps(x); - x += 8; - __m256 my = _mm256_loadu_ps(y); - y += 8; - msum1 = _mm256_add_ps(msum1, _mm256_mul_ps(mx, my)); - d -= 8; + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + res += x[i] * y[i]; } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 = _mm_add_ps(msum2, _mm256_extractf128_ps(msum1, 0)); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - msum2 = _mm_add_ps(msum2, _mm_mul_ps(mx, my)); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - msum2 = _mm_add_ps(msum2, _mm_mul_ps(mx, my)); - } - - msum2 = _mm_hadd_ps(msum2, msum2); - msum2 = _mm_hadd_ps(msum2, msum2); - return _mm_cvtss_f32(msum2); + return res; } +FAISS_PRAGMA_IMPRECISE_FUNCTION_END +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_L2sqr_avx(const float* x, const float* y, size_t d) { - __m256 msum1 = _mm256_setzero_ps(); - - while (d >= 8) { - __m256 mx = _mm256_loadu_ps(x); - x += 8; - __m256 my = _mm256_loadu_ps(y); - y += 8; - const __m256 a_m_b1 = _mm256_sub_ps(mx, my); - msum1 = _mm256_add_ps(msum1, _mm256_mul_ps(a_m_b1, a_m_b1)); - d -= 8; + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += tmp * tmp; } - - __m128 msum2 = 
_mm256_extractf128_ps(msum1, 1); - msum2 = _mm_add_ps(msum2, _mm256_extractf128_ps(msum1, 0)); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - const __m128 a_m_b1 = _mm_sub_ps(mx, my); - msum2 = _mm_add_ps(msum2, _mm_mul_ps(a_m_b1, a_m_b1)); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - __m128 a_m_b1 = _mm_sub_ps(mx, my); - msum2 = _mm_add_ps(msum2, _mm_mul_ps(a_m_b1, a_m_b1)); - } - - msum2 = _mm_hadd_ps(msum2, msum2); - msum2 = _mm_hadd_ps(msum2, msum2); - return _mm_cvtss_f32(msum2); + return res; } +FAISS_PRAGMA_IMPRECISE_FUNCTION_END float fvec_L1_avx(const float* x, const float* y, size_t d) { @@ -195,5 +151,69 @@ fvec_Linf_avx(const float* x, const float* y, size_t d) { return _mm_cvtss_f32(msum2); } +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +void +fvec_madd_avx(size_t n, const float* a, float bf, const float* b, float* c) { + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < n; i++) { + c[i] = a[i] + bf * b[i]; + } +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +void +fvec_inner_product_batch_4_avx(const float* __restrict x, const float* __restrict y0, const float* __restrict y1, + const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0, + float& dis1, float& dis2, float& dis3) { + float d0 = 0; + float d1 = 0; + float d2 = 0; + float d3 = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < d; ++i) { + d0 += x[i] * y0[i]; + d1 += x[i] * y1[i]; + d2 += x[i] * y2[i]; + d3 += x[i] * y3[i]; + } + + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +void +fvec_L2sqr_batch_4_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) { + float d0 = 0; + float d1 = 0; + float d2 = 0; + float d3 = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < d; ++i) { + const float q0 = x[i] - y0[i]; + const float q1 = x[i] - y1[i]; + const float q2 = x[i] - y2[i]; + const float q3 = x[i] - y3[i]; + d0 += q0 * q0; + d1 += q1 * q1; + d2 += q2 * q2; + d3 += q3 * q3; + } + + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + } // namespace faiss #endif diff --git a/src/simd/distances_avx.h b/src/simd/distances_avx.h index 4934d9d30..c89b964d6 100644 --- a/src/simd/distances_avx.h +++ b/src/simd/distances_avx.h @@ -33,6 +33,17 @@ fvec_L1_avx(const float* x, const float* y, size_t d); float fvec_Linf_avx(const float* x, const float* y, size_t d); +void +fvec_madd_avx(size_t n, const float* a, float bf, const float* b, float* c); + +void +fvec_inner_product_batch_4_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3); + +void +fvec_L2sqr_batch_4_avx(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3); + } // namespace faiss #endif /* DISTANCES_AVX_H */ diff --git a/src/simd/distances_avx512.cc b/src/simd/distances_avx512.cc index 3855f173e..bbf1ce80c 100644 --- a/src/simd/distances_avx512.cc +++ b/src/simd/distances_avx512.cc @@ -18,6 +18,8 @@ #include #include +#include 
"faiss/impl/platform_macros.h" + namespace faiss { // reads 0 <= d < 4 floats as __m128 @@ -37,107 +39,34 @@ masked_read(int d, const float* x) { // cannot use AVX2 _mm_mask_set1_epi32 } -extern uint8_t lookup8bit[256]; - +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_inner_product_avx512(const float* x, const float* y, size_t d) { - __m512 msum0 = _mm512_setzero_ps(); - - while (d >= 16) { - __m512 mx = _mm512_loadu_ps(x); - x += 16; - __m512 my = _mm512_loadu_ps(y); - y += 16; - msum0 = _mm512_add_ps(msum0, _mm512_mul_ps(mx, my)); - d -= 16; - } - - __m256 msum1 = _mm512_extractf32x8_ps(msum0, 1); - msum1 += _mm512_extractf32x8_ps(msum0, 0); - - if (d >= 8) { - __m256 mx = _mm256_loadu_ps(x); - x += 8; - __m256 my = _mm256_loadu_ps(y); - y += 8; - msum1 = _mm256_add_ps(msum1, _mm256_mul_ps(mx, my)); - d -= 8; + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + res += x[i] * y[i]; } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 += _mm256_extractf128_ps(msum1, 0); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - msum2 = _mm_add_ps(msum2, _mm_mul_ps(mx, my)); - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - msum2 = _mm_add_ps(msum2, _mm_mul_ps(mx, my)); - } - - msum2 = _mm_hadd_ps(msum2, msum2); - msum2 = _mm_hadd_ps(msum2, msum2); - return _mm_cvtss_f32(msum2); + return res; } +FAISS_PRAGMA_IMPRECISE_FUNCTION_END +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN float fvec_L2sqr_avx512(const float* x, const float* y, size_t d) { - __m512 msum0 = _mm512_setzero_ps(); - - while (d >= 16) { - __m512 mx = _mm512_loadu_ps(x); - x += 16; - __m512 my = _mm512_loadu_ps(y); - y += 16; - const __m512 a_m_b1 = mx - my; - msum0 += a_m_b1 * a_m_b1; - d -= 16; - } - - __m256 msum1 = _mm512_extractf32x8_ps(msum0, 1); - msum1 += _mm512_extractf32x8_ps(msum0, 0); - - if (d >= 8) { - __m256 mx = _mm256_loadu_ps(x); - x += 8; - __m256 my = _mm256_loadu_ps(y); - y += 8; - const __m256 a_m_b1 = mx - my; - msum1 += a_m_b1 * a_m_b1; - d -= 8; - } - - __m128 msum2 = _mm256_extractf128_ps(msum1, 1); - msum2 += _mm256_extractf128_ps(msum1, 0); - - if (d >= 4) { - __m128 mx = _mm_loadu_ps(x); - x += 4; - __m128 my = _mm_loadu_ps(y); - y += 4; - const __m128 a_m_b1 = mx - my; - msum2 += a_m_b1 * a_m_b1; - d -= 4; - } - - if (d > 0) { - __m128 mx = masked_read(d, x); - __m128 my = masked_read(d, y); - __m128 a_m_b1 = mx - my; - msum2 += a_m_b1 * a_m_b1; - } - - msum2 = _mm_hadd_ps(msum2, msum2); - msum2 = _mm_hadd_ps(msum2, msum2); - return _mm_cvtss_f32(msum2); + size_t i; + float res = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (i = 0; i < d; i++) { + const float tmp = x[i] - y[i]; + res += tmp * tmp; + } + return res; } +FAISS_PRAGMA_IMPRECISE_FUNCTION_END float fvec_L1_avx512(const float* x, const float* y, size_t d) { @@ -249,6 +178,70 @@ fvec_Linf_avx512(const float* x, const float* y, size_t d) { return _mm_cvtss_f32(msum2); } +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +void +fvec_madd_avx512(size_t n, const float* a, float bf, const float* b, float* c) { + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < n; i++) { + c[i] = a[i] + bf * b[i]; + } +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +void +fvec_inner_product_batch_4_avx512(const float* 
__restrict x, const float* __restrict y0, const float* __restrict y1, + const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0, + float& dis1, float& dis2, float& dis3) { + float d0 = 0; + float d1 = 0; + float d2 = 0; + float d3 = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < d; ++i) { + d0 += x[i] * y0[i]; + d1 += x[i] * y1[i]; + d2 += x[i] * y2[i]; + d3 += x[i] * y3[i]; + } + + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + +// trust the compiler to unroll this properly +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +void +fvec_L2sqr_batch_4_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) { + float d0 = 0; + float d1 = 0; + float d2 = 0; + float d3 = 0; + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < d; ++i) { + const float q0 = x[i] - y0[i]; + const float q1 = x[i] - y1[i]; + const float q2 = x[i] - y2[i]; + const float q3 = x[i] - y3[i]; + d0 += q0 * q0; + d1 += q1 * q1; + d2 += q2 * q2; + d3 += q3 * q3; + } + + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; +} +FAISS_PRAGMA_IMPRECISE_FUNCTION_END + } // namespace faiss #endif diff --git a/src/simd/distances_avx512.h b/src/simd/distances_avx512.h index cf350b5e0..bfde69af1 100644 --- a/src/simd/distances_avx512.h +++ b/src/simd/distances_avx512.h @@ -32,6 +32,17 @@ fvec_L1_avx512(const float* x, const float* y, size_t d); float fvec_Linf_avx512(const float* x, const float* y, size_t d); +void +fvec_madd_avx512(size_t n, const float* a, float bf, const float* b, float* c); + +void +fvec_inner_product_batch_4_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3); + +void +fvec_L2sqr_batch_4_avx512(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3); + } // namespace faiss #endif /* DISTANCES_AVX512_H */ diff --git a/src/simd/distances_ref.cc b/src/simd/distances_ref.cc index 87586a5dc..84fb049a9 100644 --- a/src/simd/distances_ref.cc +++ b/src/simd/distances_ref.cc @@ -10,6 +10,7 @@ #include "distances_ref.h" #include + namespace faiss { float @@ -48,7 +49,9 @@ float fvec_inner_product_ref(const float* x, const float* y, size_t d) { size_t i; float res = 0; - for (i = 0; i < d; i++) res += x[i] * y[i]; + for (i = 0; i < d; i++) { + res += x[i] * y[i]; + } return res; } @@ -56,7 +59,9 @@ float fvec_norm_L2sqr_ref(const float* x, size_t d) { size_t i; double res = 0; - for (i = 0; i < d; i++) res += x[i] * x[i]; + for (i = 0; i < d; i++) { + res += x[i] * x[i]; + } return res; } @@ -76,9 +81,75 @@ fvec_inner_products_ny_ref(float* ip, const float* x, const float* y, size_t d, } } +/// compute ny square L2 distance between x and a set of transposed contiguous +/// y vectors. 
squared lengths of y should be provided as well +void +fvec_L2sqr_ny_transposed_ref(float* __restrict dis, const float* __restrict x, const float* __restrict y, + const float* __restrict y_sqlen, size_t d, size_t d_offset, size_t ny) { + float x_sqlen = 0; + for (size_t j = 0; j < d; j++) { + x_sqlen += x[j] * x[j]; + } + + for (size_t i = 0; i < ny; i++) { + float dp = 0; + for (size_t j = 0; j < d; j++) { + dp += x[j] * y[i + j * d_offset]; + } + + dis[i] = x_sqlen + y_sqlen[i] - 2 * dp; + } +} + +/// compute ny square L2 distance between x and a set of contiguous y vectors +/// and return the index of the nearest vector. +/// return 0 if ny == 0. +size_t +fvec_L2sqr_ny_nearest_ref(float* __restrict distances_tmp_buffer, const float* __restrict x, const float* __restrict y, + size_t d, size_t ny) { + fvec_L2sqr_ny_ref(distances_tmp_buffer, x, y, d, ny); + + size_t nearest_idx = 0; + float min_dis = HUGE_VALF; + + for (size_t i = 0; i < ny; i++) { + if (distances_tmp_buffer[i] < min_dis) { + min_dis = distances_tmp_buffer[i]; + nearest_idx = i; + } + } + + return nearest_idx; +} + +/// compute ny square L2 distance between x and a set of transposed contiguous +/// y vectors and return the index of the nearest vector. +/// squared lengths of y should be provided as well +/// return 0 if ny == 0. +size_t +fvec_L2sqr_ny_nearest_y_transposed_ref(float* __restrict distances_tmp_buffer, const float* __restrict x, + const float* __restrict y, const float* __restrict y_sqlen, size_t d, + size_t d_offset, size_t ny) { + fvec_L2sqr_ny_transposed_ref(distances_tmp_buffer, x, y, y_sqlen, d, d_offset, ny); + + size_t nearest_idx = 0; + float min_dis = HUGE_VALF; + + for (size_t i = 0; i < ny; i++) { + if (distances_tmp_buffer[i] < min_dis) { + min_dis = distances_tmp_buffer[i]; + nearest_idx = i; + } + } + + return nearest_idx; +} + void fvec_madd_ref(size_t n, const float* a, float bf, const float* b, float* c) { - for (size_t i = 0; i < n; i++) c[i] = a[i] + bf * b[i]; + for (size_t i = 0; i < n; i++) { + c[i] = a[i] + bf * b[i]; + } } int @@ -96,4 +167,49 @@ fvec_madd_and_argmin_ref(size_t n, const float* a, float bf, const float* b, flo return imin; } +void +fvec_inner_product_batch_4_ref(const float* __restrict x, const float* __restrict y0, const float* __restrict y1, + const float* __restrict y2, const float* __restrict y3, const size_t d, float& dis0, + float& dis1, float& dis2, float& dis3) { + float d0 = 0; + float d1 = 0; + float d2 = 0; + float d3 = 0; + for (size_t i = 0; i < d; ++i) { + d0 += x[i] * y0[i]; + d1 += x[i] * y1[i]; + d2 += x[i] * y2[i]; + d3 += x[i] * y3[i]; + } + + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; +} + +void +fvec_L2sqr_batch_4_ref(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3) { + float d0 = 0; + float d1 = 0; + float d2 = 0; + float d3 = 0; + for (size_t i = 0; i < d; ++i) { + const float q0 = x[i] - y0[i]; + const float q1 = x[i] - y1[i]; + const float q2 = x[i] - y2[i]; + const float q3 = x[i] - y3[i]; + d0 += q0 * q0; + d1 += q1 * q1; + d2 += q2 * q2; + d3 += q3 * q3; + } + + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; +} + } // namespace faiss diff --git a/src/simd/distances_ref.h b/src/simd/distances_ref.h index fbd87640b..fefb04999 100644 --- a/src/simd/distances_ref.h +++ b/src/simd/distances_ref.h @@ -33,12 +33,44 @@ fvec_L2sqr_ny_ref(float* dis, const float* x, const float* y, size_t d, size_t n void fvec_inner_products_ny_ref(float* 
ip, const float* x, const float* y, size_t d, size_t ny); +/// compute ny square L2 distance between x and a set of transposed contiguous +/// y vectors. squared lengths of y should be provided as well +void +fvec_L2sqr_ny_transposed_ref(float* dis, const float* x, const float* y, const float* y_sqlen, size_t d, + size_t d_offset, size_t ny); + +/// compute ny square L2 distance between x and a set of contiguous y vectors +/// and return the index of the nearest vector. +/// return 0 if ny == 0. +size_t +fvec_L2sqr_ny_nearest_ref(float* distances_tmp_buffer, const float* x, const float* y, size_t d, size_t ny); + +/// compute ny square L2 distance between x and a set of transposed contiguous +/// y vectors and return the index of the nearest vector. +/// squared lengths of y should be provided as well +/// return 0 if ny == 0. +size_t +fvec_L2sqr_ny_nearest_y_transposed_ref(float* distances_tmp_buffer, const float* x, const float* y, + const float* y_sqlen, size_t d, size_t d_offset, size_t ny); + void fvec_madd_ref(size_t n, const float* a, float bf, const float* b, float* c); int fvec_madd_and_argmin_ref(size_t n, const float* a, float bf, const float* b, float* c); +/// Special version of inner product that computes 4 distances +/// between x and yi, which is performance oriented. +void +fvec_inner_product_batch_4_ref(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3); + +/// Special version of L2sqr that computes 4 distances +/// between x and yi, which is performance oriented. +void +fvec_L2sqr_batch_4_ref(const float* x, const float* y0, const float* y1, const float* y2, const float* y3, + const size_t d, float& dis0, float& dis1, float& dis2, float& dis3); + } // namespace faiss #endif /* DISTANCES_REF_H */ diff --git a/src/simd/hook.cc b/src/simd/hook.cc index f8132b4e4..eedf1b086 100644 --- a/src/simd/hook.cc +++ b/src/simd/hook.cc @@ -51,6 +51,14 @@ decltype(fvec_inner_products_ny) fvec_inner_products_ny = fvec_inner_products_ny decltype(fvec_madd) fvec_madd = fvec_madd_ref; decltype(fvec_madd_and_argmin) fvec_madd_and_argmin = fvec_madd_and_argmin_ref; +decltype(fvec_L2sqr_ny_nearest) fvec_L2sqr_ny_nearest = fvec_L2sqr_ny_nearest_ref; +decltype(fvec_L2sqr_ny_nearest_y_transposed) fvec_L2sqr_ny_nearest_y_transposed = + fvec_L2sqr_ny_nearest_y_transposed_ref; +decltype(fvec_L2sqr_ny_transposed) fvec_L2sqr_ny_transposed = fvec_L2sqr_ny_transposed_ref; + +decltype(fvec_inner_product_batch_4) fvec_inner_product_batch_4 = fvec_inner_product_batch_4_ref; +decltype(fvec_L2sqr_batch_4) fvec_L2sqr_batch_4 = fvec_L2sqr_batch_4_ref; + #if defined(__x86_64__) bool cpu_support_avx512() { @@ -85,9 +93,12 @@ fvec_hook(std::string& simd_type) { fvec_norm_L2sqr = fvec_norm_L2sqr_sse; fvec_L2sqr_ny = fvec_L2sqr_ny_sse; fvec_inner_products_ny = fvec_inner_products_ny_sse; - fvec_madd = fvec_madd_sse; + fvec_madd = fvec_madd_avx512; fvec_madd_and_argmin = fvec_madd_and_argmin_sse; + fvec_inner_product_batch_4 = fvec_inner_product_batch_4_avx512; + fvec_L2sqr_batch_4 = fvec_L2sqr_batch_4_avx512; + simd_type = "AVX512"; support_pq_fast_scan = true; } else if (use_avx2 && cpu_support_avx2()) { @@ -99,9 +110,12 @@ fvec_hook(std::string& simd_type) { fvec_norm_L2sqr = fvec_norm_L2sqr_sse; fvec_L2sqr_ny = fvec_L2sqr_ny_sse; fvec_inner_products_ny = fvec_inner_products_ny_sse; - fvec_madd = fvec_madd_sse; + fvec_madd = fvec_madd_avx; fvec_madd_and_argmin = fvec_madd_and_argmin_sse; + 
fvec_inner_product_batch_4 = fvec_inner_product_batch_4_avx; + fvec_L2sqr_batch_4 = fvec_L2sqr_batch_4_avx; + simd_type = "AVX2"; support_pq_fast_scan = true; } else if (use_sse4_2 && cpu_support_sse4_2()) { @@ -116,6 +130,9 @@ fvec_hook(std::string& simd_type) { fvec_madd = fvec_madd_sse; fvec_madd_and_argmin = fvec_madd_and_argmin_sse; + fvec_inner_product_batch_4 = fvec_inner_product_batch_4_ref; + fvec_L2sqr_batch_4 = fvec_L2sqr_batch_4_ref; + simd_type = "SSE4_2"; support_pq_fast_scan = false; } else { @@ -130,6 +147,9 @@ fvec_hook(std::string& simd_type) { fvec_madd = fvec_madd_ref; fvec_madd_and_argmin = fvec_madd_and_argmin_ref; + fvec_inner_product_batch_4 = fvec_inner_product_batch_4_ref; + fvec_L2sqr_batch_4 = fvec_L2sqr_batch_4_ref; + simd_type = "GENERIC"; support_pq_fast_scan = false; } diff --git a/src/simd/hook.h b/src/simd/hook.h index 5b2c93277..07763ea8d 100644 --- a/src/simd/hook.h +++ b/src/simd/hook.h @@ -15,16 +15,61 @@ #include namespace faiss { +/// inner product extern float (*fvec_inner_product)(const float*, const float*, size_t); + +/// Squared L2 distance between two vectors extern float (*fvec_L2sqr)(const float*, const float*, size_t); + +/// L1 distance extern float (*fvec_L1)(const float*, const float*, size_t); + +/// infinity distance extern float (*fvec_Linf)(const float*, const float*, size_t); + +/// squared norm of a vector extern float (*fvec_norm_L2sqr)(const float*, size_t); + +/// compute ny square L2 distance between x and a set of contiguous y vectors extern void (*fvec_L2sqr_ny)(float*, const float*, const float*, size_t, size_t); + +/// compute the inner product between nx vectors x and one y extern void (*fvec_inner_products_ny)(float*, const float*, const float*, size_t, size_t); + +/// compute ny square L2 distance between x and a set of transposed contiguous +/// y vectors. squared lengths of y should be provided as well +/// todo aguzhva: bring non-ref versions +extern void (*fvec_L2sqr_ny_transposed)(float*, const float*, const float*, const float*, size_t, size_t, size_t); + +/// compute ny square L2 distance between x and a set of contiguous y vectors +/// and return the index of the nearest vector. +/// return 0 if ny == 0. +/// todo aguzhva: bring non-ref versions +extern size_t (*fvec_L2sqr_ny_nearest)(float*, const float*, const float*, size_t, size_t); + +/// compute ny square L2 distance between x and a set of transposed contiguous +/// y vectors and return the index of the nearest vector. +/// squared lengths of y should be provided as well +/// return 0 if ny == 0. +/// todo aguzhva: bring non-ref versions +extern size_t (*fvec_L2sqr_ny_nearest_y_transposed)(float*, const float*, const float*, const float*, size_t, size_t, + size_t); + extern void (*fvec_madd)(size_t, const float*, float, const float*, float*); extern int (*fvec_madd_and_argmin)(size_t, const float*, float, const float*, float*); +/// Special version of inner product that computes 4 distances +/// between x and yi, which is performance oriented. +/// todo aguzhva: bring non-ref versions +extern void (*fvec_inner_product_batch_4)(const float*, const float*, const float*, const float*, const float*, + const size_t, float&, float&, float&, float&); + +/// Special version of L2sqr that computes 4 distances +/// between x and yi, which is performance oriented. 
+/// todo aguzhva: bring non-ref versions +extern void (*fvec_L2sqr_batch_4)(const float*, const float*, const float*, const float*, const float*, const size_t, + float&, float&, float&, float&); + #if defined(__x86_64__) extern bool use_avx512; extern bool use_avx2; diff --git a/tests/faiss/CMakeLists.txt b/tests/faiss/CMakeLists.txt new file mode 100644 index 000000000..fa391844a --- /dev/null +++ b/tests/faiss/CMakeLists.txt @@ -0,0 +1,67 @@ +set(FAISS_TEST_SRCS + ../../thirdparty/faiss/tests/test_omp_threads.cpp + ../../thirdparty/faiss/tests/test_heap.cpp + ../../thirdparty/faiss/tests/test_partitioning.cpp + ../../thirdparty/faiss/tests/test_simdlib.cpp + ../../thirdparty/faiss/tests/test_approx_topk.cpp + ../../thirdparty/faiss/tests/test_code_distance.cpp + ../../thirdparty/faiss/tests/test_cppcontrib_uintreader.cpp + ../../thirdparty/faiss/tests/test_distances_simd.cpp + ../../thirdparty/faiss/tests/test_hnsw.cpp + ../../thirdparty/faiss/tests/test_binary_flat.cpp + ../../thirdparty/faiss/tests/test_cppcontrib_sa_decode.cpp + ../../thirdparty/faiss/tests/test_ondisk_ivf.cpp + ../../thirdparty/faiss/tests/test_pq_encoding.cpp + ../../thirdparty/faiss/tests/test_RCQ_cropping.cpp + ../../thirdparty/faiss/tests/test_ivfpq_codec.cpp + ../../thirdparty/faiss/tests/test_ivfpq_indexing.cpp + ../../thirdparty/faiss/tests/test_merge.cpp + ../../thirdparty/faiss/tests/test_threaded_index.cpp + ../../thirdparty/faiss/tests/test_pairs_decoding.cpp + ../../thirdparty/faiss/tests/test_dealloc_invlists.cpp + ../../thirdparty/faiss/tests/test_params_override.cpp + ../../thirdparty/faiss/tests/test_sliding_ivf.cpp + ../../thirdparty/faiss/tests/test_transfer_invlists.cpp + ../../thirdparty/faiss/tests/test_distances_if.cpp +) + +find_package(GTest REQUIRED) + +if(__X86_64) + add_executable(faiss_tests ${FAISS_TEST_SRCS}) + + add_dependencies(faiss_tests faiss) + target_compile_options( + faiss_tests + PRIVATE $<$: + -msse4.2 + -mavx2 + -mfma + -mf16c + -Wno-sign-compare + -Wno-unused-variable + -Wno-reorder + -Wno-unused-local-typedefs + -Wno-unused-function + -Wno-strict-aliasing>) + target_link_libraries( + faiss_tests PUBLIC faiss knowhere GTest::gtest GTest::gtest_main) + target_compile_definitions(faiss_tests PRIVATE FINTEGER=int) +endif() + +if(__AARCH64) + add_executable(faiss_tests ${FAISS_TEST_SRCS}) + target_compile_options( + faiss_tests + PRIVATE $<$: + -Wno-sign-compare + -Wno-unused-variable + -Wno-reorder + -Wno-unused-local-typedefs + -Wno-unused-function + -Wno-strict-aliasing>) + + add_dependencies(faiss_tests faiss) + target_link_libraries(faiss_tests PUBLIC faiss knowhere GTest::gtest GTest::gtest_main) + target_compile_definitions(faiss_tests PRIVATE FINTEGER=int) +endif() diff --git a/tests/faiss_isolated/cmake/utils/platform_check.cmake b/tests/faiss_isolated/cmake/utils/platform_check.cmake new file mode 100644 index 000000000..d713a2d44 --- /dev/null +++ b/tests/faiss_isolated/cmake/utils/platform_check.cmake @@ -0,0 +1,12 @@ +include(CheckSymbolExists) + +macro(detect_target_arch) + check_symbol_exists(__aarch64__ "" __AARCH64) + check_symbol_exists(__x86_64__ "" __X86_64) + + if(NOT __AARCH64 AND NOT __X86_64) + message(FATAL "knowhere only support amd64 and arm64.") + endif() +endmacro() + +detect_target_arch() diff --git a/tests/faiss_isolated/cmake/utils/utils.cmake b/tests/faiss_isolated/cmake/utils/utils.cmake new file mode 100644 index 000000000..ff7c01e22 --- /dev/null +++ b/tests/faiss_isolated/cmake/utils/utils.cmake @@ -0,0 +1,60 @@ +macro(__knowhere_option 
variable description value) + if(NOT DEFINED ${variable}) + set(${variable} + ${value} + CACHE STRING ${description}) + endif() +endmacro() + +set(KNOWHERE_ALL_OPTIONS) + +macro(knowhere_option variable description value) + set(__value ${value}) + set(__condition "") + set(__varname "__value") + list(APPEND knowhere_ALL_OPTIONS ${variable}) + + foreach(arg ${ARGN}) + if(arg STREQUAL "IF" OR arg STREQUAL "if") + set(__varname "__condition") + else() + list(APPEND ${__varname} ${arg}) + endif() + endforeach() + + unset(__varname) + + if("${__condition}" STREQUAL "") + set(__condition 2 GREATER 1) + endif() + + if(${__condition}) + if("${__value}" MATCHES ";") + if(${__value}) + __knowhere_option(${variable} "${description}" ON) + else() + __knowhere_option(${variable} "${description}" OFF) + endif() + elseif(DEFINED ${__value}) + if(${__value}) + __knowhere_option(${variable} "${description}" ON) + else() + __knowhere_option(${variable} "${description}" OFF) + endif() + else() + __knowhere_option(${variable} "${description}" "${__value}") + endif() + else() + unset(${variable} CACHE) + endif() +endmacro() + +if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") + macro(knowhere_file_glob glob variable) + file(${glob} ${variable} CONFIGURE_DEPENDS ${ARGN}) + endmacro() +else() + macro(knowhere_file_glob) + file(${glob} ${variable} ${ARGN}) + endmacro() +endif() diff --git a/tests/ut/test_ivfflat_cc.cc b/tests/ut/test_ivfflat_cc.cc index 0de8c9771..442dd1383 100644 --- a/tests/ut/test_ivfflat_cc.cc +++ b/tests/ut/test_ivfflat_cc.cc @@ -78,7 +78,7 @@ TEST_CASE("Test Build Search Concurrency", "[Concurrency]") { for (size_t i = 0; i < nlist; i++) { std::mt19937_64 rng(i); int64_t add_size = distribution(rng); - std::vector ids(add_size, i); + std::vector ids(add_size, i); float value = i; std::vector codes(add_size * dim, value); std::vector code_normals = knowhere::NormalizeVecs(codes.data(), add_size, dim); @@ -95,7 +95,7 @@ TEST_CASE("Test Build Search Concurrency", "[Concurrency]") { for (size_t i = 0; i < nlist; i++) { std::mt19937_64 rng(i * i); int64_t add_size = distribution(rng); - std::vector ids(add_size, i); + std::vector ids(add_size, i); float value = i; std::vector codes(add_size * dim, value); std::vector code_normals = knowhere::NormalizeVecs(codes.data(), add_size, dim); diff --git a/thirdparty/faiss/.circleci/Dockerfile.cpu b/thirdparty/faiss/.circleci/Dockerfile.cpu deleted file mode 100644 index a39edad1c..000000000 --- a/thirdparty/faiss/.circleci/Dockerfile.cpu +++ /dev/null @@ -1,11 +0,0 @@ -FROM cimg/base:stable-20.04 - -# Install python3, swig, and MKL. -RUN sudo apt-get update && \ -sudo apt-get install -y python3-dev python3-pip swig libmkl-dev - -# Install recent CMake. -RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz | sudo tar xzf - --strip-components=1 -C /usr - -# Install numpy/scipy/pytorch for python tests. -RUN pip3 install numpy scipy torch diff --git a/thirdparty/faiss/.circleci/Dockerfile.faiss_gpu b/thirdparty/faiss/.circleci/Dockerfile.faiss_gpu deleted file mode 100644 index a82960ff6..000000000 --- a/thirdparty/faiss/.circleci/Dockerfile.faiss_gpu +++ /dev/null @@ -1,28 +0,0 @@ -FROM nvidia/cuda:10.2-devel-ubuntu18.04 - -# Install python3, wget, and openblas. -RUN apt-get update && \ - apt-get install -y python3-dev python3-pip libopenblas-dev wget libpcre3-dev - -# Install swig 4.0.2. 
-RUN wget -nv -O - https://sourceforge.net/projects/swig/files/swig/swig-4.0.2/swig-4.0.2.tar.gz/download | tar zxf - && cd swig-4.0.2 && ./configure && make -j && make install - -# Install recent CMake. -RUN wget -nv -O - https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz | tar xzf - --strip-components=1 -C /usr - -# Install numpy/scipy/pytorch for python tests. -RUN pip3 install numpy scipy torch - -COPY . /faiss - -WORKDIR /faiss - -RUN cmake -B build \ - -DFAISS_ENABLE_GPU=ON \ - -DFAISS_ENABLE_C_API=ON \ - -DFAISS_ENABLE_PYTHON=ON \ - -DBUILD_TESTING=ON \ - -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ - . - -RUN make -C build -j8 diff --git a/thirdparty/faiss/.circleci/config.yml b/thirdparty/faiss/.circleci/config.yml index 4d440c5b1..e105d7914 100644 --- a/thirdparty/faiss/.circleci/config.yml +++ b/thirdparty/faiss/.circleci/config.yml @@ -1,12 +1,38 @@ version: 2.1 -orbs: - win: circleci/windows@2.4.0 +executors: + linux-x86_64-cpu: + docker: + - image: continuumio/miniconda3 + resource_class: large + linux-x86_64-gpu: + environment: + CONDA_ARCH: Linux-x86_64 + machine: + image: linux-cuda-11:2023.02.1 + resource_class: gpu.nvidia.medium + linux-arm64-cpu: + environment: + CONDA_ARCH: Linux-aarch64 + machine: + image: ubuntu-2004:current + resource_class: arm.medium + macosx-arm64-cpu: + environment: + CONDA_ARCH: MacOSX-arm64 + macos: + xcode: 14.2.0 # minimum supported for M1 + resource_class: macos.m1.large.gen1 + windows-x86_64-cpu: + machine: + image: windows-server-2019-vs2019:stable + shell: bash.exe + resource_class: windows.medium jobs: format: docker: - - image: ubuntu:20.04 + - image: ubuntu:22.04 steps: - checkout - run: @@ -27,317 +53,253 @@ jobs: exit 1 fi - build_linux: + build_conda: parameters: - opt_level: + label: type: string - default: generic - resource_class: + default: "" + cuda: type: string - default: medium - docker: - - image: beauby/faiss-circleci:cpu - resource_class: << parameters.resource_class >> + default: "" + raft: + type: string + default: "" + cuda_archs: + type: string + default: "" + compiler_version: + type: string + default: "" + exec: + type: executor + executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 - MKL_THREADING_LAYER: GNU + PACKAGE_TYPE: <> + CUDA_ARCHS: <> steps: - checkout - run: - name: Build faiss library + name: Install conda command: | - cmake -B build -DBUILD_TESTING=ON -DFAISS_ENABLE_GPU=OFF \ - -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ - -DFAISS_ENABLE_C_API=ON \ - -DCMAKE_BUILD_TYPE=Release -DBLA_VENDOR=Intel10_64_dyn . 
- make -k -C build -j3 faiss + if [ -n "${CONDA_ARCH}" ] + then + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + ~/miniconda/bin/conda init + fi + - run: + name: Install conda build tools + command: | + conda config --set solver libmamba + # conda config --set verbosity 3 + conda update -y -q conda + conda install -y -q conda-build + - when: + condition: << parameters.label >> + steps: + - run: + name: Enable anaconda uploads + command: | + conda install -y -q anaconda-client + conda config --set anaconda_upload yes - when: condition: - equal: [ "avx2", << parameters.opt_level >> ] + and: + - not: << parameters.label >> + - not: << parameters.cuda >> steps: - run: - name: Build faiss_avx2 library - command: make -k -C build -j3 faiss_avx2 swigfaiss_avx2 - - run: - name: Test faiss library - command: | - make -C build -j3 faiss_test - export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" - make -C build test - - run: - name: Build python extension - command: | - make -C build -j3 swigfaiss - cd build/faiss/python - python3 setup.py build - - run: - name: Test python extension - command: | - pip3 install pytest - export PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - store_test_results: - path: test-results - - run: - name: Build C API - command: | - make -k -C build -j faiss_c - - build_linux_conda: - docker: - - image: continuumio/miniconda3 - steps: - - checkout - - run: - name: Conda build - command: | - conda install -y -q conda-build - cd conda - conda build faiss --python 3.7 -c pytorch - - build_osx: - macos: - xcode: 12.4.0 - environment: - OMP_NUM_THREADS: 10 - steps: - - checkout - - run: - name: Install Homebrew packages - command: | - brew install cmake swig libomp - - run: - name: Install numpy/scipy - command: | - pip3 install numpy scipy - - run: - name: Generate Makefiles - command: | - cmake -B build -DBUILD_TESTING=ON -DFAISS_ENABLE_GPU=OFF \ - -DCMAKE_BUILD_TYPE=Release -DFAISS_ENABLE_C_API=ON \ - -DPython_EXECUTABLE=/usr/local/bin/python3 . 
- - run: - name: Build faiss library - command: | - make -k -C build -j faiss - - run: - name: Test faiss library - command: | - make -C build -j faiss_test - export GTEST_OUTPUT="xml:$PWD/test-results/googletest/" - make -C build test - - run: - name: Build python extension - command: | - make -C build -j swigfaiss - cd build/faiss/python - python3 setup.py build - - run: - name: Test python extension - command: | - pip3 install pytest torch - export PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - store_test_results: - path: test-results - - run: - name: Build C API - command: | - make -k -C build -j faiss_c - - build_windows: - executor: - name: win/default - shell: bash.exe - steps: - - checkout - - run: - name: Build/test - command: | - conda install conda-build - cd conda - conda build faiss --python 3.7 -c pytorch + name: Conda build (CPU) + no_output_timeout: 30m + command: | + cd conda + conda build faiss --python 3.11 -c pytorch + - when: + condition: + and: + - << parameters.label >> + - not: << parameters.cuda >> + steps: + - run: + name: Conda build (CPU) w/ anaconda upload + no_output_timeout: 30m + command: | + cd conda + conda build faiss --user pytorch --label <> -c pytorch + - when: + condition: + and: + - not: << parameters.label >> + - << parameters.cuda >> + - not: << parameters.raft >> + steps: + - run: + name: Conda build (GPU) + no_output_timeout: 60m + command: | + sudo update-alternatives --set cuda /usr/local/cuda-<> + cd conda + conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ + -c pytorch -c nvidia + - when: + condition: + and: + - << parameters.label >> + - << parameters.cuda >> + - not: << parameters.raft >> + steps: + - run: + name: Conda build (GPU) w/ anaconda upload + no_output_timeout: 60m + command: | + sudo update-alternatives --set cuda /usr/local/cuda-<> + cd conda + conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ + --user pytorch --label <> -c pytorch -c nvidia + - when: + condition: + and: + - not: << parameters.label >> + - << parameters.cuda >> + - << parameters.raft >> + steps: + - run: + name: Conda build (GPU w/ RAFT) + no_output_timeout: 60m + command: | + sudo update-alternatives --set cuda /usr/local/cuda-<> + cd conda + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ + -c pytorch -c nvidia -c rapidsai -c conda-forge + - when: + condition: + and: + - << parameters.label >> + - << parameters.cuda >> + - << parameters.raft >> + steps: + - run: + name: Conda build (GPU w/ RAFT) w/ anaconda upload + no_output_timeout: 60m + command: | + sudo update-alternatives --set cuda /usr/local/cuda-<> + cd conda + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ + --user pytorch --label <> -c pytorch -c nvidia -c rapidsai -c conda-forge - build_arm: - machine: - image: ubuntu-2004:202101-01 - resource_class: arm.medium + build_cmake: parameters: + exec: + type: executor opt_level: type: string default: generic + gpu: + type: string + default: "OFF" + raft: + type: string + default: "OFF" + executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 - CONDA_HOME: /home/circleci/miniconda3 - PYTHON: 
/home/circleci/miniconda3/bin/python + MKL_THREADING_LAYER: GNU steps: - checkout - run: - name: Install dependencies + name: Install conda command: | - sudo apt-get update && sudo apt-get install -y swig - wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.9.2-Linux-aarch64.sh - bash Miniconda3-py39_4.9.2-Linux-aarch64.sh -b -p $CONDA_HOME - pip3 install cmake - $CONDA_HOME/bin/conda install -y numpy scipy - $CONDA_HOME/bin/pip install pytest torch + if [ -n "${CONDA_ARCH}" ] + then + curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh + bash miniconda.sh -b -p $HOME/miniconda + ~/miniconda/bin/conda init + fi + - when: + condition: + equal: [ "ON", << parameters.gpu >> ] + steps: + - run: + name: Configure CUDA + command: sudo update-alternatives --set cuda /usr/local/cuda-11.4 - run: - name: Build faiss library + name: Set up environment command: | - cmake -B build -DBUILD_TESTING=ON -DFAISS_ENABLE_GPU=OFF \ + conda config --set solver libmamba + conda update -y -q conda + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 + - when: + condition: + equal: [ "ON", << parameters.raft >> ] + steps: + - run: + name: Install libraft + command: | + conda install -y -q libraft cudatoolkit=11.4 -c rapidsai-nightly -c nvidia -c pkgs/main -c conda-forge + - run: + name: Build all targets + no_output_timeout: 30m + command: | + eval "$(conda shell.bash hook)" + conda activate + cmake -B build \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DFAISS_ENABLE_GPU=<< parameters.gpu >> \ + -DFAISS_ENABLE_RAFT=<< parameters.raft >> \ -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ -DFAISS_ENABLE_C_API=ON \ + -DPYTHON_EXECUTABLE=$(which python) \ -DCMAKE_BUILD_TYPE=Release \ - -DPython_EXECUTABLE=$PYTHON . - make -k -C build -j3 faiss + -DBLA_VENDOR=Intel10_64_dyn \ + -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ + . 
+ make -k -C build -j$(nproc) - run: - name: Test faiss library + name: C++ tests command: | - make -C build -j3 faiss_test export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" make -C build test - run: - name: Build python extension + name: Install Python extension command: | - make -C build -j3 swigfaiss cd build/faiss/python - $PYTHON setup.py build - - run: - name: Test python extension - command: | - export PYTHONPATH="$(ls -d ./build/faiss/python/build/lib*/)" - $PYTHON -c "import faiss; assert 'NEON' in faiss.get_compile_options()" - $PYTHON -m pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - $PYTHON -m pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + python setup.py install + - when: + condition: + equal: [ "OFF", << parameters.gpu >> ] + steps: + - run: + name: Python tests (CPU only) + command: | + conda install -y -q pytorch -c pytorch + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + - when: + condition: + equal: [ "ON", << parameters.gpu >> ] + steps: + - run: + name: Python tests (CPU + GPU) + command: | + conda install -y -q pytorch pytorch-cuda -c pytorch -c nvidia + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + cp tests/common_faiss_tests.py faiss/gpu/test + pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py + pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py + - when: + condition: + equal: [ "avx2", << parameters.opt_level >> ] + steps: + - run: + name: Test avx2 loading + command: | + FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss.so + LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss_avx2.so - store_test_results: path: test-results - - run: - name: Build C API - command: | - make -k -C build -j faiss_c - - build_linux_gpu: - machine: - resource_class: gpu.nvidia.medium - image: ubuntu-2004-cuda-11.4:202110-01 - docker_layer_caching: true - steps: - - checkout - - run: - name: Build/test - command: | - docker build -t faiss -f .circleci/Dockerfile.faiss_gpu . 
- docker run --gpus all faiss make -C build test - docker run --gpus all faiss sh -c '(pwd; find)' - docker run --gpus all faiss sh -c '(cd build/faiss/python; python3 setup.py install) && cp tests/common_faiss_tests.py faiss/gpu/test && python3 -m unittest discover -s faiss/gpu/test -p "test_*"' - docker run --gpus all faiss sh -c '(cd build/faiss/python; python3 setup.py install) && cp tests/common_faiss_tests.py faiss/gpu/test && python3 -m unittest discover -s faiss/gpu/test -p "torch_*.py"' - no_output_timeout: 60m - - deploy_linux: - parameters: - label: - type: string - default: main - docker: - - image: continuumio/miniconda3 - steps: - - checkout - - run: - name: Install conda-build/anaconda-client - command: | - conda install -y -q conda-build anaconda-client - conda config --set anaconda_upload yes - - run: - name: Build packages - environment: - PACKAGE_TYPE: <> - command: | - cd conda - conda build faiss --user pytorch --label <> - - deploy_linux_gpu: - parameters: - label: - type: string - default: main - cuda: - type: string - cuda_archs: - type: string - compiler_version: - type: string - machine: - resource_class: gpu.nvidia.medium - image: ubuntu-2004-cuda-11.4:202110-01 - docker_layer_caching: true - steps: - - checkout - - run: - name: Build packages - command: | - docker build -t faiss -f conda/Dockerfile.cuda<> . - docker run --gpus all \ - -e PACKAGE_TYPE="<>" \ - -e CUDA_ARCHS="<>" \ - -e ANACONDA_API_TOKEN=$ANACONDA_API_TOKEN \ - faiss \ - conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> - no_output_timeout: 60m - - deploy_osx: - parameters: - label: - type: string - default: main - macos: - xcode: 12.4.0 - steps: - - checkout - - run: - name: Install conda - command: | - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - $HOME/miniconda/bin/conda install -y -q conda-build anaconda-client - $HOME/miniconda/bin/conda config --set anaconda_upload yes - - run: - name: Install MacOSX10.9 SDK - command: | - curl -L -o - https://github.com/phracker/MacOSX-SDKs/releases/download/10.15/MacOSX10.9.sdk.tar.xz | sudo tar xJf - -C /opt - - run: - name: Build packages - environment: - PACKAGE_TYPE: <> - command: | - export PATH=~/miniconda/bin:$PATH - cd conda - conda build faiss --user pytorch --label <> - - deploy_windows: - parameters: - label: - type: string - default: main - executor: - name: win/default - shell: bash.exe - steps: - - checkout - - run: - name: Install conda-build/anaconda-client - command: | - conda install -y -q conda-build anaconda-client - conda config --set anaconda_upload yes - - run: - name: Build packages - environment: - PACKAGE_TYPE: <> - command: | - cd conda - conda build faiss --user pytorch --label <> workflows: version: 2 @@ -345,60 +307,91 @@ workflows: jobs: - format: name: Format - - build_linux: - name: Linux - - build_linux: - name: Linux (AVX2) + - build_cmake: + name: Linux x86_64 (cmake) + exec: linux-x86_64-cpu + - build_cmake: + name: Linux x86_64 AVX2 (cmake) + exec: linux-x86_64-cpu opt_level: "avx2" - resource_class: "medium+" - - build_linux_conda: - name: Linux (conda) - - build_linux_gpu: - name: Linux GPU + - build_cmake: + name: Linux x86_64 GPU (cmake) + exec: linux-x86_64-gpu + gpu: "ON" requires: - - Linux - - build_osx: - name: OSX - - build_windows: - name: Windows - - build_arm: - name: ARM64 - - deploy_linux: - name: Linux packages 
+ - Linux x86_64 AVX2 (cmake) + - build_cmake: + name: Linux x86_64 GPU w/ RAFT (cmake) + exec: linux-x86_64-gpu + gpu: "ON" + raft: "ON" + requires: + - Linux x86_64 GPU (cmake) + - build_conda: + name: Linux x86_64 (conda) + exec: linux-x86_64-cpu + - build_conda: + name: Windows x86_64 (conda) + exec: windows-x86_64-cpu + - build_conda: + name: Linux arm64 (conda) + exec: linux-arm64-cpu + - build_conda: + name: Linux x86_64 packages + exec: linux-x86_64-cpu + label: main filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_linux_gpu: - name: Linux GPU packages (CUDA 10.2) - cuda: "10.2" - cuda_archs: "35;52;60;61;70;72;75" - compiler_version: "8.4" + - build_conda: + name: Linux x86_64 GPU packages (CUDA 11.4) + exec: linux-x86_64-gpu + label: main + cuda: "11.4" + cuda_archs: "60;61;70;72;75;80;86" + compiler_version: "11.2" filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_linux_gpu: - name: Linux GPU packages (CUDA 11.3) - cuda: "11.3" + - build_conda: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.4) + exec: linux-x86_64-gpu + label: main + raft: "ON" + cuda: "11.4" cuda_archs: "60;61;70;72;75;80;86" - compiler_version: "9.3" + compiler_version: "11.2" + filters: + tags: + only: /^v.*/ + branches: + ignore: /.*/ + - build_conda: + name: Windows x86_64 packages + exec: windows-x86_64-cpu + label: main filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_windows: - name: Windows packages + - build_conda: + name: OSX arm64 packages + exec: macosx-arm64-cpu + label: main filters: tags: only: /^v.*/ branches: ignore: /.*/ - - deploy_osx: - name: OSX packages + - build_conda: + name: Linux arm64 packages + exec: linux-arm64-cpu + label: main filters: tags: only: /^v.*/ @@ -414,24 +407,34 @@ workflows: only: - main jobs: - - deploy_linux: - name: Linux nightlies + - build_conda: + name: Linux x86_64 nightlies + exec: linux-x86_64-cpu label: nightly - - deploy_linux_gpu: - name: Linux GPU nightlies (CUDA 10.2) - cuda: "10.2" - cuda_archs: "35;52;60;61;70;72;75" - compiler_version: "8.4" + - build_conda: + name: Linux x86_64 GPU nightlies (CUDA 11.4) + exec: linux-x86_64-gpu + cuda: "11.4" + cuda_archs: "60;61;70;72;75;80;86" + compiler_version: "11.2" label: nightly - - deploy_linux_gpu: - name: Linux GPU nightlies (CUDA 11.3) - cuda: "11.3" + - build_conda: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.4) + exec: linux-x86_64-gpu + raft: "ON" + cuda: "11.4" cuda_archs: "60;61;70;72;75;80;86" - compiler_version: "9.3" + compiler_version: "11.2" + label: nightly + - build_conda: + name: Windows x86_64 nightlies + exec: windows-x86_64-cpu label: nightly - - deploy_windows: - name: Windows nightlies + - build_conda: + name: OSX arm64 nightlies + exec: macosx-arm64-cpu label: nightly - - deploy_osx: - name: OSX nightlies + - build_conda: + name: Linux arm64 nightlies + exec: linux-arm64-cpu label: nightly diff --git a/thirdparty/faiss/CHANGELOG.md b/thirdparty/faiss/CHANGELOG.md index 3e32a5cc2..9a38f4e2d 100644 --- a/thirdparty/faiss/CHANGELOG.md +++ b/thirdparty/faiss/CHANGELOG.md @@ -9,6 +9,58 @@ We try to indicate most contributions here with the contributor names who are no the Facebook Faiss team. Feel free to add entries here if you submit a PR. ## [Unreleased] +## [1.7.4] - 2023-04-12 +### Added +- Added big batch IVF search for conducting efficient search with big batches of queries +- Checkpointing in big batch search support +- Precomputed centroids support +- Support for iterable inverted lists for eg. 
key value stores +- 64-bit indexing arithmetic support in FAISS GPU +- IndexIVFShards now handle IVF indexes with a common quantizer +- Jaccard distance support +- CodePacker for non-contiguous code layouts +- Approximate evaluation of top-k distances for ResidualQuantizer and IndexBinaryFlat +- Added support for 12-bit PQ / IVFPQ fine quantizer decoders for standalone vector codecs (faiss/cppcontrib) +- Conda packages for osx-arm64 (Apple M1) and linux-aarch64 (ARM64) architectures +- Support for Python 3.10 + +### Removed +- CUDA 10 is no longer supported in precompiled packages +- Removed Python 3.7 support for precompiled packages +- Removed constraint for using fine quantizer with no greater than 8 bits for IVFPQ, for example, now it is possible to use IVF256,PQ10x12 for a CPU index + +### Changed +- Various performance optimizations for PQ / IVFPQ for AVX2 and ARM for training (fused distance+nearest kernel), search (faster kernels for distance_to_code() and scan_list_*()) and vector encoding +- A magnitude faster CPU code for LSQ/PLSQ training and vector encoding (reworked code) +- Performance improvements for Hamming Code computations for AVX2 and ARM (reworked code) +- Improved auto-vectorization support for IP and L2 distance computations (better handling of pragmas) +- Improved ResidualQuantizer vector encoding (pooling memory allocations, avoid r/w to a temporary buffer) + +### Fixed +- HSNW bug fixed which improves the recall rate! Special thanks to zh Wang @hhy3 for this. +- Faiss GPU IVF large query batch fix +- Faiss + Torch fixes, re-enable k = 2048 +- Fix the number of distance computations to match max_codes parameter +- Fix decoding of large fast_scan blocks + + +## [1.7.3] - 2022-11-3 +### Added +- Added sparse k-means routines and moved the generic kmeans to contrib +- Added FlatDistanceComputer for all FlatCodes indexes +- Support for fast accumulation of 4-bit LSQ and RQ +- Added product additive quantization +- Support per-query search parameters for many indexes + filtering by ids +- write_VectorTransform and read_vectorTransform were added to the public API (by @AbdelrahmanElmeniawy) +- Support for IDMap2 in index_factory by adding "IDMap2" to prefix or suffix of the input String (by @AbdelrahmanElmeniawy) +- Support for merging all IndexFlatCodes descendants (by @AbdelrahmanElmeniawy) +- Remove and merge features for IndexFastScan (by @AbdelrahmanElmeniawy) +- Performance improvements: 1) specialized the AVX2 pieces of code speeding up certain hotspots, 2) specialized kernels for vector codecs (this can be found in faiss/cppcontrib) + + +### Fixed +- Fixed memory leak in OnDiskInvertedLists::do_mmap when the file is not closed (by @AbdelrahmanElmeniawy) +- LSH correctly throws error for metric types other than METRIC_L2 (by @AbdelrahmanElmeniawy) ## [1.7.2] - 2021-12-15 ### Added @@ -206,7 +258,9 @@ by conda install -c pytorch faiss-gpu cudatoolkit=10.0. - C bindings. - Extended tutorial to GPU indices. 
-[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.2...HEAD +[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.4...HEAD +[1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4 +[1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3 [1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2 [1.7.1]: https://github.com/facebookresearch/faiss/compare/v1.7.0...v1.7.1 [1.7.0]: https://github.com/facebookresearch/faiss/compare/v1.6.5...v1.7.0 diff --git a/thirdparty/faiss/CMakeLists.txt b/thirdparty/faiss/CMakeLists.txt index dfdf50135..f9a180839 100644 --- a/thirdparty/faiss/CMakeLists.txt +++ b/thirdparty/faiss/CMakeLists.txt @@ -4,22 +4,56 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -cmake_minimum_required(VERSION 3.17 FATAL_ERROR) +# ============================================================================= +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) + +set(FAISS_LANGUAGES CXX) + +if(FAISS_ENABLE_GPU) + list(APPEND FAISS_LANGUAGES CUDA) +endif() + +if(FAISS_ENABLE_RAFT) +include(cmake/thirdparty/fetch_rapids.cmake) +include(rapids-cmake) +include(rapids-cpm) +include(rapids-cuda) +include(rapids-export) +include(rapids-find) + +rapids_cuda_init_architectures(faiss) +rapids_cuda_init_architectures(pyfaiss) +rapids_cuda_init_architectures(faiss_c_library) +endif() project(faiss - VERSION 1.7.2 + VERSION 1.7.4 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." HOMEPAGE_URL "https://github.com/facebookresearch/faiss" - LANGUAGES CXX) + LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) -set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Valid values are "generic", "avx2". option(FAISS_OPT_LEVEL "" "generic") option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) +option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) option(FAISS_ENABLE_PYTHON "Build Python extension." ON) option(FAISS_ENABLE_C_API "Build C API." OFF) option(FAISS_ENABLE_CCACHE "Build with ccache." ON) @@ -42,6 +76,10 @@ if(FAISS_ENABLE_GPU) enable_language(CUDA) endif() +if(FAISS_ENABLE_RAFT) + find_package(raft COMPONENTS compiled distributed) +endif() + add_subdirectory(faiss) if(FAISS_ENABLE_GPU) @@ -57,6 +95,7 @@ if(FAISS_ENABLE_C_API) endif() add_subdirectory(demos) +add_subdirectory(benchs) add_subdirectory(tutorial/cpp) # CTest must be included in the top level to enable `make test` target. diff --git a/thirdparty/faiss/CONTRIBUTING.md b/thirdparty/faiss/CONTRIBUTING.md index 8577a55cb..5ef204b94 100644 --- a/thirdparty/faiss/CONTRIBUTING.md +++ b/thirdparty/faiss/CONTRIBUTING.md @@ -44,7 +44,7 @@ outlined on that page and do not file a public issue. 
* 4 or 2 spaces for indentation in C++ (no tabs) * 80 character line length (both for C++ and Python) -* C++ language level: C++11 +* C++ language level: C++17 ## License diff --git a/thirdparty/faiss/Doxyfile b/thirdparty/faiss/Doxyfile index 25227b73a..3a112d016 100644 --- a/thirdparty/faiss/Doxyfile +++ b/thirdparty/faiss/Doxyfile @@ -786,7 +786,7 @@ EXCLUDE_SYMLINKS = NO # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* -EXCLUDE_PATTERNS = */impl/* +EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the diff --git a/thirdparty/faiss/INSTALL.md b/thirdparty/faiss/INSTALL.md index e7ddc9559..77c9e2896 100644 --- a/thirdparty/faiss/INSTALL.md +++ b/thirdparty/faiss/INSTALL.md @@ -6,29 +6,42 @@ pre-release nightly builds. The CPU-only `faiss-cpu` conda package is currently available on Linux, OSX, and Windows. The `faiss-gpu`, containing both CPU and GPU indices, is available on -Linux systems, for various versions of CUDA. +Linux systems, for CUDA 11.4. Packages are built for Python versions 3.8-3.10. To install the latest stable release: ``` shell # CPU-only version -$ conda install -c pytorch faiss-cpu +$ conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl # GPU(+CPU) version -$ conda install -c pytorch faiss-gpu - -# or for a specific CUDA version -$ conda install -c pytorch faiss-gpu cudatoolkit=10.2 # for CUDA 10.2 +$ conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl ``` -Nightly pre-release packages can be installed as follows: +For faiss-gpu, the nvidia channel is required for cudatoolkit=11.4, which is not +published in the main anaconda channel. + +NOTE: due to a bug in the latest 1.7.4 release, Intel MKL 2021 needs to be installed +separately where applicable. Remove the MKL reference when installing on +non-Intel platforms. + +Nightly pre-release packages can be installed as follows. There is no need to +install MKL separately, the correct package is automatically installed as a +dependency where necessary: ``` shell # CPU-only version $ conda install -c pytorch/label/nightly faiss-cpu # GPU(+CPU) version -$ conda install -c pytorch/label/nightly faiss-gpu +$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.7.4 +``` + +A combination of versions that installs GPU Faiss with CUDA 11.4 and Pytorch (as of 2023-06-19): +``` +conda create --name faiss_1.7.4 python=3.10 +conda activate faiss_1.7.4 +conda install faiss-gpu=1.7.4 mkl=2021 pytorch pytorch-cuda numpy -c pytorch -c nvidia ``` ## Installing from conda-forge @@ -63,7 +76,7 @@ found to run on other platforms as well, see [other platforms](https://github.com/facebookresearch/faiss/wiki/Related-projects#bindings-to-other-languages-and-porting-to-other-platforms). The basic requirements are: -- a C++11 compiler (with support for OpenMP support version 2 or higher), +- a C++17 compiler (with support for OpenMP support version 2 or higher), - a BLAS implementation (we strongly recommend using Intel MKL for best performance). 
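(Editorial aside, not part of the patch.) Once a from-source build along these lines succeeds, a short C++ smoke test is a convenient way to confirm that the library links and to see which optimization level (`generic` vs `avx2`) was compiled in. This is a hedged example: the file name and the link line (e.g. `g++ -std=c++17 smoke_test.cpp -lfaiss -fopenmp` plus your BLAS) are placeholders that depend on how and where Faiss was installed, and `faiss::get_compile_options()` is assumed to be the declaration in `faiss/utils/utils.h` of this tree.

```cpp
// smoke_test.cpp -- hypothetical post-install sanity check, not part of Faiss.
#include <faiss/IndexFlat.h>
#include <faiss/utils/utils.h>

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    // Shows whether the AVX2 (FAISS_OPT_LEVEL=avx2) code paths were built in.
    std::cout << "compile options: " << faiss::get_compile_options() << "\n";

    const int d = 8;
    faiss::IndexFlatL2 index(d);

    // Four vectors that differ only in the first component.
    std::vector<float> xb(4 * d, 0.0f);
    xb[0 * d] = 1.0f;
    xb[1 * d] = 2.0f;
    xb[2 * d] = 3.0f;
    xb[3 * d] = 4.0f;
    index.add(4, xb.data());

    std::vector<float> dis(2);
    std::vector<int64_t> ids(2);  // faiss::idx_t is int64_t
    index.search(1, xb.data(), 2, dis.data(), ids.data());

    std::cout << "nearest id: " << ids[0] << " (expected 0)\n";
    return 0;
}
```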
@@ -94,15 +107,20 @@ Several options can be passed to CMake, among which: values are `ON` and `OFF`), - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings (possible values are `ON` and `OFF`), + - `-DFAISS_ENABLE_RAFT=ON` in order to enable building the RAFT implementations + of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible + values are `ON` and `OFF`) - `-DBUILD_TESTING=OFF` in order to disable building C++ tests, - `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values are `ON` and `OFF`), + - `-DFAISS_ENABLE_C_API=ON` in order to enable building [C API](c_api/INSTALL.md) (possible values + are `ON` and `OFF`), - optimization-related options: - `-DCMAKE_BUILD_TYPE=Release` in order to enable generic compiler optimization options (enables `-O3` on gcc for instance), - `-DFAISS_OPT_LEVEL=avx2` in order to enable the required compiler flags to - generate code using optimized SIMD instructions (possible values are `generic`, - `sse4`, and `avx2`, by increasing order of optimization), + generate code using optimized SIMD instructions (possible values are `generic` + and `avx2`, by increasing order of optimization), - BLAS-related options: - `-DBLA_VENDOR=Intel10_64_dyn -DMKL_LIBRARIES=/path/to/mkl/libs` to use the Intel MKL BLAS implementation, which is significantly faster than OpenBLAS diff --git a/thirdparty/faiss/README.md b/thirdparty/faiss/README.md index 8d04fd5e7..0db380b80 100644 --- a/thirdparty/faiss/README.md +++ b/thirdparty/faiss/README.md @@ -1,6 +1,6 @@ # Faiss -Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed by [Facebook AI Research](https://research.fb.com/category/facebook-ai-research-fair/). +Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning. Faiss is written in C++ with complete wrappers for Python/numpy. Some of the most useful algorithms are implemented on the GPU. It is developed primarily at Meta's [Fundamental AI Research](https://ai.facebook.com/) group. ## News @@ -10,13 +10,13 @@ See [CHANGELOG.md](CHANGELOG.md) for detailed information about latest features. Faiss contains several methods for similarity search. It assumes that the instances are represented as vectors and are identified by an integer, and that the vectors can be compared with L2 (Euclidean) distances or dot products. Vectors that are similar to a query vector are those that have the lowest L2 distance or the highest dot product with the query vector. It also supports cosine similarity, since this is a dot product on normalized vectors. -Most of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require to keep the original vectors. This generally comes at the cost of a less precise search but these methods can scale to billions of vectors in main memory on a single server. 
+Some of the methods, like those based on binary vectors and compact quantization codes, solely use a compressed representation of the vectors and do not require to keep the original vectors. This generally comes at the cost of a less precise search but these methods can scale to billions of vectors in main memory on a single server. Other methods, like HNSW and NSG add an indexing structure on top of the raw vectors to make searching more efficient. The GPU implementation can accept input from either CPU or GPU memory. On a server with GPUs, the GPU indexes can be used a drop-in replacement for the CPU indexes (e.g., replace `IndexFlatL2` with `GpuIndexFlatL2`) and copies to/from GPU memory are handled automatically. Results will be faster however if both input and output remain resident on the GPU. Both single and multi-GPU usage is supported. -## Building +## Installing -The library is mostly implemented in C++, with optional GPU support provided via CUDA, and an optional Python interface. The CPU version requires a BLAS library. It compiles with a Makefile and can be packaged in a docker image. See [INSTALL.md](INSTALL.md) for details. +Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu) and [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA, and the Python interface is also optional. It compiles with cmake. See [INSTALL.md](INSTALL.md) for details. ## How Faiss works @@ -26,6 +26,7 @@ Faiss is built around an index type that stores a set of vectors, and provides a - search quality - memory used per index vector - training time +- adding time - need for external data for unsupervised training The optional GPU implementation provides what is likely (as of March 2017) the fastest exact and approximate (compressed-domain) nearest neighbor search implementation for high-dimensional vectors, fastest Lloyd's k-means, and fastest small k-selection algorithm known. [The implementation is detailed here](https://arxiv.org/abs/1702.08734). @@ -34,8 +35,8 @@ The optional GPU implementation provides what is likely (as of March 2017) the f The following are entry points for documentation: -- the full documentation, including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting) can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki) -- the [doxygen documentation](https://facebookresearch.github.io/faiss) gives per-class information +- the full documentation can be found on the [wiki page](http://github.com/facebookresearch/faiss/wiki), including a [tutorial](https://github.com/facebookresearch/faiss/wiki/Getting-started), a [FAQ](https://github.com/facebookresearch/faiss/wiki/FAQ) and a [troubleshooting section](https://github.com/facebookresearch/faiss/wiki/Troubleshooting) +- the [doxygen documentation](https://faiss.ai/) gives per-class information extracted from code comments - to reproduce results from our research papers, [Polysemous codes](https://arxiv.org/abs/1609.01882) and [Billion-scale similarity search with GPUs](https://arxiv.org/abs/1702.08734), refer to the [benchmarks README](benchs/README.md). 
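The selection criteria listed under "How Faiss works" above (speed, accuracy, memory per vector, training time) can be made concrete with a small sketch. The parameters below (`IVF100,PQ16`, `nprobe = 10`) are illustrative assumptions, not recommendations from this document:

``` python
import numpy as np
import faiss

d, nb, nq = 64, 10000, 100
rng = np.random.default_rng(123)
xb = rng.random((nb, d), dtype='float32')   # database vectors
xq = rng.random((nq, d), dtype='float32')   # query vectors

# exact baseline: stores the full vectors, no training step
flat = faiss.IndexFlatL2(d)
flat.add(xb)
_, Iref = flat.search(xq, 10)

# compressed index: 100 inverted lists, 16-byte PQ codes per vector
ivfpq = faiss.index_factory(d, "IVF100,PQ16")
ivfpq.train(xb)        # training time is one of the trade-off criteria
ivfpq.add(xb)
ivfpq.nprobe = 10      # more probes: slower search, higher recall
_, I = ivfpq.search(xq, 10)

print("recall@1 vs exact search:", (I[:, 0] == Iref[:, 0]).mean())
```

The compressed index uses a fraction of the memory of the flat index and answers queries faster, at the cost of approximate results; raising `nprobe` moves the trade-off back toward accuracy.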
For [ Link and code: Fast indexing with graphs and compact regression codes](https://arxiv.org/abs/1804.09996), see the [link_and_code README](benchs/link_and_code) @@ -45,18 +46,24 @@ The main authors of Faiss are: - [Hervé Jégou](https://github.com/jegou) initiated the Faiss project and wrote its first implementation - [Matthijs Douze](https://github.com/mdouze) implemented most of the CPU Faiss - [Jeff Johnson](https://github.com/wickedfoo) implemented all of the GPU Faiss -- [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes +- [Lucas Hosseini](https://github.com/beauby) implemented the binary indexes and the build system +- [Chengqi Deng](https://github.com/KinglittleQ) implemented NSG, NNdescent and much of the additive quantization code. +- [Alexandr Guzhva](https://github.com/alexanderguzhva) many optimizations: SIMD, memory allocation and layout, fast decoding kernels for vector codecs, etc. ## Reference Reference to cite when you use Faiss in a research paper: ``` -@article{JDH17, - title={Billion-scale similarity search with GPUs}, +@article{johnson2019billion, + title={Billion-scale similarity search with {GPUs}}, author={Johnson, Jeff and Douze, Matthijs and J{\'e}gou, Herv{\'e}}, - journal={arXiv preprint arXiv:1702.08734}, - year={2017} + journal={IEEE Transactions on Big Data}, + volume={7}, + number={3}, + pages={535--547}, + year={2019}, + publisher={IEEE} } ``` @@ -67,6 +74,8 @@ For public discussion of Faiss or for questions, there is a Facebook group at ht We monitor the [issues page](http://github.com/facebookresearch/faiss/issues) of the repository. You can report bugs, ask questions, etc. -## License +## Legal -Faiss is MIT-licensed. +Faiss is MIT-licensed, refer to the [LICENSE file](https://github.com/facebookresearch/faiss/blob/main/LICENSE) in the top level directory. + +Copyright © Meta Platforms, Inc. See the [Terms of Use](https://opensource.fb.com/legal/terms/) and [Privacy Policy](https://opensource.fb.com/legal/privacy/) for this project. diff --git a/thirdparty/faiss/benchs/CMakeLists.txt b/thirdparty/faiss/benchs/CMakeLists.txt new file mode 100644 index 000000000..46c81ae24 --- /dev/null +++ b/thirdparty/faiss/benchs/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + + +add_executable(bench_ivf_selector EXCLUDE_FROM_ALL bench_ivf_selector.cpp) +target_link_libraries(bench_ivf_selector PRIVATE faiss) + diff --git a/thirdparty/faiss/benchs/README.md b/thirdparty/faiss/benchs/README.md index 7e95a7673..fe191ff11 100644 --- a/thirdparty/faiss/benchs/README.md +++ b/thirdparty/faiss/benchs/README.md @@ -75,7 +75,7 @@ http://corpus-texmex.irisa.fr/ to subdirectory bigann/ ### Getting Deep1B -The ground-truth and queries are available here +The ground-truth and queries are available here https://yadi.sk/d/11eDCm7Dsn9GA @@ -145,7 +145,7 @@ The 8-byte results can be reproduced with the factory key `IMI2x12,PQ8` ### Experiments of the appendix -The experiments in the appendix are only in the ArXiv version of the paper (table 3). +The experiments in the appendix are only in the ArXiv version of the paper (table 3). ``` python bench_polysemous_1bn.py SIFT1000M OPQ8_64,IMI2x13,PQ8 nprobe={1,2,4,8,16,32,64,128},ht={20,24,26,28,30} @@ -179,11 +179,11 @@ The original results were obtained with `nprobe=1024,ht=66,max_codes=262144`. 
## GPU experiments -The benchmarks below run 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss. +The benchmarks below run 1 or 4 Titan X GPUs and reproduce the results of the "GPU paper". They are also a good starting point on how to use GPU Faiss. ### Search on SIFT1M -See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers. +See above on how to get SIFT1M into subdirectory sift1M/. The script [`bench_gpu_sift1m.py`](bench_gpu_sift1m.py) reproduces the "exact k-NN time" plot in the ArXiv paper, and the SIFT1M numbers. The output is: ``` @@ -245,14 +245,14 @@ nprobe= 512 0.527 s recalls= 0.9907 0.9987 0.9987 To get the "infinite MNIST dataset", follow the instructions on [Léon Bottou's website](http://leon.bottou.org/projects/infimnist). The script assumes the file `mnist8m-patterns-idx3-ubyte` is in subdirectory `mnist8m` -The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output: +The script [`kmeans_mnist.py`](kmeans_mnist.py) produces the following output: ``` python kmeans_mnist.py 1 256 ... Clustering 8100000 points in 784D to 256 clusters, redo 1 times, 20 iterations Preprocessing in 7.94526 s - Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0 + Iteration 19 (131.697 s, search 114.78 s): objective=1.44881e+13 imbalance=1.05963 nsplit=0 final objective: 1.449e+13 total runtime: 140.615 s ``` @@ -263,7 +263,7 @@ The script [`bench_gpu_1bn.py`](bench_gpu_1bn.py) runs multi-gpu searches on the Even on multiple GPUs, building the 1B datasets can last several hours. It is often a good idea to validate that everything is working fine on smaller datasets like SIFT1M, SIFT2M, etc. -The search results on SIFT1B in the "GPU paper" can be obtained with +The search results on SIFT1B in the "GPU paper" can be obtained with @@ -285,7 +285,7 @@ We use the `-tempmem` option to reduce the temporary memory allocation to 1.5G, ### search on Deep1B -The same script generates the GPU search results on Deep1B. +The same script generates the GPU search results on Deep1B. ``` python bench_gpu_1bn.py Deep1B OPQ20_80,IVF262144,PQ20 -nnn 10 -R 2 -ngpu 4 -altadd -noptables -tempmem $[1024*1024*1024] @@ -336,3 +336,26 @@ search... 999997440/1000000000 (36717.207 s, 0.6015) probe=128: 36717.309 s rank-10 intersection results: 0.6015 999997440/1000000000 (70616.392 s, 0.6047) probe=256: 70616.581 s rank-10 intersection results: 0.6047 ``` + +# Additional benchmarks + +This directory also contains certain additional benchmarks (and serve as an additional source of examples of how to use the FAISS code). +Certain tests / benchmarks might be outdated. 
+ +* bench_6bit_codec.cpp - tests vector codecs for SQ6 quantization on a synthetic dataset +* bench_cppcontrib_sa_decode.cpp - benchmarks specialized kernels for vector codecs for PQ, IVFPQ and Residual+PQ on a synthetic dataset +* bench_for_interrupt.py - evaluates the impact of the interrupt callback handler (which can be triggered from Python code) +* bench_hamming_computer.cpp - benchmarks specialized implementations for Hamming distance computations +* bench_heap_replace.cpp - benchmarks different implementations of certain calls for a heap data structure +* bench_hnsw.py - benchmarks HNSW in combination with other index types on the SIFT1M dataset +* bench_index_flat.py - benchmarks IndexFlatL2 on a synthetic dataset +* bench_index_pq.py - benchmarks PQ on the SIFT1M dataset +* bench_ivf_fastscan_single_query.py - benchmarks a single query for different nprobe levels for IVF{nlist},PQ{M}x4fs on the BIGANN dataset +* bench_ivf_fastscan.py - compares IVF{nlist},PQ{M}x4fs against other indices on the SIFT1M dataset +* bench_ivf_selector.cpp - checks the possible overhead of using the faiss::IDSelectorAll interface +* bench_pairwise_distances.py - benchmarks pairwise distance computation between two synthetic datasets +* bench_partition.py - benchmarks partitioning functions +* bench_pq_tables.py - benchmarks ProductQuantizer.compute_inner_prod_tables() and ProductQuantizer.compute_distance_tables() calls +* bench_quantizer.py - benchmarks various quantizers on the SIFT1M, Deep1B and BigANN datasets +* bench_scalar_quantizer.py - benchmarks IVF+SQ on the SIFT1M dataset +* bench_vector_ops.py - benchmarks dot product and distance computations on a synthetic dataset diff --git a/thirdparty/faiss/benchs/bench_6bit_codec.cpp b/thirdparty/faiss/benchs/bench_6bit_codec.cpp index 1d8b48703..b4ac0b04b 100644 --- a/thirdparty/faiss/benchs/bench_6bit_codec.cpp +++ b/thirdparty/faiss/benchs/bench_6bit_codec.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -15,7 +16,7 @@ using namespace faiss; -int main() { +static void bench(benchmark::State& state) { int d = 128; int n = 2000; @@ -31,7 +32,7 @@ int main() { sq.train(n, x.data()); size_t code_size = sq.code_size; - printf("code size: %ld\n", sq.code_size); + state.counters["code_size"] = sq.code_size; // encode std::vector codes(code_size * n); @@ -41,8 +42,8 @@ int main() { std::vector x2(d * n); sq.decode(codes.data(), x2.data(), n); - printf("sqL2 recons error: %g\n", - fvec_L2sqr(x.data(), x2.data(), n * d) / n); + state.counters["sql2_recons_error"] = + fvec_L2sqr(x.data(), x2.data(), n * d) / n; // encode again std::vector codes2(code_size * n); @@ -54,25 +55,27 @@ int main() { ndiff++; } - printf("ndiff for idempotence: %ld / %ld\n", ndiff, codes.size()); + state.counters["ndiff_for_idempotence"] = ndiff; + + state.counters["code_size_two"] = codes.size(); std::unique_ptr dc( sq.get_distance_computer()); dc->codes = codes.data(); dc->code_size = sq.code_size; - printf("code size: %ld\n", dc->code_size); - - double sum_dis = 0; - double t0 = getmillisecs(); - for (int i = 0; i < n; i++) { - dc->set_query(&x[i * d]); - for (int j = 0; j < n; j++) { - sum_dis += (*dc)(j); + state.counters["code_size_three"] = dc->code_size; + + for (auto _ : state) { + float sum_dis = 0; + for (int i = 0; i < n; i++) { + dc->set_query(&x[i * d]); + for (int j = 0; j < n; j++) { + benchmark::DoNotOptimize(sum_dis += (*dc)(j)); + } + } } - printf("distances computed in %.3f ms, checksum=%g\n", - getmillisecs() - t0, - sum_dis); - - return 0; } +// I think maybe n and d should
be input arguments +// for things to really make sense, idk. +BENCHMARK(bench)->Iterations(20); +BENCHMARK_MAIN(); diff --git a/thirdparty/faiss/benchs/bench_all_ivf/README.md b/thirdparty/faiss/benchs/bench_all_ivf/README.md index 2f7c76b5a..ea6bbd670 100644 --- a/thirdparty/faiss/benchs/bench_all_ivf/README.md +++ b/thirdparty/faiss/benchs/bench_all_ivf/README.md @@ -11,7 +11,7 @@ The code is organized as: - `bench_all_ivf.py`: evaluate one type of inverted file - `run_on_cluster_generic.bash`: call `bench_all_ivf.py` for all tested types of indices. -Since the number of experiments is quite large the script is structued so that the benchmark can be run on a cluster. +Since the number of experiments is quite large the script is structured so that the benchmark can be run on a cluster. - `parse_bench_all_ivf.py`: make nice tradeoff plots from all the results. diff --git a/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py b/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py index c3f668bee..e098e9527 100644 --- a/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py +++ b/thirdparty/faiss/benchs/bench_all_ivf/bench_all_ivf.py @@ -3,15 +3,20 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import argparse import os import sys import time -import pdb -import numpy as np + import faiss -import argparse -import datasets -from datasets import sanitize +import numpy as np + +try: + import datasets_fb as datasets +except ModuleNotFoundError: + import datasets_oss as datasets + +sanitize = datasets.sanitize ###################################################### @@ -262,8 +267,7 @@ def apply_AQ_options(index, args): print("Getting centroids from", args.get_centroids_from) src_index = faiss.read_index(args.get_centroids_from) src_quant = faiss.downcast_index(src_index.quantizer) - centroids = faiss.vector_to_array(src_quant.xb) - centroids = centroids.reshape(-1, d) + centroids = src_quant.reconstruct_n() print(" centroid table shape", centroids.shape) if isinstance(vec_transform, faiss.VectorTransform): @@ -333,7 +337,7 @@ def apply_AQ_options(index, args): xq = sanitize(ds.get_queries()) gt = ds.get_groundtruth(k=args.k) -assert gt.shape[1] == args.k, pdb.set_trace() +assert gt.shape[1] == args.k if args.searchthreads != -1: print("Setting nb of threads to", args.searchthreads) diff --git a/thirdparty/faiss/benchs/bench_all_ivf/cmp_with_scann.py b/thirdparty/faiss/benchs/bench_all_ivf/cmp_with_scann.py index 8f17ece2b..fc6c75e1f 100644 --- a/thirdparty/faiss/benchs/bench_all_ivf/cmp_with_scann.py +++ b/thirdparty/faiss/benchs/bench_all_ivf/cmp_with_scann.py @@ -75,28 +75,37 @@ def aa(*args, **kwargs): k = args.k nrun = args.nrun - if args.lib == "faiss": + if not os.path.exists(cache_dir + "xb.npy"): # prepare cache - import faiss from datasets import load_dataset - ds = load_dataset(args.db, download=args.download) print(ds) - if not os.path.exists(cache_dir + "xb.npy"): - # store for SCANN - os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}") - tosave = dict( - # xt = ds.get_train(10), - xb = ds.get_database(), - xq = ds.get_queries(), - gt = ds.get_groundtruth() - ) - for name, v in tosave.items(): - fname = cache_dir + "/" + name + ".npy" - print("save", fname) - np.save(fname, v) - - open(cache_dir + "metric", "w").write(ds.metric) + # store for SCANN + os.system(f"rm -rf {cache_dir}; mkdir -p {cache_dir}") + tosave = dict( + xb = ds.get_database(), + xq = ds.get_queries(), + gt = ds.get_groundtruth() 
+ ) + for name, v in tosave.items(): + fname = cache_dir + "/" + name + ".npy" + print("save", fname) + np.save(fname, v) + + open(cache_dir + "metric", "w").write(ds.metric) + + dataset = {} + for kn in "xb xq gt".split(): + fname = cache_dir + "/" + kn + ".npy" + print("load", fname) + dataset[kn] = np.load(fname) + xb = dataset["xb"] + xq = dataset["xq"] + gt = dataset["gt"] + distance_measure = open(cache_dir + "metric").read() + + if args.lib == "faiss": + import faiss name1_to_metric = { "IP": faiss.METRIC_INNER_PRODUCT, @@ -106,14 +115,10 @@ def aa(*args, **kwargs): index_fname = cache_dir + "index.faiss" if not os.path.exists(index_fname): index = faiss_make_index( - ds.get_database(), name1_to_metric[ds.metric], index_fname) + xb, name1_to_metric[distance_measure], index_fname) else: index = faiss.read_index(index_fname) - xb = ds.get_database() - xq = ds.get_queries() - gt = ds.get_groundtruth() - faiss_eval_search( index, xq, xb, nprobe_tab, pre_reorder_k_tab, k, gt, nrun, args.measure @@ -122,32 +127,22 @@ def aa(*args, **kwargs): if args.lib == "scann": from scann.scann_ops.py import scann_ops_pybind - dataset = {} - for kn in "xb xq gt".split(): - fname = cache_dir + "/" + kn + ".npy" - print("load", fname) - dataset[kn] = np.load(fname) name1_to_name2 = { "IP": "dot_product", "L2": "squared_l2" } - distance_measure = name1_to_name2[open(cache_dir + "metric").read()] - - xb = dataset["xb"] - xq = dataset["xq"] - gt = dataset["gt"] scann_dir = cache_dir + "/scann1.1.1_serialized" if os.path.exists(scann_dir + "/scann_config.pb"): searcher = scann_ops_pybind.load_searcher(scann_dir) else: - searcher = scann_make_index(xb, distance_measure, scann_dir, 0) + searcher = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 0) scann_dir = cache_dir + "/scann1.1.1_serialized_reorder" if os.path.exists(scann_dir + "/scann_config.pb"): searcher_reo = scann_ops_pybind.load_searcher(scann_dir) else: - searcher_reo = scann_make_index(xb, distance_measure, scann_dir, 100) + searcher_reo = scann_make_index(xb, name1_to_name2[distance_measure], scann_dir, 100) scann_eval_search( searcher, searcher_reo, @@ -256,7 +251,6 @@ def faiss_make_index(xb, metric_type, fname): # index.by_residual = False print("train") - # index.train(ds.get_train()) index.train(xb[:250000]) print("add") index.add(xb) diff --git a/thirdparty/faiss/benchs/bench_all_ivf/datasets.py b/thirdparty/faiss/benchs/bench_all_ivf/datasets_oss.py similarity index 99% rename from thirdparty/faiss/benchs/bench_all_ivf/datasets.py rename to thirdparty/faiss/benchs/bench_all_ivf/datasets_oss.py index 3f712d43a..0ee897f2b 100644 --- a/thirdparty/faiss/benchs/bench_all_ivf/datasets.py +++ b/thirdparty/faiss/benchs/bench_all_ivf/datasets_oss.py @@ -81,7 +81,6 @@ def load_dataset(dataset='deep1M', compute_gt=False, download=False): f"{centdir}/clustering.dbdeep1M.IVF{ncent}.faissindex" ) - elif dataset.startswith("deep"): szsuf = dataset[4:] diff --git a/thirdparty/faiss/benchs/bench_big_batch_ivf.py b/thirdparty/faiss/benchs/bench_big_batch_ivf.py new file mode 100644 index 000000000..e678d8e7a --- /dev/null +++ b/thirdparty/faiss/benchs/bench_big_batch_ivf.py @@ -0,0 +1,109 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import time + +import faiss + +import numpy as np + +from faiss.contrib.datasets import SyntheticDataset +from faiss.contrib.big_batch_search import big_batch_search + +parser = argparse.ArgumentParser() + + +def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + +group = parser.add_argument_group('dataset options') +aa('--dim', type=int, default=64) +aa('--size', default="S") + +group = parser.add_argument_group('index options') +aa('--nlist', type=int, default=100) +aa('--factory_string', default="", help="overrides nlist") +aa('--k', type=int, default=10) +aa('--nprobe', type=int, default=5) +aa('--nt', type=int, default=-1, help="nb search threads") +aa('--method', default="pairwise_distances", help="") + +args = parser.parse_args() +print("args:", args) + +if args.size == "S": + ds = SyntheticDataset(32, 2000, 4000, 1000) +elif args.size == "M": + ds = SyntheticDataset(32, 20000, 40000, 10000) +elif args.size == "L": + ds = SyntheticDataset(32, 200000, 400000, 100000) +else: + raise RuntimeError(f"dataset size {args.size} not supported") + +nlist = args.nlist +nprobe = args.nprobe +k = args.k + + +def tic(name): + global tictoc + tictoc = (name, time.time()) + print(name, end="\r", flush=True) + + +def toc(): + global tictoc + name, t0 = tictoc + dt = time.time() - t0 + print(f"{name}: {dt:.3f} s") + return dt + + +print(f"dataset {ds}, {nlist=:} {nprobe=:} {k=:}") + +if args.factory_string == "": + factory_string = f"IVF{nlist},Flat" +else: + factory_string = args.factory_string + +print(f"instantiate {factory_string}") +index = faiss.index_factory(ds.d, factory_string) + +if args.factory_string != "": + nlist = index.nlist + +print("nlist", nlist) + +tic("train") +index.train(ds.get_train()) +toc() + +tic("add") +index.add(ds.get_database()) +toc() + +if args.nt != -1: + print("setting nb of threads to", args.nt) + faiss.omp_set_num_threads(args.nt) + +tic("reference search") +index.nprobe +index.nprobe = nprobe +Dref, Iref = index.search(ds.get_queries(), k) +t_ref = toc() + +tic("block search") +Dnew, Inew = big_batch_search( + index, ds.get_queries(), + k, method=args.method, verbose=10 +) +t_tot = toc() + +assert (Inew != Iref).sum() / Iref.size < 1e-4 +np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + +print(f"total block search time {t_tot:.3f} s, speedup {t_ref / t_tot:.3f}x") diff --git a/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp b/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp new file mode 100644 index 000000000..c5c6b0bf1 --- /dev/null +++ b/thirdparty/faiss/benchs/bench_cppcontrib_sa_decode.cpp @@ -0,0 +1,1734 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// train a dataset +std::tuple, std::vector> trainDataset( + const std::vector& input, + const uint64_t n, + const uint64_t d, + const std::string& description) { + // + omp_set_num_threads(std::thread::hardware_concurrency()); + + // train an index + auto index = std::shared_ptr( + faiss::index_factory((int)d, description.c_str())); + index->train((int)n, input.data()); + + // encode + const size_t codeSize = index->sa_code_size(); + + std::vector encodedData(n * codeSize); + index->sa_encode(n, input.data(), encodedData.data()); + + return std::make_tuple(std::move(index), std::move(encodedData)); +} + +// generate a dataset +std::vector generate(const size_t n, const size_t d) { + std::vector data(n * d); + + std::minstd_rand rng(345); + std::uniform_real_distribution ux(0, 1); + + // + for (size_t k = 0; k < n; k++) { + for (size_t j = 0; j < d; j++) { + data[k * d + j] = ux(rng); + } + } + + return data; +} + +double getError( + const uint64_t n, + const uint64_t d, + const std::vector& v1, + const std::vector& v2) { + double error = 0; + for (uint64_t i = 0; i < n; i++) { + double localError = 0; + for (uint64_t j = 0; j < d; j++) { + double q = v1[i * d + j] - v2[i * d + j]; + localError += q * q; + } + + error += localError; + } + + return error; +} + +// a timer +struct StopWatch { + using timepoint_t = std::chrono::time_point; + + timepoint_t Start; + + // + StopWatch() { + Start = std::chrono::steady_clock::now(); + } + + // + double elapsed() const { + const auto now = std::chrono::steady_clock::now(); + std::chrono::duration elapsed = now - Start; + return elapsed.count(); + } +}; + +// +bool testIfIVFPQ( + const faiss::Index* const index, + const float** pqCoarseCentroidsQ, + const float** pqFineCentroidsQ) { + if (pqFineCentroidsQ == nullptr || pqCoarseCentroidsQ == nullptr) { + return false; + } + + const faiss::IndexIVFPQ* const indexQ = + dynamic_cast(index); + if (indexQ == nullptr) { + return false; + } + + const auto coarseIndexQ = + dynamic_cast(indexQ->quantizer); + if (coarseIndexQ == nullptr) { + return false; + } + + *pqFineCentroidsQ = indexQ->pq.centroids.data(); + *pqCoarseCentroidsQ = + reinterpret_cast(coarseIndexQ->codes.data()); + return true; +} + +bool testIfResidualPQ( + const faiss::Index* const index, + const float** pqCoarseCentroidsQ, + const float** pqFineCentroidsQ) { + if (pqFineCentroidsQ == nullptr || pqCoarseCentroidsQ == nullptr) { + return false; + } + + const faiss::Index2Layer* const indexQ = + dynamic_cast(index); + if (indexQ == nullptr) { + return false; + } + + const auto coarseIndexQ = dynamic_cast( + indexQ->q1.quantizer); + if (coarseIndexQ == nullptr) { + return false; + } + + *pqFineCentroidsQ = indexQ->pq.centroids.data(); + *pqCoarseCentroidsQ = coarseIndexQ->pq.centroids.data(); + return true; +} + +// +template +static void verifyIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const std::shared_ptr& index, + const std::vector& encodedData, + const uint64_t nIterations) { + // + const float* pqFineCentroidsQ = nullptr; + const float* pqCoarseCentroidsQ = nullptr; + + // + testIfIVFPQ(index.get(), &pqCoarseCentroidsQ, &pqFineCentroidsQ); + testIfResidualPQ(index.get(), &pqCoarseCentroidsQ, &pqFineCentroidsQ); + + // + const size_t codeSize = index->sa_code_size(); + + // initialize the 
random engine + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // use 1 thread + omp_set_num_threads(1); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // sequential order + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + index->sa_decode(n, encodedData.data(), outputFaiss.data()); + } + double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (int iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + T::store( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + outputKernel1.data() + i * d); + } + } + double timeKernel = swKernel.elapsed(); + + // evaluate the error + double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_seq" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random order + + // generate a random order of points + std::uniform_int_distribution un(0, n - 1); + std::vector pointIncidesToDecode(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + pointIncidesToDecode[i] = un(rng); + } + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + outputFaiss.data() + i * d); + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + T::store( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + outputKernel1.data() + i * d); + } + } + const double timeKernel = swKernel.elapsed(); + + // evaluate the error + const double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random accumulate + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + std::vector outputKernel2(n * d, 0); + std::vector outputKernel2u(n * d, 0); + std::vector outputKernel3(n * d, 0); + std::vector outputKernel3u(n * d, 0); + + // a temporary buffer for faiss + std::vector tempFaiss(d, 0); + + // random weights + std::vector weights(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + weights[i] = u(rng); + } + + // faiss + StopWatch swFaiss; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + tempFaiss.data()); + for (uint64_t j = 0; j < d; j++) + outputFaiss[i * d + j] += 
weight * tempFaiss[j]; + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels: accum 1 point + StopWatch swKernel1; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + weight, + outputKernel1.data() + i * d); + } + } + const double timeKernel1 = swKernel1.elapsed(); + + // evaluate the error + const double error1 = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel1 << "\t" << error1 << std::endl; + + // kernels: accum 2 points, shared centroids + StopWatch swKernel2; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2.data() + i * d); + } + } + const double timeKernel2 = swKernel2.elapsed(); + + // evaluate the error + const double error2 = getError(n, d, outputFaiss, outputKernel2); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2 << "\t" << error2 << std::endl; + + // kernels: accum 2 points, unique centroids + StopWatch swKernel2u; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2u.data() + i * d); + } + } + const double timeKernel2u = swKernel2u.elapsed(); + + // evaluate the error + const double error2u = getError(n, d, outputFaiss, outputKernel2u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2u << "\t" << error2u << std::endl; + + // kernels: accum 3 points, shared centroids + StopWatch swKernel3; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * 
codeSize, + weight1, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3.data() + i * d); + } + } + const double timeKernel3 = swKernel3.elapsed(); + + // evaluate the error + const double error3 = getError(n, d, outputFaiss, outputKernel3); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3 << "\t" << error3 << std::endl; + + // kernels: accum 3 points, unique centroids + StopWatch swKernel3u; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3u.data() + i * d); + } + } + const double timeKernel3u = swKernel3u.elapsed(); + + // evaluate the error + const double error3u = getError(n, d, outputFaiss, outputKernel3u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3u << "\t" << error3u << std::endl; + } +} + +// +template +static void verifyMinMaxIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const std::shared_ptr& index, + const std::vector& encodedData, + const uint64_t nIterations) { + // + const float* pqFineCentroidsQ = nullptr; + const float* pqCoarseCentroidsQ = nullptr; + + // extract an index that is wrapped with IndexRowwiseMinMaxBase + const std::shared_ptr indexMinMax = + std::dynamic_pointer_cast(index); + + auto subIndex = indexMinMax->index; + + // + testIfIVFPQ(subIndex, &pqCoarseCentroidsQ, &pqFineCentroidsQ); + testIfResidualPQ(subIndex, &pqCoarseCentroidsQ, &pqFineCentroidsQ); + + // + const size_t codeSize = index->sa_code_size(); + + // initialize the random engine + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // use 1 thread + omp_set_num_threads(1); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // sequential order + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + index->sa_decode(n, encodedData.data(), outputFaiss.data()); + } + double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (int iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + T::store( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + outputKernel1.data() + i * d); + } + } + double timeKernel = swKernel.elapsed(); + + // evaluate the error + double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_seq" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << 
std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random order + + // generate a random order of points + std::uniform_int_distribution un(0, n - 1); + std::vector pointIncidesToDecode(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + pointIncidesToDecode[i] = un(rng); + } + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + outputFaiss.data() + i * d); + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + T::store( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + outputKernel1.data() + i * d); + } + } + const double timeKernel = swKernel.elapsed(); + + // evaluate the error + const double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random accumulate + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + std::vector outputKernel2(n * d, 0); + std::vector outputKernel2u(n * d, 0); + std::vector outputKernel3(n * d, 0); + std::vector outputKernel3u(n * d, 0); + + // a temporary buffer for faiss + std::vector tempFaiss(d, 0); + + // random weights + std::vector weights(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + weights[i] = u(rng); + } + + // faiss + StopWatch swFaiss; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + tempFaiss.data()); + for (uint64_t j = 0; j < d; j++) { + outputFaiss[i * d + j] += weight * tempFaiss[j]; + } + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels: accum 1 point + StopWatch swKernel1; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + weight, + outputKernel1.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel1[i * d + j] += outputAccumMin; + } + } + const double timeKernel1 = swKernel1.elapsed(); + + // evaluate the error + const double error1 = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel1 << "\t" << error1 << std::endl; + + // kernels: accum 2 points, shared centroids + StopWatch swKernel2; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + 
for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel2[i * d + j] += outputAccumMin; + } + } + const double timeKernel2 = swKernel2.elapsed(); + + // evaluate the error + const double error2 = getError(n, d, outputFaiss, outputKernel2); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2 << "\t" << error2 << std::endl; + + // kernels: accum 2 points, unique centroids + StopWatch swKernel2u; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2u.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel2u[i * d + j] += outputAccumMin; + } + } + const double timeKernel2u = swKernel2u.elapsed(); + + // evaluate the error + const double error2u = getError(n, d, outputFaiss, outputKernel2u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2u << "\t" << error2u << std::endl; + + // kernels: accum 3 points, shared centroids + StopWatch swKernel3; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel3[i * d + j] += outputAccumMin; + } + } + const double timeKernel3 = swKernel3.elapsed(); + + // evaluate the error + const double error3 = getError(n, d, outputFaiss, outputKernel3); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3 << "\t" << error3 << std::endl; + + // kernels: accum 3 points, unique centroids + StopWatch swKernel3u; + for (uint64_t i = 0; i < n; i++) { + float 
outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3u.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel3u[i * d + j] += outputAccumMin; + } + } + const double timeKernel3u = swKernel3u.elapsed(); + + // evaluate the error + const double error3u = getError(n, d, outputFaiss, outputKernel3u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3u << "\t" << error3u << std::endl; + } +} + +// +template +static void verifyIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const std::shared_ptr& index, + const std::vector& encodedData, + const uint64_t nIterations) { + // + const faiss::IndexPQ* const indexQ = + dynamic_cast(index.get()); + const float* const pqFineCentroidsQ = indexQ->pq.centroids.data(); + + // + const size_t codeSize = index->sa_code_size(); + + // initialize the random engine + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // use 1 thread + omp_set_num_threads(1); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // sequential order + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + index->sa_decode(n, encodedData.data(), outputFaiss.data()); + } + double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (int iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + T::store( + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + outputKernel1.data() + i * d); + } + } + double timeKernel = swKernel.elapsed(); + + // evaluate the error + double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_seq" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random order + + // generate a random order of points + std::uniform_int_distribution un(0, n - 1); + std::vector pointIncidesToDecode(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + pointIncidesToDecode[i] = un(rng); + } + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + outputFaiss.data() + 
i * d); + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + T::store( + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + outputKernel1.data() + i * d); + } + } + const double timeKernel = swKernel.elapsed(); + + // evaluate the error + const double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random accumulate + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + std::vector outputKernel2(n * d, 0); + std::vector outputKernel2u(n * d, 0); + std::vector outputKernel3(n * d, 0); + std::vector outputKernel3u(n * d, 0); + + // a temporary buffer for faiss + std::vector tempFaiss(d, 0); + + // random weights + std::vector weights(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + weights[i] = u(rng); + } + + // faiss + StopWatch swFaiss; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + tempFaiss.data()); + for (uint64_t j = 0; j < d; j++) { + outputFaiss[i * d + j] += weight * tempFaiss[j]; + } + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels: accum 1 point + StopWatch swKernel1; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + weight, + outputKernel1.data() + i * d); + } + } + const double timeKernel1 = swKernel1.elapsed(); + + // evaluate the error + const double error1 = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel1 << "\t" << error1 << std::endl; + + // kernels: accum 2 points, shared centroids + StopWatch swKernel2; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2.data() + i * d); + } + } + const double timeKernel2 = swKernel2.elapsed(); + + // evaluate the error + const double error2 = getError(n, d, outputFaiss, outputKernel2); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2 << "\t" << error2 << std::endl; + + // kernels: accum 2 points, unique centroids + StopWatch swKernel2u; + for (uint64_t i = 0; i < n; 
i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2u.data() + i * d); + } + } + const double timeKernel2u = swKernel2u.elapsed(); + + // evaluate the error + const double error2u = getError(n, d, outputFaiss, outputKernel2u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2u << "\t" << error2u << std::endl; + + // kernels: accum 3 points, shared centroids + StopWatch swKernel3; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3.data() + i * d); + } + } + const double timeKernel3 = swKernel3.elapsed(); + + // evaluate the error + const double error3 = getError(n, d, outputFaiss, outputKernel3); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3 << "\t" << error3 << std::endl; + + // kernels: accum 3 points, unique centroids + StopWatch swKernel3u; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + pqFineCentroidsQ, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3u.data() + i * d); + } + } + const double timeKernel3u = swKernel3u.elapsed(); + + // evaluate the error + const double error3u = getError(n, d, outputFaiss, outputKernel3u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3u << "\t" << error3u << std::endl; + } +} + +// +template +static void verifyMinMaxIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const std::shared_ptr& index, + const std::vector& encodedData, + const uint64_t nIterations) { + // extract an index that is wrapped with IndexRowwiseMinMaxBase + const std::shared_ptr 
indexMinMax = + std::dynamic_pointer_cast(index); + + auto subIndex = indexMinMax->index; + + // + const faiss::IndexPQ* const indexQ = + dynamic_cast(subIndex); + const float* const pqFineCentroidsQ = indexQ->pq.centroids.data(); + + // + const size_t codeSize = index->sa_code_size(); + // initialize the random engine + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // use 1 thread + omp_set_num_threads(1); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // sequential order + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + index->sa_decode(n, encodedData.data(), outputFaiss.data()); + } + double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (int iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + T::store( + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + outputKernel1.data() + i * d); + } + } + double timeKernel = swKernel.elapsed(); + + // evaluate the error + double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_seq" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random order + + // generate a random order of points + std::uniform_int_distribution un(0, n - 1); + std::vector pointIncidesToDecode(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + pointIncidesToDecode[i] = un(rng); + } + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + + // faiss + StopWatch swFaiss; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + outputFaiss.data() + i * d); + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels + StopWatch swKernel; + for (uint64_t iter = 0; iter < nIterations; iter++) { + for (uint64_t i = 0; i < n; i++) { + const auto pointIdx = pointIncidesToDecode[i + iter * n]; + T::store( + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + outputKernel1.data() + i * d); + } + } + const double timeKernel = swKernel.elapsed(); + + // evaluate the error + const double error = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "store_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel << "\t" << error << std::endl; + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// + // random accumulate + + { + std::vector outputFaiss(n * d, 0); + std::vector outputKernel1(n * d, 0); + std::vector outputKernel2(n * d, 0); + std::vector outputKernel2u(n * d, 0); + std::vector outputKernel3(n * d, 0); + std::vector outputKernel3u(n * d, 0); + + // a temporary buffer for faiss + std::vector tempFaiss(d, 0); + + // random weights + std::vector weights(nIterations * n, 0); + for (uint64_t i = 0; i < nIterations * n; i++) { + weights[i] = u(rng); + } + + // faiss + StopWatch swFaiss; + for (uint64_t i = 0; i < n; i++) { + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto 
pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + index->sa_decode( + 1, + encodedData.data() + pointIdx * codeSize, + tempFaiss.data()); + for (uint64_t j = 0; j < d; j++) { + outputFaiss[i * d + j] += weight * tempFaiss[j]; + } + } + } + const double timeFaiss = swFaiss.elapsed(); + + // kernels: accum 1 point + StopWatch swKernel1; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter++) { + const auto pointIdx = + pointIncidesToDecode[i * nIterations + iter]; + const auto weight = weights[i * nIterations + iter]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx * codeSize, + weight, + outputKernel1.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel1[i * d + j] += outputAccumMin; + } + } + const double timeKernel1 = swKernel1.elapsed(); + + // evaluate the error + const double error1 = getError(n, d, outputFaiss, outputKernel1); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel1 << "\t" << error1 << std::endl; + + // kernels: accum 2 points, shared centroids + StopWatch swKernel2; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel2[i * d + j] += outputAccumMin; + } + } + const double timeKernel2 = swKernel2.elapsed(); + + // evaluate the error + const double error2 = getError(n, d, outputFaiss, outputKernel2); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2 << "\t" << error2 << std::endl; + + // kernels: accum 2 points, unique centroids + StopWatch swKernel2u; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter += 2) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + outputKernel2u.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel2u[i * d + j] += outputAccumMin; + } + } + const double timeKernel2u = swKernel2u.elapsed(); + + // evaluate the error + const double error2u = getError(n, d, outputFaiss, outputKernel2u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum2u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel2u << "\t" << error2u << std::endl; + + // kernels: accum 3 points, shared centroids + StopWatch swKernel3; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + 
for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + encodedData.data() + pointIdx1 * codeSize, + weight1, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel3[i * d + j] += outputAccumMin; + } + } + const double timeKernel3 = swKernel3.elapsed(); + + // evaluate the error + const double error3 = getError(n, d, outputFaiss, outputKernel3); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3 << "\t" << error3 << std::endl; + + // kernels: accum 3 points, unique centroids + StopWatch swKernel3u; + for (uint64_t i = 0; i < n; i++) { + float outputAccumMin = 0; + for (uint64_t iter = 0; iter < nIterations; iter += 3) { + const auto pointIdx0 = + pointIncidesToDecode[i * nIterations + iter + 0]; + const auto weight0 = weights[i * nIterations + iter + 0]; + const auto pointIdx1 = + pointIncidesToDecode[i * nIterations + iter + 1]; + const auto weight1 = weights[i * nIterations + iter + 1]; + const auto pointIdx2 = + pointIncidesToDecode[i * nIterations + iter + 2]; + const auto weight2 = weights[i * nIterations + iter + 2]; + T::accum( + pqFineCentroidsQ, + encodedData.data() + pointIdx0 * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + pointIdx1 * codeSize, + weight1, + pqFineCentroidsQ, + encodedData.data() + pointIdx2 * codeSize, + weight2, + outputKernel3u.data() + i * d, + outputAccumMin); + } + for (uint64_t j = 0; j < d; j++) { + outputKernel3u[i * d + j] += outputAccumMin; + } + } + const double timeKernel3u = swKernel3u.elapsed(); + + // evaluate the error + const double error3u = getError(n, d, outputFaiss, outputKernel3u); + + std::cout << description << "\t" << n << "\t" << d << "\t" + << "accum3u_rnd" + << "\t" << nIterations << "\t" << timeFaiss << "\t" + << timeKernel3u << "\t" << error3u << std::endl; + } +} + +template +void testIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const uint64_t nIterations) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyIndex2LevelDecoder( + n, d, description, index, encodedData, nIterations); +} + +template +void testMinMaxIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const uint64_t nIterations) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyMinMaxIndex2LevelDecoder( + n, d, description, index, encodedData, nIterations); +} + +template +void testIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const uint64_t nIterations) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, 
d, description); + + verifyIndexPQDecoder(n, d, description, index, encodedData, nIterations); +} + +template +void testMinMaxIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description, + const uint64_t nIterations) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyMinMaxIndexPQDecoder( + n, d, description, index, encodedData, nIterations); +} + +// +int main(int argc, char** argv) { + // 1 MB points + const uint64_t INDEX_SIZE = 65536 * 16; + const uint64_t N_ITERATIONS = 18; + + static_assert( + (N_ITERATIONS % 6) == 0, "Number of iterations should be 6*x"); + + // print the header + std::cout << "Codec\t" + << "n\t" + << "d\t" + << "Experiment\t" + << "Iterations\t" + << "Faiss time\t" + << "SADecodeKernel time\t" + << "Error" << std::endl; + + // The following experiment types are available: + // * store_seq - decode a contiguous block of codes into vectors, one by one + // * store_rnd - decode a contiguous block of codes into vectors in a random + // order + // * accum_rnd - create a linear combination from decoded vectors, + // random order + // * accum2_rnd - create a linear combination from decoded vectors, + // random order, decode 2 codes per call, centroid tables are shared + // * accum2u_rnd - create a linear combination from decoded vectors, + // random order, decode 2 codes per call, centroid tables are not shared + // * accum3_rnd - create a linear combination from decoded vectors, + // random order, decode 3 codes per call, centroid tables are shared + // * accum3u_rnd - create a linear combination from decoded vectors, + // random order, decode 3 codes per call, centroid tables are not shared + // + // It is expected that: + // * store_seq is faster than store_rnd + // * accum2 is faster than accum + // * accum3 is faster than accum2 + + // test plain PQx8 + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 2>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ64np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 4>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ32np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 8>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ16np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 16>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ8np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 32>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ4np", N_ITERATIONS); + } + + // test PQx10 + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 2, 10>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ64x10np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 4, 10>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ32x10np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 8, 10>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ16x10np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 16, 10>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ8x10np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::IndexPQDecoder<128, 32, 10>; + testIndexPQDecoder(INDEX_SIZE, 128, "PQ4x10np", N_ITERATIONS); + } + + // test MinMaxFP16,PQx8 + { + using SubT = faiss::cppcontrib::IndexPQDecoder<128, 2>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndexPQDecoder( + INDEX_SIZE, 128, "MinMaxFP16,PQ64np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::IndexPQDecoder<128, 
4>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndexPQDecoder( + INDEX_SIZE, 128, "MinMaxFP16,PQ32np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::IndexPQDecoder<128, 8>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndexPQDecoder( + INDEX_SIZE, 128, "MinMaxFP16,PQ16np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::IndexPQDecoder<128, 16>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndexPQDecoder( + INDEX_SIZE, 128, "MinMaxFP16,PQ8np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::IndexPQDecoder<128, 32>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndexPQDecoder( + INDEX_SIZE, 128, "MinMaxFP16,PQ4np", N_ITERATIONS); + } + + // test IVFPQ + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "IVF256,PQ64np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 4>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "IVF256,PQ32np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 8>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "IVF256,PQ16np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 16>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "IVF256,PQ8np", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 32>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "IVF256,PQ4np", N_ITERATIONS); + } + + // test Residual,PQ + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 2>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual4x8,PQ64", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 4>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual4x8,PQ32", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 8>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual4x8,PQ16", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 16>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual4x8,PQ8", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 32>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual4x8,PQ4", N_ITERATIONS); + } + + // test MinMaxFP16,IVFPQ + { + using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + INDEX_SIZE, 128, "MinMaxFP16,IVF256,PQ64np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 4>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + INDEX_SIZE, 128, "MinMaxFP16,IVF256,PQ32np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 8>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + INDEX_SIZE, 128, "MinMaxFP16,IVF256,PQ16np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 16>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + INDEX_SIZE, 128, "MinMaxFP16,IVF256,PQ8np", N_ITERATIONS); + } + { + using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 32>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + INDEX_SIZE, 128, "MinMaxFP16,IVF256,PQ4np", N_ITERATIONS); + } + + // test Residual,PQ with unusual bits + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 
2, 16, 10>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual1x10,PQ64x10", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 4, 16, 10>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual1x10,PQ32x10", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 8, 16, 10>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual1x10,PQ16x10", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 16, 16, 10>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual1x10,PQ8x10", N_ITERATIONS); + } + { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 32, 16, 10>; + testIndex2LevelDecoder( + INDEX_SIZE, 128, "Residual1x10,PQ4x10", N_ITERATIONS); + } + + return 0; +} diff --git a/thirdparty/faiss/benchs/bench_gpu_1bn.py b/thirdparty/faiss/benchs/bench_gpu_1bn.py index 362b10387..94e68d2fa 100644 --- a/thirdparty/faiss/benchs/bench_gpu_1bn.py +++ b/thirdparty/faiss/benchs/bench_gpu_1bn.py @@ -13,7 +13,7 @@ import faiss import re -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool from datasets import ivecs_read #################################################################### diff --git a/thirdparty/faiss/benchs/bench_gpu_sift1m.py b/thirdparty/faiss/benchs/bench_gpu_sift1m.py index da40f7025..5372d1bd0 100644 --- a/thirdparty/faiss/benchs/bench_gpu_sift1m.py +++ b/thirdparty/faiss/benchs/bench_gpu_sift1m.py @@ -85,7 +85,8 @@ for lnprobe in range(10): nprobe = 1 << lnprobe - index.setNumProbes(nprobe) + index.nprobe + index.nprobe = nprobe t, r = evaluate(index, xq, gt, 100) print("nprobe=%4d %.3f ms recalls= %.4f %.4f %.4f" % (nprobe, t, r[1], r[10], r[100])) diff --git a/thirdparty/faiss/benchs/bench_hamming_computer.cpp b/thirdparty/faiss/benchs/bench_hamming_computer.cpp index 5e856b8bd..36da7a102 100644 --- a/thirdparty/faiss/benchs/bench_hamming_computer.cpp +++ b/thirdparty/faiss/benchs/bench_hamming_computer.cpp @@ -9,6 +9,8 @@ #include #include +#include + #include #include #include @@ -16,6 +18,66 @@ using namespace faiss; +// These implementations are currently slower than HammingComputerDefault so +// they are not in the main faiss anymore. 
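Before the two legacy computers are defined, here is a minimal usage sketch of how any computer with this interface is driven, using faiss::HammingComputerDefault for concreteness. This is an illustrative aside, not part of the benchmark: it assumes the faiss hamming header already included by this file plus <vector>/<cstdint> are in scope, and the buffer sizes and fill bytes are made-up placeholder values.

// Sketch only: bind one query code, then compute Hamming distances to a few
// database codes. A real caller would keep the k smallest distances.
inline void example_hamming_computer_usage() {
    const int code_size = 64;                         // bytes per code (illustrative)
    std::vector<uint8_t> query(code_size, 0x5a);      // placeholder query code
    std::vector<uint8_t> db(4 * code_size, 0xa5);     // 4 placeholder database codes

    faiss::HammingComputerDefault hc(query.data(), code_size);  // bind the query once
    for (int j = 0; j < 4; j++) {
        // distance from the query code to database code j
        int dist = hc.hamming(db.data() + j * code_size);
        (void)dist;
    }
}

The two structs below follow the same set()/hamming() contract, which is why the templated hamming_computer_test further down can time them interchangeably.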
+struct HammingComputerM8 { + const uint64_t* a; + int n; + + HammingComputerM8() = default; + + HammingComputerM8(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +struct HammingComputerM4 { + const uint32_t* a; + int n; + + HammingComputerM4() = default; + + HammingComputerM4(const uint8_t* a4, int code_size) { + set(a4, code_size); + } + + void set(const uint8_t* a4, int code_size) { + assert(code_size % 4 == 0); + a = (uint32_t*)a4; + n = code_size / 4; + } + + int hamming(const uint8_t* b8) const { + const uint32_t* b = (uint32_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += popcount64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 4; + } +}; + template void hamming_cpt_test( int code_size, @@ -30,6 +92,114 @@ void hamming_cpt_test( } } +template +void hamming_func_test( + const uint8_t* const x1, + const uint8_t* const x2, + const size_t n1, + const size_t n2, + uint64_t& sumv, + uint64_t& xorv) { + constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8; + + double t0 = faiss::getmillisecs(); + + uint64_t sumx = 0; + uint64_t xorx = 0; + + const size_t nruns = 10; + for (size_t irun = 0; irun < 10; irun++) { +#pragma omp parallel reduction(+ : sumx, xorx) + { +#pragma omp for + for (size_t i = 0; i < n1; i++) { + uint64_t local_sum = 0; + uint64_t local_xor = 0; + + const uint64_t* data1_ptr = + (const uint64_t*)(x1 + i * CODE_SIZE_IN_BYTES); + + for (size_t j = 0; j < n2; j++) { + const uint64_t* data2_ptr = + (const uint64_t*)(x2 + j * CODE_SIZE_IN_BYTES); + + uint64_t code = faiss::hamming( + data1_ptr, data2_ptr); + local_sum += code; + local_xor ^= code; + } + + sumx += local_sum; + xorx ^= local_xor; + } + } + } + + sumv = sumx; + xorv = xorx; + + double t1 = faiss::getmillisecs(); + printf("hamming<%d>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n", + CODE_SIZE_IN_BITS, + (t1 - t0) / nruns, + sumx, + xorx); +} + +template +void hamming_computer_test( + const uint8_t* const x1, + const uint8_t* const x2, + const size_t n1, + const size_t n2, + uint64_t& sumv, + uint64_t& xorv) { + constexpr size_t CODE_SIZE_IN_BYTES = CODE_SIZE_IN_BITS / 8; + + double t0 = faiss::getmillisecs(); + + uint64_t sumx = 0; + uint64_t xorx = 0; + + const size_t nruns = 10; + for (size_t irun = 0; irun < nruns; irun++) { + sumx = 0; + xorx = 0; + +#pragma omp parallel reduction(+ : sumx, xorx) + { +#pragma omp for + for (size_t i = 0; i < n1; i++) { + uint64_t local_sum = 0; + uint64_t local_xor = 0; + + const uint8_t* data1_ptr = x1 + i * CODE_SIZE_IN_BYTES; + HammingComputerT hc(data1_ptr, CODE_SIZE_IN_BYTES); + + for (size_t j = 0; j < n2; j++) { + const uint8_t* data2_ptr = x2 + j * CODE_SIZE_IN_BYTES; + uint64_t code = hc.hamming(data2_ptr); + local_sum += code; + local_xor ^= code; + } + + sumx += local_sum; + xorx ^= local_xor; + } + } + } + + sumv = sumx; + xorv = xorx; + + double t1 = faiss::getmillisecs(); + printf("HammingComputer<%zd>: %.3f msec, %" PRIX64 ", %" PRIX64 "\n", + CODE_SIZE_IN_BYTES, + (t1 - t0) / nruns, + sumx, + xorx); +} + int main() { size_t n = 4 * 1000 * 1000; @@ -89,5 +259,57 @@ int main() { printf("Hamming_M8 implem: %.3f ms\n", tot_t2 / 
nrun); printf("Hamming_M4 implem: %.3f ms\n", tot_t3 / nrun); } + + // evaluate various hamming<>() function calls + const size_t MAX_HAMMING_FUNC_CODE_SIZE = 512; + + const size_t n1 = 65536; + const size_t n2 = 16384; + + std::vector x1(n1 * MAX_HAMMING_FUNC_CODE_SIZE / 8); + std::vector x2(n2 * MAX_HAMMING_FUNC_CODE_SIZE / 8); + byte_rand(x1.data(), x1.size(), 12345); + byte_rand(x2.data(), x2.size(), 23456); + + // These two values serve as a kind of CRC. + uint64_t sumx = 0; + uint64_t xorx = 0; + hamming_func_test<64>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<128>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<256>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<384>(x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_func_test<512>(x1.data(), x2.data(), n1, n2, sumx, xorx); + + // evaluate various HammingComputerXX + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + + // evaluate various GenHammingDistanceComputerXX + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + hamming_computer_test( + x1.data(), x2.data(), n1, n2, sumx, xorx); + return 0; } diff --git a/thirdparty/faiss/benchs/bench_hamming_knn.py b/thirdparty/faiss/benchs/bench_hamming_knn.py new file mode 100644 index 000000000..3a2e9b023 --- /dev/null +++ b/thirdparty/faiss/benchs/bench_hamming_knn.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import time +import numpy as np +import faiss + +if __name__ == "__main__": + faiss.omp_set_num_threads(1) + + for d in 4, 8, 16, 13: + nq = 10000 + nb = 30000 + print('Bits per vector = 8 *', d) + xq = faiss.randint((nq, d // 4), seed=1234, vmax=256**4).view('uint8') + xb = faiss.randint((nb, d // 4), seed=1234, vmax=256**4).view('uint8') + for variant in "hc", "mc": + print(f"{variant=:}", end="\t") + for k in 1, 4, 16, 64, 256: + times = [] + for _run in range(5): + t0 = time.time() + D, I = faiss.knn_hamming(xq, xb, k, variant=variant) + t1 = time.time() + times.append(t1 - t0) + print(f'| {k=:} t={np.mean(times):.3f} s ± {np.std(times):.3f} ', flush=True, end="") + print() diff --git a/thirdparty/faiss/benchs/bench_hnsw.py b/thirdparty/faiss/benchs/bench_hnsw.py index e52c7d42a..7c5620bed 100644 --- a/thirdparty/faiss/benchs/bench_hnsw.py +++ b/thirdparty/faiss/benchs/bench_hnsw.py @@ -17,7 +17,7 @@ k = int(sys.argv[1]) -todo = sys.argv[1:] +todo = sys.argv[2:] print("load data") diff --git a/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py b/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py new file mode 100644 index 000000000..8a509f323 --- /dev/null +++ b/thirdparty/faiss/benchs/bench_hybrid_cpu_gpu.py @@ -0,0 +1,606 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import pickle +import time +from multiprocessing.pool import ThreadPool + +import faiss +import numpy as np + +try: + from faiss.contrib.datasets_fb import dataset_from_name +except ImportError: + from faiss.contrib.datasets import dataset_from_name + +from faiss.contrib.evaluation import OperatingPointsWithRanges +from faiss.contrib.ivf_tools import replace_ivf_quantizer + +################################################################# +# Preassigned search functions +################################################################# + + +def search_preassigned(xq, k, index, quantizer, batch_size=0): + """ + Explicitly call the coarse quantizer and the search_preassigned + on the index. + """ + n, d = xq.shape + nprobe = index.nprobe + if batch_size == 0: + batch_size = n + 1 + D = np.empty((n, k), dtype='float32') + I = np.empty((n, k), dtype='int64') + for i0 in range(0, n, batch_size): + Dq, Iq = quantizer.search(xq[i0:i0 + batch_size], nprobe) + D[i0:i0 + batch_size], I[i0:i0 + batch_size] = \ + index.search_preassigned(xq[i0:i0 + batch_size], k, Iq, Dq) + return D, I + + +def tiled_search_preassigned(xq, k, index, quantizer, batch_size=32768): + """ + Explicitly call the coarse quantizer and the search_preassigned + on the index. Allow overlapping between coarse quantization and + scanning the inverted lists. 
+ """ + n, d = xq.shape + + # prepare a thread that will run the quantizer + qq_pool = ThreadPool(1) + nprobe = index.nprobe + + def coarse_quant(i0): + if i0 >= n: + return None + i1 = min(i0 + batch_size, n) + return quantizer.search(xq[i0:i1], nprobe) + + D = np.empty((n, k), dtype='float32') + I = np.empty((n, k), dtype='int64') + qq = coarse_quant(0) + + for i0 in range(0, n, batch_size): + i1 = min(i0 + batch_size, n) + qq_next = qq_pool.apply_async(coarse_quant, (i0 + batch_size, )) + Dq, Iq = qq + index.search_preassigned( + xq[i0:i1], k, Iq=Iq, Dq=Dq, I=I[i0:i1], D=D[i0:i1]) + qq = qq_next.get() + + qq_pool.close() + return D, I + + +################################################################# +# IVF index objects with a separate coarse quantizer +################################################################# + +class SeparateCoarseQuantizationIndex: + """ + Separately manage the coarse quantizer and the IVF index. + """ + + def __init__(self, quantizer, index, bs=-1, seq_tiling=False): + self.index = index + self.index_ivf = extract_index_ivf(index) + if isinstance(self.index_ivf, faiss.IndexIVF): + self.index_ivf.parallel_mode + self.index_ivf.parallel_mode = 3 + + self.quantizer = quantizer + assert self.quantizer.d == self.index_ivf.d + # populate quantizer if it was not done before + if quantizer.ntotal > 0: + assert quantizer.ntotal == self.index_ivf.nlist + else: + centroids = self.index_ivf.quantizer.reconstruct_n() + print(f"adding centroids size {centroids.shape} to quantizer") + quantizer.train(centroids) + quantizer.add(centroids) + self.bs = bs + self.seq_tiling = seq_tiling + + def search(self, xq, k): + # perform coarse quantization + if isinstance(self.index, faiss.IndexPreTransform): + # print("applying pre-transform") + assert self.index.chain.size() == 1 + xq = self.index.chain.at(0).apply(xq) + if self.bs <= 0: + # non batched + nprobe = self.index_ivf.nprobe + Dq, Iq = self.quantizer.search(xq, nprobe) + + return self.index_ivf.search_preassigned(xq, k, Iq, Dq) + if self.seq_tiling: + return search_preassigned( + xq, k, self.index_ivf, self.quantizer, self.bs) + else: + return tiled_search_preassigned( + xq, k, self.index_ivf, self.quantizer, self.bs) + + +class ShardedGPUIndex: + """ + Multiple GPU indexes, each on its GPU, with a common coarse quantizer. 
+ The Python version of IndexShardsIVF + """ + def __init__(self, quantizer, index, bs=-1, seq_tiling=False): + self.quantizer = quantizer + self.cpu_index = index + if isinstance(index, faiss.IndexPreTransform): + index = faiss.downcast_index(index.index) + ngpu = index.count() + self.pool = ThreadPool(ngpu) + self.bs = bs + if bs > 0: + self.q_pool = ThreadPool(1) + + def __del__(self): + self.pool.close() + if self.bs > 0: + self.q_pool.close() + + def search(self, xq, k): + nq = len(xq) + # perform coarse quantization + index = self.cpu_index + if isinstance(self.cpu_index, faiss.IndexPreTransform): + assert index.chain.size() == 1 + xq = self.cpu_index.chain.at(0).apply(xq) + index = faiss.downcast_index(index.index) + ngpu = index.count() + sub_index_0 = faiss.downcast_index(index.at(0)) + nprobe = sub_index_0.nprobe + + Dall = np.empty((ngpu, nq, k), dtype='float32') + Iall = np.empty((ngpu, nq, k), dtype='int64') + bs = self.bs + if bs <= 0: + + Dq, Iq = self.quantizer.search(xq, nprobe) + + def do_search(rank): + gpu_index = faiss.downcast_index(index.at(rank)) + Dall[rank], Iall[rank] = gpu_index.search_preassigned( + xq, k, Iq, Dq) + list(self.pool.map(do_search, range(ngpu))) + else: + qq_pool = self.q_pool + bs = self.bs + + def coarse_quant(i0): + if i0 >= nq: + return None + return self.quantizer.search(xq[i0:i0 + bs], nprobe) + + def do_search(rank, i0, qq): + gpu_index = faiss.downcast_index(index.at(rank)) + Dq, Iq = qq + Dall[rank, i0:i0 + bs], Iall[rank, i0:i0 + bs] = \ + gpu_index.search_preassigned(xq[i0:i0 + bs], k, Iq, Dq) + + qq = coarse_quant(0) + + for i0 in range(0, nq, bs): + qq_next = qq_pool.apply_async(coarse_quant, (i0 + bs, )) + list(self.pool.map( + lambda rank: do_search(rank, i0, qq), + range(ngpu) + )) + qq = qq_next.get() + + return faiss.merge_knn_results(Dall, Iall) + + +def extract_index_ivf(index): + """ extract the IVF sub-index from the index, supporting GpuIndexes + as well """ + try: + return faiss.extract_index_ivf(index) + except RuntimeError: + if index.__class__ == faiss.IndexPreTransform: + index = faiss.downcast_index(index.index) + if isinstance(index, faiss.GpuIndexIVF): + return index + raise RuntimeError(f"could not extract IVF index from {index}") + + +def set_index_parameter(index, name, val): + """ + Index parameter setting that works on the index lookalikes defined above + """ + if index.__class__ == SeparateCoarseQuantizationIndex: + if name == "nprobe": + set_index_parameter(index.index_ivf, name, val) + elif name.startswith("quantizer_"): + set_index_parameter( + index.quantizer, name[name.find("_") + 1:], val) + else: + raise RuntimeError() + return + + if index.__class__ == ShardedGPUIndex: + if name == "nprobe": + set_index_parameter(index.cpu_index, name, val) + elif name.startswith("quantizer_"): + set_index_parameter( + index.quantizer, name[name.find("_") + 1:], val) + else: + raise RuntimeError() + return + + # then it's a Faiss index + index = faiss.downcast_index(index) + + if isinstance(index, faiss.IndexPreTransform): + set_index_parameter(index.index, name, val) + elif isinstance(index, faiss.IndexShardsIVF): + if name != "nprobe" and name.startswith("quantizer_"): + set_index_parameter( + index.quantizer, name[name.find("_") + 1:], val) + else: + for i in range(index.count()): + sub_index = index.at(i) + set_index_parameter(sub_index, name, val) + elif (isinstance(index, faiss.IndexShards) or + isinstance(index, faiss.IndexReplicas)): + for i in range(index.count()): + sub_index = index.at(i) + 
set_index_parameter(sub_index, name, val) + elif name.startswith("quantizer_"): + index_ivf = extract_index_ivf(index) + set_index_parameter( + index_ivf.quantizer, name[name.find("_") + 1:], val) + elif name == "efSearch": + index.hnsw.efSearch + index.hnsw.efSearch = int(val) + elif name == "nprobe": + index_ivf = extract_index_ivf(index) + index_ivf.nprobe + index_ivf.nprobe = int(val) + else: + raise RuntimeError(f"could not set param {name} on {index}") + + +##################################################################### +# Driver routine +##################################################################### + + +def main(): + parser = argparse.ArgumentParser() + + def aa(*args, **kwargs): + group.add_argument(*args, **kwargs) + + group = parser.add_argument_group('dataset options') + aa('--nq', type=int, default=int(10e5), + help="nb queries (queries will be duplicated if below that number") + aa('--db', default='bigann10M', help='dataset') + + group = parser.add_argument_group('index options') + aa('--indexname', default="", help="override index name") + aa('--mmap', default=False, action='store_true', help='mmap index') + aa('--shard_type', default=1, type=int, help="set type of sharding") + aa('--useFloat16', default=False, action='store_true', + help='GPU cloner options') + aa('--useFloat16CoarseQuantizer', default=False, action='store_true', + help='GPU cloner options') + aa('--usePrecomputed', default=False, action='store_true', + help='GPU cloner options') + group = parser.add_argument_group('search options') + aa('--k', type=int, default=100) + aa('--search_type', default="cpu", + choices=[ + "cpu", "gpu", "gpu_flat_quantizer", + "cpu_flat_gpu_quantizer", "gpu_tiled", "gpu_ivf_quantizer", + "multi_gpu", "multi_gpu_flat_quantizer", + "multi_gpu_sharded", "multi_gpu_flat_quantizer_sharded", + "multi_gpu_sharded1", "multi_gpu_sharded1_flat", + "multi_gpu_sharded1_ivf", + "multi_gpu_Csharded1", "multi_gpu_Csharded1_flat", + "multi_gpu_Csharded1_ivf", + ], + help="how to search" + ) + aa('--ivf_quant_nlist', type=int, default=1024, + help="nb of invlists for IVF quantizer") + aa('--batch_size', type=int, default=-1, + help="batch size for tiled CPU / GPU computation (-1= no tiling)") + aa('--n_autotune', type=int, default=300, + help="max nb of auto-tuning steps") + aa('--nt', type=int, default=-1, help="force number of CPU threads to this") + + group = parser.add_argument_group('output options') + aa('--quiet', default=False, action="store_true") + aa('--stats', default="", help="pickle to store output stats") + + args = parser.parse_args() + print("args:", args) + + if not args.quiet: + # log some stats about the machine + os.system("grep -m1 'model name' < /proc/cpuinfo") + os.system("grep -E 'MemTotal|MemFree' /proc/meminfo") + os.system("nvidia-smi") + + print("prepare dataset", args.db) + ds = dataset_from_name(args.db) + print(ds) + + print("Faiss nb GPUs:", faiss.get_num_gpus()) + + xq = ds.get_queries() + if args.nq > len(xq): + xqx = [] + n = 0 + while n < args.nq: + xqx.append(xq[:args.nq - n]) + n += len(xqx[-1]) + print(f"increased nb queries from {len(xq)} to {n}") + xq = np.vstack(xqx) + + if args.nt != -1: + print("setting nb openmp threads to", args.nt) + faiss.omp_set_num_threads(args.nt) + + print("loading index") + + if args.mmap: + io_flag = faiss.IO_FLAG_READ_ONLY | faiss.IO_FLAG_MMAP + else: + io_flag = 0 + + print(f"load index {args.indexname} {io_flag=:x}") + index = faiss.read_index(args.indexname, io_flag) + index_ivf = 
faiss.extract_index_ivf(index) + + print("prepare index") + op = OperatingPointsWithRanges() + op.add_range( + "nprobe", [ + 2 ** i for i in range(20) + if 2 ** i < index_ivf.nlist * 0.1 and 2 ** i <= 4096 + ] + ) + + # prepare options for GPU clone + + co = faiss.GpuMultipleClonerOptions() + co.useFloat16 = args.useFloat16 + co.useFloat16CoarseQuantizer = args.useFloat16CoarseQuantizer + co.usePrecomputed = args.usePrecomputed + co.shard_type = args.shard_type + + if args.search_type == "cpu": + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + elif args.search_type == "gpu": + print("move index to 1 GPU") + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, co) + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + op.restrict_range("nprobe", 2049) + elif args.search_type == "gpu_tiled": + print("move index to 1 GPU") + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer) + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, co) + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + op.restrict_range("nprobe", 2049) + index = SeparateCoarseQuantizationIndex( + quantizer_hnsw, index, bs=args.batch_size) + elif args.search_type == "gpu_ivf_quantizer": + index_ivf = faiss.extract_index_ivf(index) + centroids = index_ivf.quantizer.reconstruct_n() + replace_ivf_quantizer(index_ivf, faiss.IndexFlatL2(index_ivf.d)) + res = faiss.StandardGpuResources() + new_quantizer = faiss.index_factory( + index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat") + new_quantizer.train(centroids) + new_quantizer.add(centroids) + index = SeparateCoarseQuantizationIndex( + faiss.index_cpu_to_gpu(res, 0, new_quantizer, co), + faiss.index_cpu_to_gpu(res, 0, index, co), + bs=args.batch_size, seq_tiling=True + ) + op.add_range( + "quantizer_nprobe", + [2 ** i for i in range(9)] + ) + op.restrict_range("nprobe", 1025) + elif args.search_type == "gpu_flat_quantizer": + index_ivf = faiss.extract_index_ivf(index) + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + replace_ivf_quantizer(index_ivf, new_quantizer) + res = faiss.StandardGpuResources() + index = faiss.index_cpu_to_gpu(res, 0, index, co) + op.restrict_range("nprobe", 2049) + elif args.search_type == "cpu_flat_gpu_quantizer": + index_ivf = faiss.extract_index_ivf(index) + quantizer = faiss.IndexFlatL2(index_ivf.d) + res = faiss.StandardGpuResources() + quantizer = faiss.index_cpu_to_gpu(res, 0, quantizer, co) + index = SeparateCoarseQuantizationIndex( + quantizer, index, bs=args.batch_size) + op.restrict_range("nprobe", 2049) + elif args.search_type in ("multi_gpu", "multi_gpu_sharded"): + print(f"move index to {faiss.get_num_gpus()} GPU") + co.shard = "sharded" in args.search_type + index = faiss.index_cpu_to_all_gpus(index, co=co) + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + op.restrict_range("nprobe", 2049) + elif args.search_type in ( + "multi_gpu_flat_quantizer", "multi_gpu_flat_quantizer_sharded"): + index_ivf = faiss.extract_index_ivf(index) + new_quantizer = faiss.IndexFlatL2(ds.d) + replace_ivf_quantizer(index_ivf, new_quantizer) + index = faiss.index_cpu_to_all_gpus(index, co=co) + op.restrict_range("nprobe", 2049) + elif args.search_type in ( + "multi_gpu_sharded1", "multi_gpu_sharded1_flat", + "multi_gpu_sharded1_ivf"): + print(f"move index to {faiss.get_num_gpus()} GPU") + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + hnsw_quantizer = 
replace_ivf_quantizer(index_ivf, new_quantizer) + co.shard + co.shard = True + gpus = list(range(faiss.get_num_gpus())) + res = [faiss.StandardGpuResources() for _ in gpus] + index = faiss.index_cpu_to_gpu_multiple_py(res, index, co, gpus) + op.restrict_range("nprobe", 2049) + if args.search_type == "multi_gpu_sharded1": + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + index = ShardedGPUIndex(hnsw_quantizer, index, bs=args.batch_size) + elif args.search_type == "multi_gpu_sharded1_ivf": + centroids = hnsw_quantizer.storage.reconstruct_n() + quantizer = faiss.index_factory( + centroids.shape[1], f"IVF{args.ivf_quant_nlist},Flat") + quantizer.train(centroids) + quantizer.add(centroids) + co.shard = False + quantizer = faiss.index_cpu_to_gpu_multiple_py( + res, quantizer, co, gpus) + index = ShardedGPUIndex(quantizer, index, bs=args.batch_size) + + op.add_range( + "quantizer_nprobe", + [2 ** i for i in range(9)] + ) + op.restrict_range("nprobe", 1025) + elif args.search_type == "multi_gpu_sharded1_flat": + quantizer = hnsw_quantizer.storage + quantizer = faiss.index_cpu_to_gpu_multiple_py( + res, quantizer, co, gpus) + index = ShardedGPUIndex(quantizer, index, bs=args.batch_size) + else: + raise RuntimeError() + elif args.search_type in ( + "multi_gpu_Csharded1", "multi_gpu_Csharded1_flat", + "multi_gpu_Csharded1_ivf"): + print(f"move index to {faiss.get_num_gpus()} GPU") + co.shard = True + co.common_ivf_quantizer + co.common_ivf_quantizer = True + op.restrict_range("nprobe", 2049) + if args.search_type == "multi_gpu_Csharded1": + op.add_range( + "quantizer_efSearch", + [2 ** i for i in range(10)] + ) + index = faiss.index_cpu_to_all_gpus(index, co) + elif args.search_type == "multi_gpu_Csharded1_flat": + new_quantizer = faiss.IndexFlatL2(index_ivf.d) + quantizer_hnsw = replace_ivf_quantizer(index_ivf, new_quantizer) + index = faiss.index_cpu_to_all_gpus(index, co) + elif args.search_type == "multi_gpu_Csharded1_ivf": + quantizer = faiss.index_factory( + index_ivf.d, f"IVF{args.ivf_quant_nlist},Flat") + quantizer_hnsw = replace_ivf_quantizer(index_ivf, quantizer) + op.add_range( + "quantizer_nprobe", + [2 ** i for i in range(9)] + ) + index = faiss.index_cpu_to_all_gpus(index, co) + else: + raise RuntimeError() + else: + raise RuntimeError() + + totex = op.num_experiments() + rs = np.random.RandomState(123) + if totex < args.n_autotune: + experiments = rs.permutation(totex - 2) + 1 + else: + experiments = rs.randint( + totex - 2, size=args.n_autotune - 2, replace=False) + + experiments = [0, totex - 1] + list(experiments) + print(f"total nb experiments {totex}, running {len(experiments)}") + + print("perform search") + gt = ds.get_groundtruth(100) + + # piggyback on operating points so that this gets stored in the stats file + op.all_experiments = [] + op.platform = { + "loadavg": open("/proc/loadavg", "r").readlines(), + "procesor": [l for l in open("/proc/cpuinfo") if "model name" in l][0], + "GPU": list(os.popen("nvidia-smi", "r")), + "mem": open("/proc/meminfo", "r").readlines(), + "pid": os.getpid() + } + op.args = args + if args.stats: + print(f"storing stats in {args.stats} after each experiment") + + for cno in experiments: + key = op.cno_to_key(cno) + parameters = op.get_parameters(key) + print(f"{cno=:4d} {str(parameters):50}", end=": ", flush=True) + + (max_perf, min_time) = op.predict_bounds(key) + if not op.is_pareto_optimal(max_perf, min_time): + print(f"SKIP, {max_perf=:.3f} {min_time=:.3f}", ) + continue + + for name, val in parameters.items(): + 
set_index_parameter(index, name, val) + + if cno == 0: + # warmup + for _ in range(5): + D, I = index.search(xq, 100) + + t0 = time.time() + try: + D, I = index.search(xq, 100) + except RuntimeError as e: + print(f"ERROR {e}") + continue + t1 = time.time() + + recalls = {} + for rank in 1, 10, 100: + recall = (gt[:, :1] == I[:ds.nq, :rank]).sum() / ds.nq + recalls[rank] = recall + + print(f"time={t1 - t0:.3f} s recalls={recalls}") + perf = recalls[1] + op.add_operating_point(key, perf, t1 - t0) + op.all_experiments.append({ + "cno": cno, + "key": key, + "parameters": parameters, + "time": t1 - t0, + "recalls": recalls + }) + + if args.stats: + pickle.dump(op, open(args.stats, "wb")) + + +if __name__ == "__main__": + main() diff --git a/thirdparty/faiss/benchs/bench_ivf_fastscan.py b/thirdparty/faiss/benchs/bench_ivf_fastscan.py new file mode 100644 index 000000000..c2d1d5e83 --- /dev/null +++ b/thirdparty/faiss/benchs/bench_ivf_fastscan.py @@ -0,0 +1,112 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faiss +import time +import os +import multiprocessing as mp +import numpy as np +import matplotlib.pyplot as plt + +try: + from faiss.contrib.datasets_fb import \ + DatasetSIFT1M, DatasetDeep1B, DatasetBigANN +except ImportError: + from faiss.contrib.datasets import \ + DatasetSIFT1M, DatasetDeep1B, DatasetBigANN + + +# ds = DatasetDeep1B(10**6) +# ds = DatasetBigANN(nb_M=1) +ds = DatasetSIFT1M() + +xq = ds.get_queries() +xb = ds.get_database() +gt = ds.get_groundtruth() + +xt = ds.get_train() + +nb, d = xb.shape +nq, d = xq.shape +nt, d = xt.shape + +k = 1 +AQ = faiss.AdditiveQuantizer + + +def eval_recall(index, name): + t0 = time.time() + D, I = index.search(xq, k=k) + t = time.time() - t0 + speed = t * 1000 / nq + qps = 1000 / speed + + corrects = (gt == I).sum() + recall = corrects / nq + print( + f'\tnprobe {index.nprobe:3d}, Recall@{k}: ' + f'{recall:.6f}, speed: {speed:.6f} ms/query' + ) + + return recall, qps + + +def eval_and_plot(name, rescale_norm=True, plot=True): + index = faiss.index_factory(d, name) + index_path = f"indices/{name}.faissindex" + + if os.path.exists(index_path): + index = faiss.read_index(index_path) + else: + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + faiss.write_index(index, index_path) + + # search params + if hasattr(index, 'rescale_norm'): + index.rescale_norm = rescale_norm + name += f"(rescale_norm={rescale_norm})" + faiss.omp_set_num_threads(1) + + data = [] + print(f"======{name}") + for nprobe in 1, 2, 4, 6, 8, 12, 16, 24, 32, 48, 64, 128: + index.nprobe = nprobe + recall, qps = eval_recall(index, name) + data.append((recall, qps)) + + if plot: + data = np.array(data) + plt.plot(data[:, 0], data[:, 1], label=name) # x - recall, y - qps + + +M, nlist = 32, 1024 + +# just for warmup... 
+# eval_and_plot(f"IVF{nlist},PQ{M}x4fs", plot=False) + +# benchmark +plt.figure(figsize=(8, 6), dpi=80) + +# PQ +eval_and_plot(f"IVF{nlist},PQ{M}x4fs") +eval_and_plot(f"IVF{nlist},PQ{M}x4fsr") + +# AQ, by_residual +eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fsr_Nlsq2x4") +eval_and_plot(f"IVF{nlist},RQ{M-2}x4fsr_Nrq2x4") +eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fsr_Nlsq2x4", rescale_norm=False) +eval_and_plot(f"IVF{nlist},RQ{M-2}x4fsr_Nrq2x4", rescale_norm=False) + +# AQ, no by_residual +eval_and_plot(f"IVF{nlist},LSQ{M-2}x4fs_Nlsq2x4") +eval_and_plot(f"IVF{nlist},RQ{M-2}x4fs_Nrq2x4") + +plt.title("Indices on SIFT1M") +plt.xlabel("Recall@1") +plt.ylabel("QPS") +plt.legend(bbox_to_anchor=(1.02, 0.1), loc='upper left', borderaxespad=0) +plt.savefig("bench_ivf_fastscan.png", bbox_inches='tight') diff --git a/thirdparty/faiss/benchs/bench_ivf_fastscan_single_query.py b/thirdparty/faiss/benchs/bench_ivf_fastscan_single_query.py new file mode 100644 index 000000000..45352672e --- /dev/null +++ b/thirdparty/faiss/benchs/bench_ivf_fastscan_single_query.py @@ -0,0 +1,122 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faiss +import time +import os +import multiprocessing as mp +import numpy as np +import matplotlib.pyplot as plt + + +try: + from faiss.contrib.datasets_fb import \ + DatasetSIFT1M, DatasetDeep1B, DatasetBigANN +except ImportError: + from faiss.contrib.datasets import \ + DatasetSIFT1M, DatasetDeep1B, DatasetBigANN + +# ds = DatasetDeep1B(10**6) +ds = DatasetBigANN(nb_M=50) +# ds = DatasetSIFT1M() + +xq = ds.get_queries() +xb = ds.get_database() +gt = ds.get_groundtruth() + +xt = ds.get_train() + +nb, d = xb.shape +nq, d = xq.shape +nt, d = xt.shape + +print('the dimension is {}, {}'.format(nb, d)) + +k = 64 + + +def eval_recall(index, name, single_query=False): + t0 = time.time() + D, I = index.search(xq, k=k) + + t = time.time() - t0 + if single_query: + t0 = time.time() + for row in range(nq): + Ds, Is = index.search(xq[row:row + 1], k=k) + D[row, :] = Ds + I[row, :] = Is + t = time.time() - t0 + speed = t * 1000 / nq + qps = 1000 / speed + + corrects = (gt[:, :1] == I[:, :k]).sum() + recall = corrects / nq + print( + f'\tnprobe {index.nprobe:3d}, 1Recall@{k}: ' + f'{recall:.6f}, speed: {speed:.6f} ms/query' + ) + + return recall, qps + + +def eval_and_plot( + name, rescale_norm=True, plot=True, single_query=False, + implem=None, num_threads=1): + index = faiss.index_factory(d, name) + index_path = f"indices/{name}.faissindex" + + if os.path.exists(index_path): + index = faiss.read_index(index_path) + else: + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + faiss.write_index(index, index_path) + + # search params + if hasattr(index, 'rescale_norm'): + index.rescale_norm = rescale_norm + name += f"(rescale_norm={rescale_norm})" + if implem is not None and hasattr(index, 'implem'): + index.implem = implem + name += f"(implem={implem})" + if single_query: + name += f"(single_query={single_query})" + if num_threads > 1: + name += f"(num_threads={num_threads})" + + faiss.omp_set_num_threads(num_threads) + + data = [] + print(f"======{name}") + for nprobe in 1, 4, 8, 16, 32, 64, 128, 256: + index.nprobe = nprobe + recall, qps = eval_recall(index, name, single_query=single_query) + data.append((recall, qps)) + + if plot: + data = np.array(data) + plt.plot(data[:, 0], data[:, 1], label=name) # x - recall, y - qps + + +M, nlist 
= 64, 4096 + +# just for warmup... +# eval_and_plot(f"IVF{nlist},PQ{M}x4fs", plot=False) + +# benchmark +plt.figure(figsize=(8, 6), dpi=80) + +eval_and_plot(f"IVF{nlist},PQ{M}x4fs", num_threads=8) +eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=0, num_threads=8) +eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=14, num_threads=8) +eval_and_plot(f"IVF{nlist},PQ{M}x4fs", single_query=True, implem=15, num_threads=8) + +plt.title("Indices on Bigann50M") +plt.xlabel("1Recall@{}".format(k)) +plt.ylabel("QPS") +plt.legend(bbox_to_anchor=(1.02, 0.1), loc='upper left', borderaxespad=0) +plt.savefig("bench_ivf_fastscan.png", bbox_inches='tight') diff --git a/thirdparty/faiss/benchs/bench_ivf_selector.cpp b/thirdparty/faiss/benchs/bench_ivf_selector.cpp new file mode 100644 index 000000000..6610ce1c9 --- /dev/null +++ b/thirdparty/faiss/benchs/bench_ivf_selector.cpp @@ -0,0 +1,145 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/************************ + * This benchmark attempts to measure the runtime overhead to use an IDSelector + * over doing an unconditional sequential scan. Unfortunately the results of the + * benchmark also depend a lot on the parallel_mode and the way + * search_with_parameters works. + */ + +int main() { + using idx_t = faiss::idx_t; + int d = 64; + size_t nb = 1024 * 1024; + size_t nq = 512 * 16; + size_t k = 10; + std::vector data((nb + nq) * d); + float* xb = data.data(); + float* xq = data.data() + nb * d; + faiss::rand_smooth_vectors(nb + nq, d, data.data(), 1234); + + std::unique_ptr index; + // const char *index_key = "IVF1024,Flat"; + const char* index_key = "IVF1024,SQ8"; + printf("index_key=%s\n", index_key); + std::string stored_name = + std::string("/tmp/bench_ivf_selector_") + index_key + ".faissindex"; + + if (access(stored_name.c_str(), F_OK) != 0) { + printf("creating index\n"); + index.reset(faiss::index_factory(d, index_key)); + + double t0 = faiss::getmillisecs(); + index->train(nb, xb); + double t1 = faiss::getmillisecs(); + index->add(nb, xb); + double t2 = faiss::getmillisecs(); + printf("Write %s\n", stored_name.c_str()); + faiss::write_index(index.get(), stored_name.c_str()); + } else { + printf("Read %s\n", stored_name.c_str()); + index.reset(faiss::read_index(stored_name.c_str())); + } + faiss::IndexIVF* index_ivf = static_cast(index.get()); + index->verbose = true; + + for (int tt = 0; tt < 3; tt++) { + if (tt == 1) { + index_ivf->parallel_mode = 3; + } else { + index_ivf->parallel_mode = 0; + } + + if (tt == 2) { + printf("set single thread\n"); + omp_set_num_threads(1); + } + printf("parallel_mode=%d\n", index_ivf->parallel_mode); + + std::vector D1(nq * k); + std::vector I1(nq * k); + { + double t2 = faiss::getmillisecs(); + index->search(nq, xq, k, D1.data(), I1.data()); + double t3 = faiss::getmillisecs(); + + printf("search time, no selector: %.3f ms\n", t3 - t2); + } + + std::vector D2(nq * k); + std::vector I2(nq * k); + { + double t2 = faiss::getmillisecs(); + faiss::IVFSearchParameters params; + + faiss::ivflib::search_with_parameters( + index.get(), nq, xq, k, D2.data(), I2.data(), ¶ms); + double t3 = faiss::getmillisecs(); + printf("search time with nullptr selector: %.3f ms\n", t3 - t2); + } + FAISS_THROW_IF_NOT(I1 == I2); + FAISS_THROW_IF_NOT(D1 == 
D2); + + { + double t2 = faiss::getmillisecs(); + faiss::IVFSearchParameters params; + faiss::IDSelectorAll sel; + params.sel = &sel; + + faiss::ivflib::search_with_parameters( + index.get(), nq, xq, k, D2.data(), I2.data(), ¶ms); + double t3 = faiss::getmillisecs(); + printf("search time with selector: %.3f ms\n", t3 - t2); + } + FAISS_THROW_IF_NOT(I1 == I2); + FAISS_THROW_IF_NOT(D1 == D2); + + std::vector D3(nq * k); + std::vector I3(nq * k); + { + int nt = omp_get_max_threads(); + double t2 = faiss::getmillisecs(); + faiss::IVFSearchParameters params; + +#pragma omp parallel for if (nt > 1) + for (idx_t slice = 0; slice < nt; slice++) { + idx_t i0 = nq * slice / nt; + idx_t i1 = nq * (slice + 1) / nt; + if (i1 > i0) { + faiss::ivflib::search_with_parameters( + index.get(), + i1 - i0, + xq + i0 * d, + k, + D3.data() + i0 * k, + I3.data() + i0 * k, + ¶ms); + } + } + double t3 = faiss::getmillisecs(); + printf("search time with null selector + manual parallel: %.3f ms\n", + t3 - t2); + } + FAISS_THROW_IF_NOT(I1 == I3); + FAISS_THROW_IF_NOT(D1 == D3); + } + + return 0; +} diff --git a/thirdparty/faiss/benchs/bench_polysemous_1bn.py b/thirdparty/faiss/benchs/bench_polysemous_1bn.py index 0396b56b5..eabced595 100644 --- a/thirdparty/faiss/benchs/bench_polysemous_1bn.py +++ b/thirdparty/faiss/benchs/bench_polysemous_1bn.py @@ -9,7 +9,7 @@ import numpy as np import re import faiss -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool from datasets import ivecs_read diff --git a/thirdparty/faiss/benchs/bench_pq_transposed_centroid_table.py b/thirdparty/faiss/benchs/bench_pq_transposed_centroid_table.py new file mode 100644 index 000000000..7aed6bffb --- /dev/null +++ b/thirdparty/faiss/benchs/bench_pq_transposed_centroid_table.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import faiss +import time +import random + +import faiss.contrib.datasets + + +# copied from benchs/bench_all_ivf/bench_all_ivf.py +def unwind_index_ivf(index): + if isinstance(index, faiss.IndexPreTransform): + assert index.chain.size() == 1 + vt = index.chain.at(0) + index_ivf, vt2 = unwind_index_ivf(faiss.downcast_index(index.index)) + assert vt2 is None + return index_ivf, vt + if hasattr(faiss, "IndexRefine") and isinstance(index, faiss.IndexRefine): + return unwind_index_ivf(faiss.downcast_index(index.base_index)) + if isinstance(index, faiss.IndexIVF): + return index, None + else: + return None, None + + +def test_bigann10m(index_file, index_parameters): + ds = faiss.contrib.datasets.DatasetBigANN(nb_M=10) + + xq = ds.get_queries() + xb = ds.get_database() + gt = ds.get_groundtruth() + + nb, d = xb.shape + nq, d = xq.shape + + print("Reading index {}".format(index_file)) + index = faiss.read_index(index_file) + + ps = faiss.ParameterSpace() + ps.initialize(index) + + index_ivf, vec_transform = unwind_index_ivf(index) + + print('params regular transp_centroids regular R@1 R@10 R@100') + for index_parameter in index_parameters: + ps.set_index_parameters(index, index_parameter) + + print(index_parameter.ljust(70), end=' ') + + k = 100 + + # warmup + D, I = index.search(xq, k) + + # warmup + D, I = index.search(xq, k) + + # eval + t2_0 = time.time() + D, I = index.search(xq, k) + t2_1 = time.time() + + # eval + index_ivf.pq.sync_transposed_centroids() + t3_0 = time.time() + D, I = index.search(xq, k) + t3_1 = time.time() + + # eval + index_ivf.pq.clear_transposed_centroids() + t4_0 = time.time() + D, I = index.search(xq, k) + t4_1 = time.time() + + print(" %9.5f " % (t2_1 - t2_0), end=' ') + print(" %9.5f " % (t3_1 - t3_0), end=' ') + print(" %9.5f " % (t4_1 - t4_0), end=' ') + + for rank in 1, 10, 100: + n_ok = (I[:, :rank] == gt[:, :1]).sum() + print("%.4f" % (n_ok / float(nq)), end=' ') + print() + + +if __name__ == "__main__": + faiss.contrib.datasets.dataset_basedir = '/home/aguzhva/ANN_SIFT1B/' + + # represents OPQ32_128,IVF65536_HNSW32,PQ32 index + index_file_1 = "/home/aguzhva/ANN_SIFT1B/run_tests/bench_ivf/indexes/hnsw32/.faissindex" + + nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] + quantizer_efsearch_values = [4, 8, 16, 32, 64, 128, 256, 512] + ht_values = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 256] + + # represents OPQ32_128,IVF65536(IVF256,PQHDx4fs,RFlat),PQ32 index + index_file_2 = "/home/aguzhva/ANN_SIFT1B/run_tests/bench_ivf/indexes/pq4/.faissindex" + + quantizer_k_factor_rf_values = [1, 2, 4, 8, 16, 32, 64] + quantizer_nprobe_values = [1, 2, 4, 8, 16, 32, 64, 128] + + # test the first index + index_parameters_1 = [] + for _ in range(0, 20): + nprobe = random.choice(nprobe_values) + quantizer_efsearch = random.choice(quantizer_efsearch_values) + ht = random.choice(ht_values) + index_parameters_1.append( + "nprobe={},quantizer_efSearch={},ht={}".format( + nprobe, + quantizer_efsearch, + ht) + ) + + test_bigann10m(index_file_1, index_parameters_1) + + # test the second index + index_parameters_2 = [] + for _ in range(0, 20): + nprobe = random.choice(nprobe_values) + quantizer_k_factor_rf = random.choice(quantizer_k_factor_rf_values) + quantizer_nprobe = random.choice(quantizer_nprobe_values) + ht = random.choice(ht_values) + 
index_parameters_2.append( + "nprobe={},quantizer_k_factor_rf={},quantizer_nprobe={},ht={}".format( + nprobe, + quantizer_k_factor_rf, + quantizer_nprobe, + ht) + ) + + test_bigann10m(index_file_2, index_parameters_2) diff --git a/thirdparty/faiss/benchs/bench_quantizer.py b/thirdparty/faiss/benchs/bench_quantizer.py index 54c710ada..882f9fb2d 100644 --- a/thirdparty/faiss/benchs/bench_quantizer.py +++ b/thirdparty/faiss/benchs/bench_quantizer.py @@ -43,8 +43,16 @@ def eval_quantizer(q, xq, xb, gt, xt, variants=None): for name, val in variants: if name is not None: print(f"{name}={val}") - getattr(q, name) # make sure field exists - setattr(q, name, val) + + if isinstance(q, faiss.ProductAdditiveQuantizer): + for i in range(q.nsplits): + subq = faiss.downcast_Quantizer(q.subquantizer(i)) + getattr(subq, name) + setattr(subq, name, val) + else: + getattr(q, name) # make sure field exists + setattr(q, name, val) + eval_codec(q, xq, xb, gt) @@ -60,10 +68,12 @@ def eval_quantizer(q, xq, xb, gt, xt, variants=None): ds = DatasetSIFT1M() if len(todo) > 0: - if "x" in todo[0]: - M, nbits = todo[0].split("x") - M = int(M) - nbits = int(nbits) + if todo[0].count("x") == 1: + M, nbits = [int(x) for x in todo[0].split("x")] + del todo[0] + elif todo[0].count("x") == 2: + nsplits, Msub, nbits = [int(x) for x in todo[0].split("x")] + M = nsplits * Msub del todo[0] maxtrain = max(100 << nbits, 10**5) @@ -106,6 +116,18 @@ def eval_quantizer(q, xq, xb, gt, xt, variants=None): print("===== PQ") eval_quantizer(pq, xq2, xb2, gt, xt2) +if 'prq' in todo: + print(f"===== PRQ{nsplits}x{Msub}x{nbits}") + prq = faiss.ProductResidualQuantizer(d, nsplits, Msub, nbits) + variants = [("max_beam_size", i) for i in (1, 2, 4, 8, 16, 32)] + eval_quantizer(prq, xq, xb, gt, xt, variants=variants) + +if 'plsq' in todo: + print(f"===== PLSQ{nsplits}x{Msub}x{nbits}") + plsq = faiss.ProductLocalSearchQuantizer(d, nsplits, Msub, nbits) + variants = [("encode_ils_iters", i) for i in (2, 3, 4, 8, 16)] + eval_quantizer(plsq, xq, xb, gt, xt, variants=variants) + if 'rq' in todo: print("===== RQ") rq = faiss.ResidualQuantizer(d, M, nbits, ) @@ -131,6 +153,5 @@ def eval_quantizer(q, xq, xb, gt, xt, variants=None): if 'lsq' in todo: print("===== LSQ") lsq = faiss.LocalSearchQuantizer(d, M, nbits) - lsq.verbose = True variants = [("encode_ils_iters", i) for i in (2, 3, 4, 8, 16)] eval_quantizer(lsq, xq, xb, gt, xt, variants=variants) diff --git a/thirdparty/faiss/benchs/distributed_ondisk/README.md b/thirdparty/faiss/benchs/distributed_ondisk/README.md index 643a99a1d..22904f468 100644 --- a/thirdparty/faiss/benchs/distributed_ondisk/README.md +++ b/thirdparty/faiss/benchs/distributed_ondisk/README.md @@ -1,41 +1,38 @@ -# Distributed on-disk index for 1T-scale datasets +# Distributed on-disk index for 1T-scale datasets -This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). -All the code is in python 3 (and not compatible with Python 2). +This is code corresponding to the description in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). +All the code is in python 3 (and not compatible with Python 2). The current code uses the Deep1B dataset for demonstration purposes, but can scale to 1000x larger. -To run it, download the Deep1B dataset as explained [here](../#getting-deep1b), and edit paths to the dataset in the scripts. 
+To run it, download the Deep1B dataset as explained [here](../#getting-deep1b), and edit paths to the dataset in the scripts. -The cluster commands are written for the Slurm batch scheduling system. +The cluster commands are written for the Slurm batch scheduling system. Hopefully, changing to another type of scheduler should be quite straightforward. ## Distributed k-means -To cluster 500M vectors to 10M centroids, it is useful to have a distriubuted k-means implementation. -The distribution simply consists in splitting the training vectors across machines (servers) and have them do the assignment. +To cluster 500M vectors to 10M centroids, it is useful to have a distributed k-means implementation. +The distribution simply consists in splitting the training vectors across machines (servers) and have them do the assignment. The master/client then synthesizes the results and updates the centroids. The distributed k-means implementation here is based on 3 files: -- [`rpc.py`](rpc.py) is a very simple remote procedure call implementation based on sockets and pickle. -It exposes the methods of an object on the server side so that they can be called from the client as if the object was local. - -- [`distributed_kmeans.py`](distributed_kmeans.py) contains the k-means implementation. -The main loop of k-means is re-implemented in python but follows closely the Faiss C++ implementation, and should not be significantly less efficient. -It relies on a `DatasetAssign` object that does the assignement to centrtoids, which is the bulk of the computation. +- [`distributed_kmeans.py`](distributed_kmeans.py) contains the k-means implementation. +The main loop of k-means is re-implemented in python but follows closely the Faiss C++ implementation, and should not be significantly less efficient. +It relies on a `DatasetAssign` object that does the assignment to centroids, which is the bulk of the computation. The object can be a Faiss CPU index, a GPU index or a set of remote GPU or CPU indexes. -- [`run_on_cluster.bash`](run_on_cluster.bash) contains the shell code to run the distributed k-means on a cluster. +- [`run_on_cluster.bash`](run_on_cluster.bash) contains the shell code to run the distributed k-means on a cluster. The distributed k-means works with a Python install that contains faiss and scipy (for sparse matrices). -It clusters the training data of Deep1B, this can be changed easily to any file in fvecs, bvecs or npy format that contains the training set. -The training vectors may be too large to fit in RAM, but they are memory-mapped so that should not be a problem. +It clusters the training data of Deep1B, this can be changed easily to any file in fvecs, bvecs or npy format that contains the training set. +The training vectors may be too large to fit in RAM, but they are memory-mapped so that should not be a problem. The file is also assumed to be accessible from all server machines with eg. a distributed file system. -### Local tests +### Local tests -Edit `distibuted_kmeans.py` to point `testdata` to your local copy of the dataset. +Edit `distributed_kmeans.py` to point `testdata` to your local copy of the dataset. 
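For orientation, a minimal sketch of the call that `distributed_kmeans.py` ultimately makes when the training data fits in RAM: the pure-Python `kmeans` loop driven by a `DatasetAssign` backend, as described above. The array sizes are toy assumptions; the distributed setup only swaps `DatasetAssign` for the GPU or RPC-backed variants.

```python
import numpy as np
from faiss.contrib.clustering import DatasetAssign, kmeans

xt = np.random.rand(10000, 32).astype('float32')  # toy training set
data = DatasetAssign(xt)                          # in-RAM assignment backend
centroids = kmeans(256, data, niter=10)           # 256 centroids, 10 iterations
print(centroids.shape)                            # (256, 32)
```

The sanity checks below exercise this same loop, first against the reference C++ implementation and then with GPU and multi-server `DatasetAssign` backends.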
-Then, 4 levels of sanity check can be run: +Then, 4 levels of sanity check can be run: ```bash # reference Faiss C++ run python distributed_kmeans.py --test 0 @@ -50,69 +47,69 @@ The output should look like [This gist](https://gist.github.com/mdouze/ffa01fe66 ### Distributed sanity check -To run the distributed k-means, `distibuted_kmeans.py` has to be run both on the servers (`--server` option) and client sides (`--client` option). -Edit the top of `run_on_cluster.bash` to set the path of the data to cluster. +To run the distributed k-means, `distributed_kmeans.py` has to be run both on the servers (`--server` option) and client sides (`--client` option). +Edit the top of `run_on_cluster.bash` to set the path of the data to cluster. -Sanity checks can be run with -```bash +Sanity checks can be run with +```bash # non distributed baseline bash run_on_cluster.bash test_kmeans_0 # using all the machine's GPUs bash run_on_cluster.bash test_kmeans_1 -# distrbuted run, with one local server per GPU +# distributed run, with one local server per GPU bash run_on_cluster.bash test_kmeans_2 ``` -The test `test_kmeans_2` simulates a distributed run on a single machine by starting one server process per GPU and connecting to the servers via the rpc protocol. +The test `test_kmeans_2` simulates a distributed run on a single machine by starting one server process per GPU and connecting to the servers via the rpc protocol. The output should look like [this gist](https://gist.github.com/mdouze/5b2dc69b74579ecff04e1686a277d32e). ### Distributed run -The way the script can be distributed depends on the cluster's scheduling system. -Here we use Slurm, but it should be relatively easy to adapt to any scheduler that can allocate a set of matchines and start the same exectuable on all of them. +The way the script can be distributed depends on the cluster's scheduling system. +Here we use Slurm, but it should be relatively easy to adapt to any scheduler that can allocate a set of machines and start the same executable on all of them. -The command -``` +The command +```bash bash run_on_cluster.bash slurm_distributed_kmeans ``` -asks SLURM for 5 machines with 4 GPUs each with the `srun` command. -All 5 machines run the script with the `slurm_within_kmeans_server` option. +asks SLURM for 5 machines with 4 GPUs each with the `srun` command. +All 5 machines run the script with the `slurm_within_kmeans_server` option. They determine the number of servers and their own server id via the `SLURM_NPROCS` and `SLURM_PROCID` environment variables. All machines start `distributed_kmeans.py` in server mode for the slice of the dataset they are responsible for. -In addition, the machine #0 also starts the client. -The client knows who are the other servers via the variable `SLURM_JOB_NODELIST`. -It connects to all clients and performs the clustering. +In addition, the machine #0 also starts the client. +The client knows who are the other servers via the variable `SLURM_JOB_NODELIST`. +It connects to all clients and performs the clustering. The output should look like [this gist](https://gist.github.com/mdouze/8d25e89fb4af5093057cae0f917da6cd). ### Run used for deep1B -For the real run, we run the clustering on 50M vectors to 1M centroids. +For the real run, we run the clustering on 50M vectors to 1M centroids. This is just a matter of using as many machines / GPUs as possible in setting the output centroids with the `--out filename` option. 
Then run -``` +```bash bash run_on_cluster.bash deep1b_clustering ``` -The last lines of output read like: -``` +The last lines of output read like: +```bash Iteration 19 (898.92 s, search 875.71 s): objective=1.33601e+07 imbalance=1.303 nsplit=0 0: writing centroids to /checkpoint/matthijs/ondisk_distributed/1M_centroids.npy ``` -This means that the total training time was 899s, of which 876s were used for computation. -However, the computation includes the I/O overhead to the assignment servers. -In this implementation, the overhead of transmitting the data is non-negligible and so is the centroid computation stage. -This is due to the inefficient Python implementation and the RPC protocol that is not optimized for broadcast / gather (like MPI). +This means that the total training time was 899s, of which 876s were used for computation. +However, the computation includes the I/O overhead to the assignment servers. +In this implementation, the overhead of transmitting the data is non-negligible and so is the centroid computation stage. +This is due to the inefficient Python implementation and the RPC protocol that is not optimized for broadcast / gather (like MPI). However, it is a simple implementation that should run on most clusters. ## Making the trained index -After the centroids are obtained, an empty trained index must be constructed. -This is done by: +After the centroids are obtained, an empty trained index must be constructed. +This is done by: - applying a pre-processing stage (a random rotation) to balance the dimensions of the vectors. This can be done after clustering, the clusters are just rotated as well. @@ -120,44 +117,44 @@ This is done by: - training the 6-bit scalar quantizer used to encode the vectors -This is performed by the script [`make_trained_index.py`](make_trained_index.py). +This is performed by the script [`make_trained_index.py`](make_trained_index.py). ## Building the index by slices -We call the slices "vslisces" as they are vertical slices of the big matrix, see explanation in the wiki section [Split across datanbase partitions](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors#split-across-database-partitions). +We call the slices "vslices" as they are vertical slices of the big matrix, see explanation in the wiki section [Split across database partitions](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors#split-across-database-partitions). -The script [make_index_vslice.py](make_index_vslice.py) makes an index for a subset of the vectors of the input data and stores it as an independent index. +The script [make_index_vslice.py](make_index_vslice.py) makes an index for a subset of the vectors of the input data and stores it as an independent index. There are 200 slices of 5M vectors each for Deep1B. -It can be run in a brute-force parallel fashion, there is no constraint on ordering. -To run the script in parallel on a slurm cluster, use: -``` +It can be run in a brute-force parallel fashion, there is no constraint on ordering. +To run the script in parallel on a slurm cluster, use: +```bash bash run_on_cluster.bash make_index_vslices ``` -For a real dataset, the data would be read from a DBMS. +For a real dataset, the data would be read from a DBMS. In that case, reading the data and indexing it in parallel is worthwhile because reading is very slow. -## Splitting accross inverted lists +## Splitting across inverted lists -The 200 slices need to be merged together. 
-This is done with the script [merge_to_ondisk.py](merge_to_ondisk.py), that memory maps the 200 vertical slice indexes, extracts a subset of the inverted lists and writes them to a contiguous horizontal slice. -We slice the inverted lists into 50 horizontal slices. -This is run with -``` +The 200 slices need to be merged together. +This is done with the script [merge_to_ondisk.py](merge_to_ondisk.py), that memory maps the 200 vertical slice indexes, extracts a subset of the inverted lists and writes them to a contiguous horizontal slice. +We slice the inverted lists into 50 horizontal slices. +This is run with +```bash bash run_on_cluster.bash make_index_hslices ``` ## Querying the index -At this point the index is ready. -The horizontal slices need to be loaded in the right order and combined into an index to be usable. -This is done in the [combined_index.py](combined_index.py) script. -It provides a `CombinedIndexDeep1B` object that contains an index object that can be searched. -To test, run: -``` +At this point the index is ready. +The horizontal slices need to be loaded in the right order and combined into an index to be usable. +This is done in the [combined_index.py](combined_index.py) script. +It provides a `CombinedIndexDeep1B` object that contains an index object that can be searched. +To test, run: +```bash python combined_index.py ``` -The output should look like: -``` +The output should look like: +```bash (faiss_1.5.2) matthijs@devfair0144:~/faiss_versions/faiss_1Tcode/faiss/benchs/distributed_ondisk$ python combined_index.py reading /checkpoint/matthijs/ondisk_distributed//hslices/slice49.faissindex loading empty index /checkpoint/matthijs/ondisk_distributed/trained.faissindex @@ -168,30 +165,30 @@ nnprobe=10 1-recall@1=0.6499 t=17.67s nprobe=100 1-recall@1=0.8673 t=29.23s nprobe=1000 1-recall@1=0.9132 t=129.58s ``` -ie. searching is a lot slower than from RAM. +ie. searching is a lot slower than from RAM. ## Distributed query -To reduce the bandwidth required from the machine that does the queries, it is possible to split the search accross several search servers. +To reduce the bandwidth required from the machine that does the queries, it is possible to split the search across several search servers. This way, only the effective results are returned to the main machine. -The search client and server are implemented in [`search_server.py`](search_server.py). +The search client and server are implemented in [`search_server.py`](search_server.py). It can be used as a script to start a search server for `CombinedIndexDeep1B` or as a module to load the clients. -The search servers can be started with -``` +The search servers can be started with +```bash bash run_on_cluster.bash run_search_servers ``` -(adjust to the number of servers that can be used). +(adjust to the number of servers that can be used). -Then an example of search client is [`distributed_query_demo.py`](distributed_query_demo.py). +Then an example of search client is [`distributed_query_demo.py`](distributed_query_demo.py). It connects to the servers and assigns subsets of inverted lists to visit to each of them. -A typical output is [this gist](https://gist.github.com/mdouze/1585b9854a9a2437d71f2b2c3c05c7c5). +A typical output is [this gist](https://gist.github.com/mdouze/1585b9854a9a2437d71f2b2c3c05c7c5). The number in MiB indicates the amount of data that is read from disk to perform the search. 
In this case, the scale of the dataset is too small for the distributed search to have much impact, but on datasets > 10x larger, the difference becomes more significant. ## Conclusion -This code contains the core components to make an index that scales up to 1T vectors. -There are a few simplifications wrt. the index that was effectively used in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). +This code contains the core components to make an index that scales up to 1T vectors. +There are a few simplifications wrt. the index that was effectively used in [Indexing 1T vectors](https://github.com/facebookresearch/faiss/wiki/Indexing-1T-vectors). diff --git a/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py b/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py index a37050788..a68359fe2 100755 --- a/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py +++ b/thirdparty/faiss/benchs/distributed_ondisk/distributed_kmeans.py @@ -9,79 +9,19 @@ Simple distributed kmeans implementation Relies on an abstraction for the training matrix, that can be sharded over several machines. """ - -import faiss -import time -import numpy as np +import os import sys -import pdb import argparse -from scipy.sparse import csc_matrix - -from multiprocessing.dummy import Pool as ThreadPool - -import rpc - - - - -class DatasetAssign: - """Wrapper for a matrix that offers a function to assign the vectors - to centroids. All other implementations offer the same interface""" - - def __init__(self, x): - self.x = np.ascontiguousarray(x, dtype='float32') - - def count(self): - return self.x.shape[0] - - def dim(self): - return self.x.shape[1] - - def get_subset(self, indices): - return self.x[indices] - - def perform_search(self, centroids): - index = faiss.IndexFlatL2(self.x.shape[1]) - index.add(centroids) - return index.search(self.x, 1) - - def assign_to(self, centroids, weights=None): - D, I = self.perform_search(centroids) - - I = I.ravel() - D = D.ravel() - n = len(self.x) - if weights is None: - weights = np.ones(n, dtype='float32') - nc = len(centroids) - m = csc_matrix((weights, I, np.arange(n + 1)), - shape=(nc, n)) - sum_per_centroid = m * self.x - - return I, D, sum_per_centroid - - -class DatasetAssignGPU(DatasetAssign): - """ GPU version of the previous """ - - def __init__(self, x, gpu_id, verbose=False): - DatasetAssign.__init__(self, x) - index = faiss.IndexFlatL2(x.shape[1]) - if gpu_id >= 0: - self.index = faiss.index_cpu_to_gpu( - faiss.StandardGpuResources(), - gpu_id, index) - else: - # -1 -> assign to all GPUs - self.index = faiss.index_cpu_to_all_gpus(index) +import numpy as np +import faiss - def perform_search(self, centroids): - self.index.reset() - self.index.add(centroids) - return self.index.search(self.x, 1) +from multiprocessing.pool import ThreadPool +from faiss.contrib import rpc +from faiss.contrib.datasets import SyntheticDataset +from faiss.contrib.vecs_io import bvecs_mmap, fvecs_mmap +from faiss.contrib.clustering import DatasetAssign, DatasetAssignGPU, kmeans class DatasetAssignDispatch: @@ -136,109 +76,6 @@ def assign_to(self, centroids, weights=None): return np.hstack(I), np.hstack(D), sum_per_centroid -def imbalance_factor(k , assign): - return faiss.imbalance_factor(len(assign), k, faiss.swig_ptr(assign)) - - -def reassign_centroids(hassign, centroids, rs=None): - """ reassign centroids when some of them collapse """ - if rs is None: - rs = np.random - k, d = centroids.shape - nsplit = 0 - 
empty_cents = np.where(hassign == 0)[0] - - if empty_cents.size == 0: - return 0 - - fac = np.ones(d) - fac[::2] += 1 / 1024. - fac[1::2] -= 1 / 1024. - - # this is a single pass unless there are more than k/2 - # empty centroids - while empty_cents.size > 0: - # choose which centroids to split - probas = hassign.astype('float') - 1 - probas[probas < 0] = 0 - probas /= probas.sum() - nnz = (probas > 0).sum() - - nreplace = min(nnz, empty_cents.size) - cjs = rs.choice(k, size=nreplace, p=probas) - - for ci, cj in zip(empty_cents[:nreplace], cjs): - - c = centroids[cj] - centroids[ci] = c * fac - centroids[cj] = c / fac - - hassign[ci] = hassign[cj] // 2 - hassign[cj] -= hassign[ci] - nsplit += 1 - - empty_cents = empty_cents[nreplace:] - - return nsplit - - -def kmeans(k, data, niter=25, seed=1234, checkpoint=None): - """Pure python kmeans implementation. Follows the Faiss C++ version - quite closely, but takes a DatasetAssign instead of a training data - matrix. Also redo is not implemented. """ - n, d = data.count(), data.dim() - - print(("Clustering %d points in %dD to %d clusters, " + - "%d iterations seed %d") % (n, d, k, niter, seed)) - - rs = np.random.RandomState(seed) - print("preproc...") - t0 = time.time() - # initialization - perm = rs.choice(n, size=k, replace=False) - centroids = data.get_subset(perm) - - print(" done") - t_search_tot = 0 - obj = [] - for i in range(niter): - t0s = time.time() - - print('assigning', end='\r', flush=True) - assign, D, sums = data.assign_to(centroids) - - print('compute centroids', end='\r', flush=True) - - # pdb.set_trace() - - t_search_tot += time.time() - t0s; - - err = D.sum() - obj.append(err) - - hassign = np.bincount(assign, minlength=k) - - fac = hassign.reshape(-1, 1).astype('float32') - fac[fac == 0] = 1 # quiet warning - - centroids = sums / fac - - nsplit = reassign_centroids(hassign, centroids, rs) - - print((" Iteration %d (%.2f s, search %.2f s): " - "objective=%g imbalance=%.3f nsplit=%d") % ( - i, (time.time() - t0), t_search_tot, - err, imbalance_factor (k, assign), - nsplit) - ) - - if checkpoint is not None: - print('storing centroids in', checkpoint) - np.save(checkpoint, centroids) - - return centroids - - class AssignServer(rpc.Server): """ Assign version that can be exposed via RPC """ @@ -251,25 +88,17 @@ def __getattr__(self, f): -def bvecs_mmap(fname): - x = np.memmap(fname, dtype='uint8', mode='r') - d = x[:4].view('int32')[0] - return x.reshape(-1, d + 4)[:, 4:] - - -def ivecs_mmap(fname): - a = np.memmap(fname, dtype='int32', mode='r') - d = a[0] - return a.reshape(-1, d + 1)[:, 1:] - -def fvecs_mmap(fname): - return ivecs_mmap(fname).view('float32') - def do_test(todo): + testdata = '/datasets01_101/simsearch/041218/bigann/bigann_learn.bvecs' - x = bvecs_mmap(testdata) + if os.path.exists(testdata): + x = bvecs_mmap(testdata) + else: + print("using synthetic dataset") + ds = SyntheticDataset(128, 100000, 0, 0) + x = ds.get_train() # bad distribution to stress-test split code xx = x[:100000].copy() diff --git a/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py b/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py index ca58425b2..4d26aa202 100644 --- a/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py +++ b/thirdparty/faiss/benchs/distributed_ondisk/make_index_vslice.py @@ -8,7 +8,7 @@ import numpy as np import faiss import argparse -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool def ivecs_mmap(fname): a = np.memmap(fname, 
dtype='int32', mode='r') diff --git a/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py b/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py index 5c8f3ace9..bb5750fdf 100644 --- a/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py +++ b/thirdparty/faiss/benchs/distributed_ondisk/merge_to_ondisk.py @@ -6,7 +6,7 @@ import os import faiss import argparse -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool if __name__ == '__main__': diff --git a/thirdparty/faiss/benchs/distributed_ondisk/rpc.py b/thirdparty/faiss/benchs/distributed_ondisk/rpc.py deleted file mode 100755 index caa4b3572..000000000 --- a/thirdparty/faiss/benchs/distributed_ondisk/rpc.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Simplistic RPC implementation. -Exposes all functions of a Server object. - -Uses pickle for serialization and the socket interface. -""" - -import os,pdb,pickle,time,errno,sys,_thread,traceback,socket,threading,gc - - -# default -PORT=12032 - - -######################################################################### -# simple I/O functions - - - -def inline_send_handle(f, conn): - st = os.fstat(f.fileno()) - size = st.st_size - pickle.dump(size, conn) - conn.write(f.read(size)) - -def inline_send_string(s, conn): - size = len(s) - pickle.dump(size, conn) - conn.write(s) - - -class FileSock: - " wraps a socket so that it is usable by pickle/cPickle " - - def __init__(self,sock): - self.sock = sock - self.nr=0 - - def write(self, buf): - # print("sending %d bytes"%len(buf)) - #self.sock.sendall(buf) - # print("...done") - bs = 512 * 1024 - ns = 0 - while ns < len(buf): - sent = self.sock.send(buf[ns:ns + bs]) - ns += sent - - - def read(self,bs=512*1024): - #if self.nr==10000: pdb.set_trace() - self.nr+=1 - # print("read bs=%d"%bs) - b = [] - nb = 0 - while len(b)=3.23.1 + - make # [not win] + - mkl-devel =2023 # [x86_64] + host: + - mkl =2023 # [x86_64] + - openblas # [not x86_64] + - cudatoolkit {{ cudatoolkit }} + - libraft =23.08 + run: + - mkl =2023 # [x86_64] + - openblas # [not x86_64] + - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} + - libraft =23.08 + test: + requires: + - conda-build + commands: + - test -f $PREFIX/lib/libfaiss$SHLIB_EXT # [not win] + - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [x86_64 and not win] + - conda inspect linkages -p $PREFIX $PKG_NAME # [not win] + - conda inspect objects -p $PREFIX $PKG_NAME # [osx] + + - name: faiss-gpu-raft + script: build-pkg.sh # [x86_64 and not win and not osx] + script: build-pkg-osx.sh # [x86_64 and osx] + script: build-pkg-arm64.sh # [not x86_64] + script: build-pkg.bat # [win] + build: + string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}" + requirements: + build: + - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] + - swig + - cmake >=3.23.1 + - make # [not win] + host: + - python {{ python }} + - numpy >=1.19,<2 + - {{ pin_subpackage('libfaiss', exact=True) }} + run: + - python {{ python }} + - numpy >=1.19,<2 + - {{ pin_subpackage('libfaiss', exact=True) }} + test: + requires: + - numpy + - scipy + - pytorch + commands: + - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*" + - cp tests/common_faiss_tests.py 
faiss/gpu/test + - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "torch_*" + - sh test_cpu_dispatch.sh # [linux64] + files: + - test_cpu_dispatch.sh # [linux64] + source_files: + - tests/ + - faiss/gpu/test/ diff --git a/thirdparty/faiss/conda/faiss-gpu/install-cmake.sh b/thirdparty/faiss/conda/faiss-gpu-raft/test_cpu_dispatch.sh similarity index 54% rename from thirdparty/faiss/conda/faiss-gpu/install-cmake.sh rename to thirdparty/faiss/conda/faiss-gpu-raft/test_cpu_dispatch.sh index 88bd9b909..b2891919d 100755 --- a/thirdparty/faiss/conda/faiss-gpu/install-cmake.sh +++ b/thirdparty/faiss/conda/faiss-gpu-raft/test_cpu_dispatch.sh @@ -6,5 +6,5 @@ set -e -wget -O - https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz | tar xzf - -cp -R cmake-3.17.1-Linux-x86_64/* $PREFIX +FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss.so +LD_DEBUG=libs python -c "import faiss" 2>&1 | grep libfaiss_avx2.so diff --git a/thirdparty/faiss/conda/faiss-gpu/build-lib.sh b/thirdparty/faiss/conda/faiss-gpu/build-lib.sh index 31e52fe7c..6b6b1c28d 100755 --- a/thirdparty/faiss/conda/faiss-gpu/build-lib.sh +++ b/thirdparty/faiss/conda/faiss-gpu/build-lib.sh @@ -13,13 +13,14 @@ cmake -B _build \ -DBUILD_TESTING=OFF \ -DFAISS_OPT_LEVEL=avx2 \ -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_RAFT=OFF \ -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHS}" \ -DFAISS_ENABLE_PYTHON=OFF \ -DBLA_VENDOR=Intel10_64lp \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . -make -C _build -j $CPU_COUNT faiss faiss_avx2 +make -C _build -j$(nproc) faiss faiss_avx2 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/thirdparty/faiss/conda/faiss-gpu/build-pkg.sh b/thirdparty/faiss/conda/faiss-gpu/build-pkg.sh index 2233e88a8..3a4151192 100755 --- a/thirdparty/faiss/conda/faiss-gpu/build-pkg.sh +++ b/thirdparty/faiss/conda/faiss-gpu/build-pkg.sh @@ -12,11 +12,12 @@ cmake -B _build_python_${PY_VER} \ -Dfaiss_ROOT=_libfaiss_stage/ \ -DFAISS_OPT_LEVEL=avx2 \ -DFAISS_ENABLE_GPU=ON \ + -DFAISS_ENABLE_RAFT=OFF \ -DCMAKE_BUILD_TYPE=Release \ -DPython_EXECUTABLE=$PYTHON \ faiss/python -make -C _build_python_${PY_VER} -j $CPU_COUNT swigfaiss swigfaiss_avx2 +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 # Build actual python module. 
cd _build_python_${PY_VER}/ diff --git a/thirdparty/faiss/conda/faiss-gpu/meta.yaml b/thirdparty/faiss/conda/faiss-gpu/meta.yaml index 5e40b36db..fcfd3b4bd 100644 --- a/thirdparty/faiss/conda/faiss-gpu/meta.yaml +++ b/thirdparty/faiss/conda/faiss-gpu/meta.yaml @@ -26,7 +26,10 @@ source: outputs: - name: libfaiss - script: build-lib.sh + script: build-lib.sh # [x86_64 and not win and not osx] + script: build-lib-osx.sh # [x86_64 and osx] + script: build-lib-arm64.sh # [not x86_64] + script: build-lib.bat # [win] build: string: "h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}" run_exports: @@ -36,40 +39,49 @@ outputs: requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.18 + - cmake >=3.23.1 - make # [not win] + - mkl-devel =2023 # [x86_64] host: - - mkl =2018 + - mkl =2023 # [x86_64] + - openblas # [not x86_64] - cudatoolkit {{ cudatoolkit }} run: - - mkl >=2018 # [not win] - - mkl >=2018,<2021 # [win] + - mkl =2023 # [x86_64] + - openblas # [not x86_64] - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} test: + requires: + - conda-build commands: - - test -f $PREFIX/lib/libfaiss.so # [linux] - - test -f $PREFIX/lib/libfaiss.dylib # [osx] + - test -f $PREFIX/lib/libfaiss$SHLIB_EXT # [not win] + - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [x86_64 and not win] - conda inspect linkages -p $PREFIX $PKG_NAME # [not win] - conda inspect objects -p $PREFIX $PKG_NAME # [osx] - name: faiss-gpu - script: build-pkg.sh + script: build-pkg.sh # [x86_64 and not win and not osx] + script: build-pkg-osx.sh # [x86_64 and osx] + script: build-pkg-arm64.sh # [not x86_64] + script: build-pkg.bat # [win] build: string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}" requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.17 + - cmake >=3.23.1 - make # [not win] host: - python {{ python }} - - numpy =1.11 + - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: - python {{ python }} - - numpy >=1.11,<2 + - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} test: requires: @@ -77,12 +89,14 @@ outputs: - scipy - pytorch commands: - - python -m unittest discover tests/ + - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*" - cp tests/common_faiss_tests.py faiss/gpu/test - - python -m unittest discover faiss/gpu/test/ - - sh test_cpu_dispatch.sh # [linux] + - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "test_*" + - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "torch_*" + - sh test_cpu_dispatch.sh # [linux64] files: - - test_cpu_dispatch.sh # [linux] + - test_cpu_dispatch.sh # [linux64] source_files: - tests/ - faiss/gpu/test/ diff --git a/thirdparty/faiss/conda/faiss/build-lib-arm64.sh b/thirdparty/faiss/conda/faiss/build-lib-arm64.sh new file mode 100755 index 000000000..983e0c613 --- /dev/null +++ b/thirdparty/faiss/conda/faiss/build-lib-arm64.sh @@ -0,0 +1,22 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -e + + +# Build libfaiss.so +cmake -B _build \ + -DBUILD_SHARED_LIBS=ON \ + -DBUILD_TESTING=OFF \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_BUILD_TYPE=Release . 
+ +make -C _build -j$(nproc) faiss + +cmake --install _build --prefix $PREFIX +cmake --install _build --prefix _libfaiss_stage/ diff --git a/thirdparty/faiss/conda/faiss/build-lib-osx.sh b/thirdparty/faiss/conda/faiss/build-lib-osx.sh new file mode 100755 index 000000000..a30de2d00 --- /dev/null +++ b/thirdparty/faiss/conda/faiss/build-lib-osx.sh @@ -0,0 +1,27 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -e + + +# Build libfaiss.so/libfaiss_avx2.so. +cmake -B _build \ + -DBUILD_SHARED_LIBS=ON \ + -DBUILD_TESTING=OFF \ + -DFAISS_OPT_LEVEL=avx2 \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_PYTHON=OFF \ + -DBLA_VENDOR=Intel10_64lp \ + -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ + -DOpenMP_CXX_LIB_NAMES=libiomp5 \ + -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DCMAKE_BUILD_TYPE=Release . + +make -C _build -j$(nproc) faiss faiss_avx2 + +cmake --install _build --prefix $PREFIX +cmake --install _build --prefix _libfaiss_stage/ diff --git a/thirdparty/faiss/conda/faiss/build-lib.sh b/thirdparty/faiss/conda/faiss/build-lib.sh index 23edeefb9..8aed84ba4 100755 --- a/thirdparty/faiss/conda/faiss/build-lib.sh +++ b/thirdparty/faiss/conda/faiss/build-lib.sh @@ -18,7 +18,7 @@ cmake -B _build \ -DCMAKE_INSTALL_LIBDIR=lib \ -DCMAKE_BUILD_TYPE=Release . -make -C _build -j $CPU_COUNT faiss faiss_avx2 +make -C _build -j$(nproc) faiss faiss_avx2 cmake --install _build --prefix $PREFIX cmake --install _build --prefix _libfaiss_stage/ diff --git a/thirdparty/faiss/conda/faiss/build-pkg-arm64.sh b/thirdparty/faiss/conda/faiss/build-pkg-arm64.sh new file mode 100755 index 000000000..c63380ab0 --- /dev/null +++ b/thirdparty/faiss/conda/faiss/build-pkg-arm64.sh @@ -0,0 +1,22 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -e + + +# Build swigfaiss.so/swigfaiss_avx2.so. +cmake -B _build_python_${PY_VER} \ + -Dfaiss_ROOT=_libfaiss_stage/ \ + -DFAISS_ENABLE_GPU=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DPython_EXECUTABLE=$PYTHON \ + faiss/python + +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss + +# Build actual python module. +cd _build_python_${PY_VER}/ +$PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX diff --git a/thirdparty/faiss/conda/faiss/build-pkg-osx.sh b/thirdparty/faiss/conda/faiss/build-pkg-osx.sh new file mode 100755 index 000000000..15016face --- /dev/null +++ b/thirdparty/faiss/conda/faiss/build-pkg-osx.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +set -e + + +# Build swigfaiss.so/swigfaiss_avx2.so. +cmake -B _build_python_${PY_VER} \ + -Dfaiss_ROOT=_libfaiss_stage/ \ + -DFAISS_OPT_LEVEL=avx2 \ + -DFAISS_ENABLE_GPU=OFF \ + -DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \ + -DOpenMP_CXX_LIB_NAMES=libiomp5 \ + -DOpenMP_libiomp5_LIBRARY=$PREFIX/lib/libiomp5.dylib \ + -DCMAKE_BUILD_TYPE=Release \ + -DPython_EXECUTABLE=$PYTHON \ + faiss/python + +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 + +# Build actual python module. 
+cd _build_python_${PY_VER}/ +$PYTHON setup.py install --single-version-externally-managed --record=record.txt --prefix=$PREFIX diff --git a/thirdparty/faiss/conda/faiss/build-pkg.sh b/thirdparty/faiss/conda/faiss/build-pkg.sh index f659da154..005aec2fc 100755 --- a/thirdparty/faiss/conda/faiss/build-pkg.sh +++ b/thirdparty/faiss/conda/faiss/build-pkg.sh @@ -16,7 +16,7 @@ cmake -B _build_python_${PY_VER} \ -DPython_EXECUTABLE=$PYTHON \ faiss/python -make -C _build_python_${PY_VER} -j $CPU_COUNT swigfaiss swigfaiss_avx2 +make -C _build_python_${PY_VER} -j$(nproc) swigfaiss swigfaiss_avx2 # Build actual python module. cd _build_python_${PY_VER}/ diff --git a/thirdparty/faiss/conda/faiss/install-cmake.sh b/thirdparty/faiss/conda/faiss/install-cmake.sh deleted file mode 100755 index c92b8d14e..000000000 --- a/thirdparty/faiss/conda/faiss/install-cmake.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh# -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -set -e - -wget -O - https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.tar.gz | tar xzf - -cp -R cmake-3.17.1-Linux-x86_64/* $PREFIX diff --git a/thirdparty/faiss/conda/faiss/meta.yaml b/thirdparty/faiss/conda/faiss/meta.yaml index 30a78df36..a0431a404 100644 --- a/thirdparty/faiss/conda/faiss/meta.yaml +++ b/thirdparty/faiss/conda/faiss/meta.yaml @@ -26,7 +26,9 @@ source: outputs: - name: libfaiss - script: build-lib.sh # [not win] + script: build-lib.sh # [x86_64 and not win and not osx] + script: build-lib-osx.sh # [x86_64 and osx] + script: build-lib-arm64.sh # [not x86_64] script: build-lib.bat # [win] build: string: "h{{ PKG_HASH }}_{{ number }}_cpu{{ suffix }}" @@ -35,39 +37,47 @@ outputs: requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.17 + - cmake >=3.23.1 - make # [not win] + - mkl-devel =2023 # [x86_64] host: - - mkl =2018 + - mkl =2023 # [x86_64] + - openblas # [not x86_64] run: - - mkl >=2018 # [not win] - - mkl >=2018,<2021 # [win] + - mkl =2023 # [x86_64] + - openblas # [not x86_64] test: + requires: + - conda-build commands: - test -f $PREFIX/lib/libfaiss$SHLIB_EXT # [not win] - - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [not win] + - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT # [x86_64 and not win] - conda inspect linkages -p $PREFIX $PKG_NAME # [not win] - conda inspect objects -p $PREFIX $PKG_NAME # [osx] - name: faiss-cpu - script: build-pkg.sh # [not win] + script: build-pkg.sh # [x86_64 and not win and not osx] + script: build-pkg-osx.sh # [x86_64 and osx] + script: build-pkg-arm64.sh # [not x86_64] script: build-pkg.bat # [win] build: string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cpu{{ suffix }}" requirements: build: - {{ compiler('cxx') }} + - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.17 + - cmake >=3.23.1 - make # [not win] host: - python {{ python }} - - numpy =1.11 + - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: - python {{ python }} - - numpy >=1.11,<2 + - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} test: requires: @@ -75,10 +85,10 @@ outputs: - scipy - pytorch commands: - - python -X faulthandler -m unittest discover -v -s tests -p "test_*" - - python -X faulthandler -m unittest discover -v -s tests -p "torch_*" - - sh test_cpu_dispatch.sh # [linux] + - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*" + - 
python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*" + - sh test_cpu_dispatch.sh # [linux64] files: - - test_cpu_dispatch.sh # [linux] + - test_cpu_dispatch.sh # [linux64] source_files: - tests/ diff --git a/thirdparty/faiss/contrib/README.md b/thirdparty/faiss/contrib/README.md index a31f85a47..f2b7d0f84 100644 --- a/thirdparty/faiss/contrib/README.md +++ b/thirdparty/faiss/contrib/README.md @@ -19,7 +19,7 @@ A very simple Remote Procedure Call library, where function parameters and resul ### client_server.py The server handles requests to a Faiss index. The client calls the remote index. -This is mainly to shard datasets over several machines, see [Distributd index](https://github.com/facebookresearch/faiss/wiki/Indexes-that-do-not-fit-in-RAM#distributed-index) +This is mainly to shard datasets over several machines, see [Distributed index](https://github.com/facebookresearch/faiss/wiki/Indexes-that-do-not-fit-in-RAM#distributed-index) ### ondisk.py @@ -52,7 +52,7 @@ A few functions to override the coarse quantizer in IVF, providing additional fl (may require h5py) -Defintion of how to access data for some standard datsets. +Definition of how to access data for some standard datasets. ### factory_tools.py @@ -61,3 +61,16 @@ Functions related to factory strings. ### evaluation.py A few non-trivial evaluation functions for search results + +### clustering.py + +Contains: + +- a Python implementation of kmeans, that can be used for special datatypes (eg. sparse matrices). + +- a 2-level clustering routine and a function that can apply it to train an IndexIVF + +### big_batch_search.py + +Search IVF indexes with one centroid after another. Useful for large +databases that do not fit in RAM *and* a large number of queries. diff --git a/thirdparty/faiss/contrib/big_batch_search.py b/thirdparty/faiss/contrib/big_batch_search.py new file mode 100644 index 000000000..6b0fd36e9 --- /dev/null +++ b/thirdparty/faiss/contrib/big_batch_search.py @@ -0,0 +1,508 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
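A minimal usage sketch of `big_batch_search`, the entry point defined at the end of this module: it behaves like a regular IVF search (returning distances and labels) but batches the queries per inverted list, as described in the contrib README above. The index construction and sizes below are toy assumptions; IVFFlat, IVFPQ and IVFScalarQuantizer indexes are supported.

```python
import numpy as np
import faiss
from faiss.contrib.big_batch_search import big_batch_search

d, nb, nq, k = 32, 100000, 256, 10
xb = np.random.rand(nb, d).astype('float32')
xq = np.random.rand(nq, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 64)  # 64 inverted lists
index.train(xb)
index.add(xb)
index.nprobe = 8

# decode each visited inverted list once and run knn on the per-list query batch
D, I = big_batch_search(index, xq, k, method="knn_function", verbose=1)
```

With `threaded=1` or higher, bucket preparation overlaps with the distance computation, as explained in the `big_batch_search` docstring further down.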
+ +import time +import pickle +import os +from multiprocessing.pool import ThreadPool +import threading +import _thread +from queue import Queue +import traceback +import datetime + +import numpy as np +import faiss + +from faiss.contrib.inspect_tools import get_invlist + + +class BigBatchSearcher: + """ + Object that manages all the data related to the computation + except the actual within-bucket matching and the organization of the + computation (parallel or not) + """ + + def __init__( + self, + index, xq, k, + verbose=0, + use_float16=False): + + # verbosity + self.verbose = verbose + self.tictoc = [] + + self.xq = xq + self.index = index + self.use_float16 = use_float16 + keep_max = faiss.is_similarity_metric(index.metric_type) + self.rh = faiss.ResultHeap(len(xq), k, keep_max=keep_max) + self.t_accu = [0] * 5 + self.t_display = self.t0 = time.time() + + def start_t_accu(self): + self.t_accu_t0 = time.time() + + def stop_t_accu(self, n): + self.t_accu[n] += time.time() - self.t_accu_t0 + + def tic(self, name): + self.tictoc = (name, time.time()) + if self.verbose > 0: + print(name, end="\r", flush=True) + + def toc(self): + name, t0 = self.tictoc + dt = time.time() - t0 + if self.verbose > 0: + print(f"{name}: {dt:.3f} s") + return dt + + def report(self, l): + if self.verbose == 1 or ( + self.verbose == 2 and ( + l > 1000 and time.time() < self.t_display + 1.0 + ) + ): + return + t = time.time() - self.t0 + print( + f"[{t:.1f} s] list {l}/{self.index.nlist} " + f"times prep q {self.t_accu[0]:.3f} prep b {self.t_accu[1]:.3f} " + f"comp {self.t_accu[2]:.3f} res {self.t_accu[3]:.3f} " + f"wait {self.t_accu[4]:.3f} " + f"eta {datetime.timedelta(seconds=t*self.index.nlist/(l+1)-t)} " + f"mem {faiss.get_mem_usage_kb()}", + end="\r" if self.verbose <= 2 else "\n", + flush=True, + ) + self.t_display = time.time() + + def coarse_quantization(self): + self.tic("coarse quantization") + bs = 65536 + nq = len(self.xq) + q_assign = np.empty((nq, self.index.nprobe), dtype='int32') + for i0 in range(0, nq, bs): + i1 = min(nq, i0 + bs) + q_dis_i, q_assign_i = self.index.quantizer.search( + self.xq[i0:i1], self.index.nprobe) + # q_dis[i0:i1] = q_dis_i + q_assign[i0:i1] = q_assign_i + self.toc() + self.q_assign = q_assign + + def reorder_assign(self): + self.tic("bucket sort") + q_assign = self.q_assign + q_assign += 1 # move -1 -> 0 + self.bucket_lims = faiss.matrix_bucket_sort_inplace( + self.q_assign, nbucket=self.index.nlist + 1, nt=16) + self.query_ids = self.q_assign.ravel() + if self.verbose > 0: + print(' number of -1s:', self.bucket_lims[1]) + self.bucket_lims = self.bucket_lims[1:] # shift back to ignore -1s + del self.q_assign # inplace so let's forget about the old version... 
+ self.toc() + + def prepare_bucket(self, l): + """ prepare the queries and database items for bucket l""" + t0 = time.time() + index = self.index + # prepare queries + i0, i1 = self.bucket_lims[l], self.bucket_lims[l + 1] + q_subset = self.query_ids[i0:i1] + xq_l = self.xq[q_subset] + if self.by_residual: + xq_l = xq_l - index.quantizer.reconstruct(l) + t1 = time.time() + # prepare database side + list_ids, xb_l = get_invlist(index.invlists, l) + + if self.decode_func is None: + xb_l = xb_l.ravel() + else: + xb_l = self.decode_func(xb_l) + + if self.use_float16: + xb_l = xb_l.astype('float16') + xq_l = xq_l.astype('float16') + + t2 = time.time() + self.t_accu[0] += t1 - t0 + self.t_accu[1] += t2 - t1 + return q_subset, xq_l, list_ids, xb_l + + def add_results_to_heap(self, q_subset, D, list_ids, I): + """add the bucket results to the heap structure""" + if D is None: + return + t0 = time.time() + if I is None: + I = list_ids + else: + I = list_ids[I] + self.rh.add_result_subset(q_subset, D, I) + self.t_accu[3] += time.time() - t0 + + def sizes_in_checkpoint(self): + return (self.xq.shape, self.index.nprobe, self.index.nlist) + + def write_checkpoint(self, fname, completed): + # write to temp file then move to final file + tmpname = fname + ".tmp" + with open(tmpname, "wb") as f: + pickle.dump( + { + "sizes": self.sizes_in_checkpoint(), + "completed": completed, + "rh": (self.rh.D, self.rh.I), + }, f, -1) + os.replace(tmpname, fname) + + def read_checkpoint(self, fname): + with open(fname, "rb") as f: + ckp = pickle.load(f) + assert ckp["sizes"] == self.sizes_in_checkpoint() + self.rh.D[:] = ckp["rh"][0] + self.rh.I[:] = ckp["rh"][1] + return ckp["completed"] + + +class BlockComputer: + """ computation within one bucket """ + + def __init__( + self, + index, + method="knn_function", + pairwise_distances=faiss.pairwise_distances, + knn=faiss.knn): + + self.index = index + if index.__class__ == faiss.IndexIVFFlat: + index_help = faiss.IndexFlat(index.d, index.metric_type) + decode_func = lambda x: x.view("float32") + by_residual = False + elif index.__class__ == faiss.IndexIVFPQ: + index_help = faiss.IndexPQ( + index.d, index.pq.M, index.pq.nbits, index.metric_type) + index_help.pq = index.pq + decode_func = index_help.pq.decode + index_help.is_trained = True + by_residual = index.by_residual + elif index.__class__ == faiss.IndexIVFScalarQuantizer: + index_help = faiss.IndexScalarQuantizer( + index.d, index.sq.qtype, index.metric_type) + index_help.sq = index.sq + decode_func = index_help.sq.decode + index_help.is_trained = True + by_residual = index.by_residual + else: + raise RuntimeError(f"index type {index.__class__} not supported") + self.index_help = index_help + self.decode_func = None if method == "index" else decode_func + self.by_residual = by_residual + self.method = method + self.pairwise_distances = pairwise_distances + self.knn = knn + + def block_search(self, xq_l, xb_l, list_ids, k, **extra_args): + metric_type = self.index.metric_type + if xq_l.size == 0 or xb_l.size == 0: + D = I = None + elif self.method == "index": + faiss.copy_array_to_vector(xb_l, self.index_help.codes) + self.index_help.ntotal = len(list_ids) + D, I = self.index_help.search(xq_l, k) + elif self.method == "pairwise_distances": + # TODO implement blockwise to avoid mem blowup + D = self.pairwise_distances(xq_l, xb_l, metric=metric_type) + I = None + elif self.method == "knn_function": + D, I = self.knn(xq_l, xb_l, k, metric=metric_type, **extra_args) + + return D, I + + +def big_batch_search( + index, 
xq, k, + method="knn_function", + pairwise_distances=faiss.pairwise_distances, + knn=faiss.knn, + verbose=0, + threaded=0, + use_float16=False, + prefetch_threads=1, + computation_threads=1, + q_assign=None, + checkpoint=None, + checkpoint_freq=7200, + start_list=0, + end_list=None, + crash_at=-1 + ): + """ + Search queries xq in the IVF index, with a search function that collects + batches of query vectors per inverted list. This can be faster than the + regular search indexes. + Supports IVFFlat, IVFPQ and IVFScalarQuantizer. + + Supports three computation methods: + method = "index": + build a flat index and populate it separately for each index + method = "pairwise_distances": + decompress codes and compute all pairwise distances for the queries + and index and add result to heap + method = "knn_function": + decompress codes and compute knn results for the queries + + threaded=0: sequential execution + threaded=1: prefetch next bucket while computing the current one + threaded=2: prefetch prefetch_threads buckets at a time. + + compute_threads>1: the knn function will get an additional thread_no that + tells which worker should handle this. + + In threaded mode, the computation is tiled with the bucket perparation and + the writeback of results (useful to maximize GPU utilization). + + use_float16: convert all matrices to float16 (faster for GPU gemm) + + q_assign: override coarse assignment, should be a matrix of size nq * nprobe + + checkpointing (only for threaded > 1): + checkpoint: file where the checkpoints are stored + checkpoint_freq: when to perform checkpoinging. Should be a multiple of threaded + + start_list, end_list: process only a subset of invlists + """ + nprobe = index.nprobe + + assert method in ("index", "pairwise_distances", "knn_function") + + mem_queries = xq.nbytes + mem_assign = len(xq) * nprobe * np.dtype('int32').itemsize + mem_res = len(xq) * k * ( + np.dtype('int64').itemsize + + np.dtype('float32').itemsize + ) + mem_tot = mem_queries + mem_assign + mem_res + if verbose > 0: + print( + f"memory: queries {mem_queries} assign {mem_assign} " + f"result {mem_res} total {mem_tot} = {mem_tot / (1<<30):.3f} GiB" + ) + + bbs = BigBatchSearcher( + index, xq, k, + verbose=verbose, + use_float16=use_float16 + ) + + comp = BlockComputer( + index, + method=method, + pairwise_distances=pairwise_distances, + knn=knn + ) + + bbs.decode_func = comp.decode_func + bbs.by_residual = comp.by_residual + + if q_assign is None: + bbs.coarse_quantization() + else: + bbs.q_assign = q_assign + bbs.reorder_assign() + + if end_list is None: + end_list = index.nlist + + completed = set() + if checkpoint is not None: + assert (start_list, end_list) == (0, index.nlist) + if os.path.exists(checkpoint): + print("recovering checkpoint", checkpoint) + completed = bbs.read_checkpoint(checkpoint) + print(" already completed", len(completed)) + else: + print("no checkpoint: starting from scratch") + + if threaded == 0: + # simple sequential version + + for l in range(start_list, end_list): + bbs.report(l) + q_subset, xq_l, list_ids, xb_l = bbs.prepare_bucket(l) + t0i = time.time() + D, I = comp.block_search(xq_l, xb_l, list_ids, k) + bbs.t_accu[2] += time.time() - t0i + bbs.add_results_to_heap(q_subset, D, list_ids, I) + + elif threaded == 1: + + # parallel version with granularity 1 + + def add_results_and_prefetch(to_add, l): + """ perform the addition for the previous bucket and + prefetch the next (if applicable) """ + if to_add is not None: + bbs.add_results_to_heap(*to_add) + if l < 
index.nlist: + return bbs.prepare_bucket(l) + + prefetched_bucket = bbs.prepare_bucket(start_list) + to_add = None + pool = ThreadPool(1) + + for l in range(start_list, end_list): + bbs.report(l) + prefetched_bucket_a = pool.apply_async( + add_results_and_prefetch, (to_add, l + 1)) + q_subset, xq_l, list_ids, xb_l = prefetched_bucket + bbs.start_t_accu() + D, I = comp.block_search(xq_l, xb_l, list_ids, k) + bbs.stop_t_accu(2) + to_add = q_subset, D, list_ids, I + bbs.start_t_accu() + prefetched_bucket = prefetched_bucket_a.get() + bbs.stop_t_accu(4) + + bbs.add_results_to_heap(*to_add) + pool.close() + else: + + def task_manager_thread( + task, + pool_size, + start_task, + end_task, + completed, + output_queue, + input_queue, + ): + try: + with ThreadPool(pool_size) as pool: + res = [pool.apply_async( + task, + args=(i, output_queue, input_queue)) + for i in range(start_task, end_task) + if i not in completed] + for r in res: + r.get() + pool.close() + pool.join() + output_queue.put(None) + except: + traceback.print_exc() + _thread.interrupt_main() + raise + + def task_manager(*args): + task_manager = threading.Thread( + target=task_manager_thread, + args=args, + ) + task_manager.daemon = True + task_manager.start() + return task_manager + + def prepare_task(task_id, output_queue, input_queue=None): + try: + # print(f"Prepare start: {task_id}") + q_subset, xq_l, list_ids, xb_l = bbs.prepare_bucket(task_id) + output_queue.put((task_id, q_subset, xq_l, list_ids, xb_l)) + # print(f"Prepare end: {task_id}") + except: + traceback.print_exc() + _thread.interrupt_main() + raise + + def compute_task(task_id, output_queue, input_queue): + try: + # print(f"Compute start: {task_id}") + t_wait = 0 + while True: + t0 = time.time() + input_value = input_queue.get() + t_wait += time.time() - t0 + if input_value is None: + # signal for other compute tasks + input_queue.put(None) + break + centroid, q_subset, xq_l, list_ids, xb_l = input_value + # print(f'Compute work start: task {task_id}, centroid {centroid}') + t0 = time.time() + if computation_threads > 1: + D, I = comp.block_search( + xq_l, xb_l, list_ids, k, thread_id=task_id + ) + else: + D, I = comp.block_search(xq_l, xb_l, list_ids, k) + t_compute = time.time() - t0 + # print(f'Compute work end: task {task_id}, centroid {centroid}') + t0 = time.time() + output_queue.put( + (centroid, t_wait, t_compute, q_subset, D, list_ids, I) + ) + t_wait = time.time() - t0 + # print(f"Compute end: {task_id}") + except: + traceback.print_exc() + _thread.interrupt_main() + raise + + prepare_to_compute_queue = Queue(2) + compute_to_main_queue = Queue(2) + compute_task_manager = task_manager( + compute_task, + computation_threads, + 0, + computation_threads, + set(), + compute_to_main_queue, + prepare_to_compute_queue, + ) + prepare_task_manager = task_manager( + prepare_task, + prefetch_threads, + start_list, + end_list, + completed, + prepare_to_compute_queue, + None, + ) + + t_checkpoint = time.time() + while True: + value = compute_to_main_queue.get() + if not value: + break + centroid, t_wait, t_compute, q_subset, D, list_ids, I = value + # to test checkpointing + if centroid == crash_at: + 1 / 0 + bbs.t_accu[2] += t_compute + bbs.t_accu[4] += t_wait + bbs.add_results_to_heap(q_subset, D, list_ids, I) + completed.add(centroid) + bbs.report(centroid) + if checkpoint is not None: + if time.time() - t_checkpoint > checkpoint_freq: + print("writing checkpoint") + bbs.write_checkpoint(checkpoint, completed) + t_checkpoint = time.time() + + 
prepare_task_manager.join() + compute_task_manager.join() + + bbs.tic("finalize heap") + bbs.rh.finalize() + bbs.toc() + + return bbs.rh.D, bbs.rh.I diff --git a/thirdparty/faiss/contrib/client_server.py b/thirdparty/faiss/contrib/client_server.py index 99b5b12fd..ee39798e5 100755 --- a/thirdparty/faiss/contrib/client_server.py +++ b/thirdparty/faiss/contrib/client_server.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool import faiss from typing import List, Tuple diff --git a/thirdparty/faiss/contrib/clustering.py b/thirdparty/faiss/contrib/clustering.py new file mode 100644 index 000000000..e84a7e63f --- /dev/null +++ b/thirdparty/faiss/contrib/clustering.py @@ -0,0 +1,399 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +This contrib module contains a few routines useful to do clustering variants. +""" + +import numpy as np +import faiss +import time +from multiprocessing.pool import ThreadPool + + +try: + import scipy.sparse +except ImportError: + print("scipy not accessible, Python k-means will not work") + +def print_nop(*arg, **kwargs): + pass + +def two_level_clustering(xt, nc1, nc2, rebalance=True, clustering_niter=25, **args): + """ + perform 2-level clustering on a training set xt + nc1 and nc2 are the number of clusters at each level, the final number of + clusters is nc2. Additional arguments are passed to the Kmeans object. + + Rebalance allocates the number of sub-clusters depending on the number of + first-level assignment. + """ + d = xt.shape[1] + + verbose = args.get("verbose", False) + + log = print if verbose else print_nop + + log(f"2-level clustering of {xt.shape} nb 1st level clusters = {nc1} total {nc2}") + log("perform coarse training") + + km = faiss.Kmeans( + d, nc1, niter=clustering_niter, + max_points_per_centroid=2000, + **args + ) + km.train(xt) + + iteration_stats = [km.iteration_stats] + log() + + # coarse centroids + centroids1 = km.centroids + + log("assigning the training set") + t0 = time.time() + _, assign1 = km.assign(xt) + bc = np.bincount(assign1, minlength=nc1) + log(f"done in {time.time() - t0:.2f} s. Sizes of clusters {min(bc)}-{max(bc)}") + o = assign1.argsort() + del km + + if not rebalance: + # make sure the sub-clusters sum up to exactly nc2 + cc = np.arange(nc1 + 1) * nc2 // nc1 + all_nc2 = cc[1:] - cc[:-1] + else: + bc_sum = np.cumsum(bc) + all_nc2 = bc_sum * nc2 // bc_sum[-1] + all_nc2[1:] -= all_nc2[:-1] + assert sum(all_nc2) == nc2 + log(f"nb 2nd-level centroids {min(all_nc2)}-{max(all_nc2)}") + + # train sub-clusters + i0 = 0 + c2 = [] + t0 = time.time() + for c1 in range(nc1): + nc2 = int(all_nc2[c1]) + log(f"[{time.time() - t0:.2f} s] training sub-cluster {c1}/{nc1} nc2={nc2}\r", end="", flush=True) + i1 = i0 + bc[c1] + subset = o[i0:i1] + assert np.all(assign1[subset] == c1) + km = faiss.Kmeans(d, nc2, **args) + xtsub = xt[subset] + km.train(xtsub) + iteration_stats.append(km.iteration_stats) + c2.append(km.centroids) + del km + i0 = i1 + log(f"done in {time.time() - t0:.2f} s") + return np.vstack(c2), iteration_stats + + +def train_ivf_index_with_2level(index, xt, **args): + """ + Applies 2-level clustering to an index_ivf embedded in an index. 
+ """ + # handle PreTransforms + index = faiss.downcast_index(index) + if isinstance(index, faiss.IndexPreTransform): + for i in range(index.chain.size()): + vt = index.chain.at(i) + vt.train(xt) + xt = vt.apply(xt) + train_ivf_index_with_2level(index.index, xt, **args) + index.is_trained = True + return + assert isinstance(index, faiss.IndexIVF) + assert index.metric_type == faiss.METRIC_L2 + # now do 2-level clustering + nc1 = int(np.sqrt(index.nlist)) + print("REBALANCE=", args) + + centroids, _ = two_level_clustering(xt, nc1, index.nlist, **args) + index.quantizer.train(centroids) + index.quantizer.add(centroids) + # finish training + index.train(xt) + + +############################################################################### +# K-means implementation in Python +# +# It relies on DatasetAssign, an abstraction of the training vectors that offers +# the minimal set of operations to perform k-means clustering. +############################################################################### + + +class DatasetAssign: + """Wrapper for a matrix that offers a function to assign the vectors + to centroids. All other implementations offer the same interface""" + + def __init__(self, x): + self.x = np.ascontiguousarray(x, dtype='float32') + + def count(self): + return self.x.shape[0] + + def dim(self): + return self.x.shape[1] + + def get_subset(self, indices): + return self.x[indices] + + def perform_search(self, centroids): + return faiss.knn(self.x, centroids, 1) + + def assign_to(self, centroids, weights=None): + D, I = self.perform_search(centroids) + + I = I.ravel() + D = D.ravel() + n = len(self.x) + if weights is None: + weights = np.ones(n, dtype='float32') + nc = len(centroids) + m = scipy.sparse.csc_matrix( + (weights, I, np.arange(n + 1)), + shape=(nc, n)) + sum_per_centroid = m * self.x + + return I, D, sum_per_centroid + + +class DatasetAssignGPU(DatasetAssign): + """ GPU version of the previous """ + + def __init__(self, x, gpu_id, verbose=False): + DatasetAssign.__init__(self, x) + index = faiss.IndexFlatL2(x.shape[1]) + if gpu_id >= 0: + self.index = faiss.index_cpu_to_gpu( + faiss.StandardGpuResources(), + gpu_id, index) + else: + # -1 -> assign to all GPUs + self.index = faiss.index_cpu_to_all_gpus(index) + + def perform_search(self, centroids): + self.index.reset() + self.index.add(centroids) + return self.index.search(self.x, 1) + + +def sparse_assign_to_dense(xq, xb, xq_norms=None, xb_norms=None): + """ assignment function for xq is sparse, xb is dense + uses a matrix multiplication. The squared norms can be provided if available. + """ + nq = xq.shape[0] + nb = xb.shape[0] + if xb_norms is None: + xb_norms = (xb ** 2).sum(1) + if xq_norms is None: + xq_norms = np.array(xq.power(2).sum(1)) + d2 = xb_norms - 2 * xq @ xb.T + I = d2.argmin(axis=1) + D = d2.ravel()[I + np.arange(nq) * nb] + xq_norms.ravel() + return D, I + + +def sparse_assign_to_dense_blocks( + xq, xb, xq_norms=None, xb_norms=None, qbs=16384, bbs=16384, nt=None): + """ + decomposes the sparse_assign_to_dense function into blocks to avoid a + possible memory blow up. Can be run in multithreaded mode, because scipy's + sparse-dense matrix multiplication is single-threaded. 
+ """ + nq = xq.shape[0] + nb = xb.shape[0] + D = np.empty(nq, dtype="float32") + D.fill(np.inf) + I = -np.ones(nq, dtype=int) + + if xb_norms is None: + xb_norms = (xb ** 2).sum(1) + + def handle_query_block(i): + xq_block = xq[i : i + qbs] + Iblock = I[i : i + qbs] + Dblock = D[i : i + qbs] + if xq_norms is None: + xq_norms_block = np.array(xq_block.power(2).sum(1)) + else: + xq_norms_block = xq_norms[i : i + qbs] + for j in range(0, nb, bbs): + Di, Ii = sparse_assign_to_dense( + xq_block, + xb[j : j + bbs], + xq_norms=xq_norms_block, + xb_norms=xb_norms[j : j + bbs], + ) + if j == 0: + Iblock[:] = Ii + Dblock[:] = Di + else: + mask = Di < Dblock + Iblock[mask] = Ii[mask] + j + Dblock[mask] = Di[mask] + + if nt == 0 or nt == 1 or nq <= qbs: + list(map(handle_query_block, range(0, nq, qbs))) + else: + pool = ThreadPool(nt) + pool.map(handle_query_block, range(0, nq, qbs)) + + return D, I + + +class DatasetAssignSparse(DatasetAssign): + """Wrapper for a matrix that offers a function to assign the vectors + to centroids. All other implementations offer the same interface""" + + def __init__(self, x): + assert x.__class__ == scipy.sparse.csr_matrix + self.x = x + self.squared_norms = np.array(x.power(2).sum(1)) + + def get_subset(self, indices): + return np.array(self.x[indices].todense()) + + def perform_search(self, centroids): + return sparse_assign_to_dense_blocks( + self.x, centroids, xq_norms=self.squared_norms) + + def assign_to(self, centroids, weights=None): + D, I = self.perform_search(centroids) + + I = I.ravel() + D = D.ravel() + n = self.x.shape[0] + if weights is None: + weights = np.ones(n, dtype='float32') + nc = len(centroids) + m = scipy.sparse.csc_matrix( + (weights, I, np.arange(n + 1)), + shape=(nc, n)) + sum_per_centroid = np.array((m * self.x).todense()) + + return I, D, sum_per_centroid + + +def imbalance_factor(k, assign): + assign = np.ascontiguousarray(assign, dtype='int64') + return faiss.imbalance_factor(len(assign), k, faiss.swig_ptr(assign)) + + +def reassign_centroids(hassign, centroids, rs=None): + """ reassign centroids when some of them collapse """ + if rs is None: + rs = np.random + k, d = centroids.shape + nsplit = 0 + empty_cents = np.where(hassign == 0)[0] + + if empty_cents.size == 0: + return 0 + + fac = np.ones(d) + fac[::2] += 1 / 1024. + fac[1::2] -= 1 / 1024. + + # this is a single pass unless there are more than k/2 + # empty centroids + while empty_cents.size > 0: + # choose which centroids to split + probas = hassign.astype('float') - 1 + probas[probas < 0] = 0 + probas /= probas.sum() + nnz = (probas > 0).sum() + + nreplace = min(nnz, empty_cents.size) + cjs = rs.choice(k, size=nreplace, p=probas) + + for ci, cj in zip(empty_cents[:nreplace], cjs): + + c = centroids[cj] + centroids[ci] = c * fac + centroids[cj] = c / fac + + hassign[ci] = hassign[cj] // 2 + hassign[cj] -= hassign[ci] + nsplit += 1 + + empty_cents = empty_cents[nreplace:] + + return nsplit + + +def kmeans(k, data, niter=25, seed=1234, checkpoint=None, verbose=True, + return_stats=False): + """Pure python kmeans implementation. Follows the Faiss C++ version + quite closely, but takes a DatasetAssign instead of a training data + matrix. Also redo is not implemented. 
""" + n, d = data.count(), data.dim() + + log = print if verbose else print_nop + + log(("Clustering %d points in %dD to %d clusters, " + + "%d iterations seed %d") % (n, d, k, niter, seed)) + + rs = np.random.RandomState(seed) + print("preproc...") + t0 = time.time() + # initialization + perm = rs.choice(n, size=k, replace=False) + centroids = data.get_subset(perm) + + iteration_stats = [] + + log(" done") + t_search_tot = 0 + obj = [] + for i in range(niter): + t0s = time.time() + + log('assigning', end='\r', flush=True) + assign, D, sums = data.assign_to(centroids) + + log('compute centroids', end='\r', flush=True) + + t_search_tot += time.time() - t0s; + + err = D.sum() + obj.append(err) + + hassign = np.bincount(assign, minlength=k) + + fac = hassign.reshape(-1, 1).astype('float32') + fac[fac == 0] = 1 # quiet warning + + centroids = sums / fac + + nsplit = reassign_centroids(hassign, centroids, rs) + + s = { + "obj": err, + "time": (time.time() - t0), + "time_search": t_search_tot, + "imbalance_factor": imbalance_factor (k, assign), + "nsplit": nsplit + } + + log((" Iteration %d (%.2f s, search %.2f s): " + "objective=%g imbalance=%.3f nsplit=%d") % ( + i, s["time"], s["time_search"], + err, s["imbalance_factor"], + nsplit) + ) + iteration_stats.append(s) + + if checkpoint is not None: + log('storing centroids in', checkpoint) + np.save(checkpoint, centroids) + + if return_stats: + return centroids, iteration_stats + else: + return centroids diff --git a/thirdparty/faiss/contrib/datasets.py b/thirdparty/faiss/contrib/datasets.py index b255b06ad..f37a2fb6e 100644 --- a/thirdparty/faiss/contrib/datasets.py +++ b/thirdparty/faiss/contrib/datasets.py @@ -310,3 +310,69 @@ def get_groundtruth(self, k=None): assert k <= 100 gt = gt[:, :k] return gt + +class DatasetGIST1M(Dataset): + """ + The original dataset is available at: http://corpus-texmex.irisa.fr/ + (ANN_SIFT1M) + """ + + def __init__(self): + Dataset.__init__(self) + self.d, self.nt, self.nb, self.nq = 960, 100000, 1000000, 10000 + self.basedir = dataset_basedir + 'gist1M/' + + def get_queries(self): + return fvecs_read(self.basedir + "gist_query.fvecs") + + def get_train(self, maxtrain=None): + maxtrain = maxtrain if maxtrain is not None else self.nt + return fvecs_read(self.basedir + "gist_learn.fvecs")[:maxtrain] + + def get_database(self): + return fvecs_read(self.basedir + "gist_base.fvecs") + + def get_groundtruth(self, k=None): + gt = ivecs_read(self.basedir + "gist_groundtruth.ivecs") + if k is not None: + assert k <= 100 + gt = gt[:, :k] + return gt + + +def dataset_from_name(dataset='deep1M', download=False): + """ converts a string describing a dataset to a Dataset object + Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove + """ + + if dataset == 'sift1M': + return DatasetSIFT1M() + + elif dataset == 'gist1M': + return DatasetGIST1M() + + elif dataset.startswith('bigann'): + dbsize = 1000 if dataset == "bigann1B" else int(dataset[6:-1]) + return DatasetBigANN(nb_M=dbsize) + + elif dataset.startswith("deep"): + + szsuf = dataset[4:] + if szsuf[-1] == 'M': + dbsize = 10 ** 6 * int(szsuf[:-1]) + elif szsuf == '1B': + dbsize = 10 ** 9 + elif szsuf[-1] == 'k': + dbsize = 1000 * int(szsuf[:-1]) + else: + assert False, "did not recognize suffix " + szsuf + return DatasetDeep1B(nb=dbsize) + + elif dataset == "music-100": + return DatasetMusic100() + + elif dataset == "glove": + return DatasetGlove(download=download) + + else: + raise RuntimeError("unknown dataset " + dataset) diff --git 
a/thirdparty/faiss/contrib/evaluation.py b/thirdparty/faiss/contrib/evaluation.py index 51a4a7499..1f4068734 100644 --- a/thirdparty/faiss/contrib/evaluation.py +++ b/thirdparty/faiss/contrib/evaluation.py @@ -5,8 +5,10 @@ import numpy as np import unittest +import time +import faiss -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool ############################################################### # Simple functions to evaluate knn results @@ -224,31 +226,49 @@ def compute_PR_for(q): # Functions that compare search results with a reference result. # They are intended for use in tests -def test_ref_knn_with_draws(Dref, Iref, Dnew, Inew): - """ test that knn search results are identical, raise if not """ - np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5) +def _cluster_tables_with_tolerance(tab1, tab2, thr): + """ for two tables, cluster them by merging values closer than thr. + Returns the cluster ids for each table element """ + tab = np.hstack([tab1, tab2]) + tab.sort() + n = len(tab) + diffs = np.ones(n) + diffs[1:] = tab[1:] - tab[:-1] + unique_vals = tab[diffs > thr] + idx1 = np.searchsorted(unique_vals, tab1, side='right') - 1 + idx2 = np.searchsorted(unique_vals, tab2, side='right') - 1 + return idx1, idx2 + + +def check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, rtol=1e-5): + """ test that knn search results are identical, with possible ties. + Raise if not. """ + np.testing.assert_allclose(Dref, Dnew, rtol=rtol) # here we have to be careful because of draws testcase = unittest.TestCase() # because it makes nice error messages for i in range(len(Iref)): if np.all(Iref[i] == Inew[i]): # easy case continue - # we can deduce nothing about the latest line - skip_dis = Dref[i, -1] - for dis in np.unique(Dref): - if dis == skip_dis: + + # otherwise collect elements per distance + r = rtol * Dref[i].max() + + DrefC, DnewC = _cluster_tables_with_tolerance(Dref[i], Dnew[i], r) + + for dis in np.unique(DrefC): + if dis == DrefC[-1]: continue - mask = Dref[i, :] == dis + mask = DrefC == dis testcase.assertEqual(set(Iref[i, mask]), set(Inew[i, mask])) - -def test_ref_range_results(lims_ref, Dref, Iref, - lims_new, Dnew, Inew): +def check_ref_range_results(Lref, Dref, Iref, + Lnew, Dnew, Inew): """ compare range search results wrt. a reference result, throw if it fails """ - np.testing.assert_array_equal(lims_ref, lims_new) - nq = len(lims_ref) - 1 + np.testing.assert_array_equal(Lref, Lnew) + nq = len(Lref) - 1 for i in range(nq): - l0, l1 = lims_ref[i], lims_ref[i + 1] + l0, l1 = Lref[i], Lref[i + 1] Ii_ref = Iref[l0:l1] Ii_new = Inew[l0:l1] Di_ref = Dref[l0:l1] @@ -264,3 +284,192 @@ def sort_by_ids(I, D): (Ii_new, Di_new) = sort_by_ids(Ii_new, Di_new) np.testing.assert_array_equal(Ii_ref, Ii_new) np.testing.assert_array_almost_equal(Di_ref, Di_new, decimal=5) + + +############################################################### +# OperatingPoints functions +# this is the Python version of the AutoTune object in C++ + +class OperatingPoints: + """ + Manages a set of search parameters with associated performance and time. + Keeps the Pareto optimal points. 
+ """ + + def __init__(self): + # list of (key, perf, t) + self.operating_points = [ + # (self.do_nothing_key(), 0.0, 0.0) + ] + self.suboptimal_points = [] + + def compare_keys(self, k1, k2): + """ return -1 if k1 > k2, 1 if k2 > k1, 0 otherwise """ + raise NotImplemented + + def do_nothing_key(self): + """ parameters to say we do noting, takes 0 time and has 0 performance""" + raise NotImplemented + + def is_pareto_optimal(self, perf_new, t_new): + for _, perf, t in self.operating_points: + if perf >= perf_new and t <= t_new: + return False + return True + + def predict_bounds(self, key): + """ predicts the bound on time and performance """ + min_time = 0.0 + max_perf = 1.0 + for key2, perf, t in self.operating_points + self.suboptimal_points: + cmp = self.compare_keys(key, key2) + if cmp > 0: # key2 > key + if t > min_time: + min_time = t + if cmp < 0: # key2 < key + if perf < max_perf: + max_perf = perf + return max_perf, min_time + + def should_run_experiment(self, key): + (max_perf, min_time) = self.predict_bounds(key) + return self.is_pareto_optimal(max_perf, min_time) + + def add_operating_point(self, key, perf, t): + if self.is_pareto_optimal(perf, t): + i = 0 + # maybe it shadows some other operating point completely? + while i < len(self.operating_points): + op_Ls, perf2, t2 = self.operating_points[i] + if perf >= perf2 and t < t2: + self.suboptimal_points.append( + self.operating_points.pop(i)) + else: + i += 1 + self.operating_points.append((key, perf, t)) + return True + else: + self.suboptimal_points.append((key, perf, t)) + return False + + +class OperatingPointsWithRanges(OperatingPoints): + """ + Set of parameters that are each picked from a discrete range of values. + An increase of each parameter is assumed to make the operation slower + and more accurate. + A key = int array of indices in the ordered set of parameters. 
+ """ + + def __init__(self): + OperatingPoints.__init__(self) + # list of (name, values) + self.ranges = [] + + def add_range(self, name, values): + self.ranges.append((name, values)) + + def compare_keys(self, k1, k2): + if np.all(k1 >= k2): + return 1 + if np.all(k2 >= k1): + return -1 + return 0 + + def do_nothing_key(self): + return np.zeros(len(self.ranges), dtype=int) + + def num_experiments(self): + return np.prod([len(values) for name, values in self.ranges]) + + def cno_to_key(self, cno): + """Convert a sequential experiment number to a key""" + k = np.zeros(len(self.ranges), dtype=int) + for i, (name, values) in enumerate(self.ranges): + k[i] = cno % len(values) + cno //= len(values) + assert cno == 0 + return k + + def get_parameters(self, k): + """Convert a key to a dictionary with parameter values""" + return { + name: values[k[i]] + for i, (name, values) in enumerate(self.ranges) + } + + def restrict_range(self, name, max_val): + """ remove too large values from a range""" + for name2, values in self.ranges: + if name == name2: + val2 = [v for v in values if v < max_val] + values[:] = val2 + return + raise RuntimeError(f"parameter {name} not found") + + +############################################################### +# Timer object + +class TimerIter: + def __init__(self, timer): + self.ts = [] + self.runs = timer.runs + self.timer = timer + if timer.nt >= 0: + faiss.omp_set_num_threads(timer.nt) + + def __next__(self): + timer = self.timer + self.runs -= 1 + self.ts.append(time.time()) + total_time = self.ts[-1] - self.ts[0] if len(self.ts) >= 2 else 0 + if self.runs == -1 or total_time > timer.max_secs: + if timer.nt >= 0: + faiss.omp_set_num_threads(timer.remember_nt) + ts = np.array(self.ts) + times = ts[1:] - ts[:-1] + if len(times) == timer.runs: + timer.times = times[timer.warmup :] + else: + # if timeout, we use all the runs + timer.times = times[:] + raise StopIteration + +class RepeatTimer: + """ + This is yet another timer object. It is adapted to Faiss by + taking a number of openmp threads to set on input. It should be called + in an explicit loop as: + + timer = RepeatTimer(warmup=1, nt=1, runs=6) + + for _ in timer: + # perform operation + + print(f"time={timer.get_ms():.1f} ± {timer.get_ms_std():.1f} ms") + + the same timer can be re-used. In that case it is reset each time it + enters a loop. It focuses on ms-scale times because for second scale + it's usually less relevant to repeat the operation. 
+ """ + def __init__(self, warmup=0, nt=-1, runs=1, max_secs=np.inf): + assert warmup < runs + self.warmup = warmup + self.nt = nt + self.runs = runs + self.max_secs = max_secs + self.remember_nt = faiss.omp_get_max_threads() + + def __iter__(self): + return TimerIter(self) + + def ms(self): + return np.mean(self.times) * 1000 + + def ms_std(self): + return np.std(self.times) * 1000 if len(self.times) > 1 else 0.0 + + def nruns(self): + """ effective number of runs (may be lower than runs - warmup due to timeout)""" + return len(self.times) diff --git a/thirdparty/faiss/contrib/exhaustive_search.py b/thirdparty/faiss/contrib/exhaustive_search.py index 4f3bd1a89..eadb097fa 100644 --- a/thirdparty/faiss/contrib/exhaustive_search.py +++ b/thirdparty/faiss/contrib/exhaustive_search.py @@ -11,7 +11,6 @@ LOG = logging.getLogger(__name__) - def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2): """Computes the exact KNN search results for a dataset that possibly does not fit in RAM but for which we have an iterator that @@ -20,7 +19,8 @@ def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2): LOG.info("knn_ground_truth queries size %s k=%d" % (xq.shape, k)) t0 = time.time() nq, d = xq.shape - rh = faiss.ResultHeap(nq, k) + keep_max = faiss.is_similarity_metric(metric_type) + rh = faiss.ResultHeap(nq, k, keep_max=keep_max) index = faiss.IndexFlat(d, metric_type) if faiss.get_num_gpus(): @@ -50,47 +50,98 @@ def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2): -def range_search_gpu(xq, r2, index_gpu, index_cpu): +def range_search_gpu(xq, r2, index_gpu, index_cpu, gpu_k=1024): """GPU does not support range search, so we emulate it with knn search + fallback to CPU index. - The index_cpu can either be a CPU index or a numpy table that will - be used to construct a Flat index if needed. + The index_cpu can either be: + - a CPU index that supports range search + - a numpy table, that will be used to construct a Flat index if needed. + - None. 
In that case, at most gpu_k results will be returned """ nq, d = xq.shape - LOG.debug("GPU search %d queries" % nq) - k = min(index_gpu.ntotal, 1024) + is_binary_index = isinstance(index_gpu, faiss.IndexBinary) + keep_max = faiss.is_similarity_metric(index_gpu.metric_type) + r2 = int(r2) if is_binary_index else float(r2) + k = min(index_gpu.ntotal, gpu_k) + LOG.debug( + f"GPU search {nq} queries with {k=:} {is_binary_index=:} {keep_max=:}") + t0 = time.time() D, I = index_gpu.search(xq, k) - if index_gpu.metric_type == faiss.METRIC_L2: - mask = D[:, k - 1] < r2 - else: - mask = D[:, k - 1] > r2 - if mask.sum() > 0: - LOG.debug("CPU search remain %d" % mask.sum()) - if isinstance(index_cpu, np.ndarray): - # then it in fact an array that we have to make flat - xb = index_cpu - index_cpu = faiss.IndexFlat(d, index_gpu.metric_type) - index_cpu.add(xb) - lim_remain, D_remain, I_remain = index_cpu.range_search(xq[mask], r2) + t1 = time.time() - t0 + if is_binary_index: + assert d * 8 < 32768 # let's compact the distance matrix + D = D.astype('int16') + t2 = 0 + lim_remain = None + if index_cpu is not None: + if not keep_max: + mask = D[:, k - 1] < r2 + else: + mask = D[:, k - 1] > r2 + if mask.sum() > 0: + LOG.debug("CPU search remain %d" % mask.sum()) + t0 = time.time() + if isinstance(index_cpu, np.ndarray): + # then it in fact an array that we have to make flat + xb = index_cpu + if is_binary_index: + index_cpu = faiss.IndexBinaryFlat(d * 8) + else: + index_cpu = faiss.IndexFlat(d, index_gpu.metric_type) + index_cpu.add(xb) + lim_remain, D_remain, I_remain = index_cpu.range_search(xq[mask], r2) + if is_binary_index: + D_remain = D_remain.astype('int16') + t2 = time.time() - t0 LOG.debug("combine") - D_res, I_res = [], [] - nr = 0 - for i in range(nq): - if not mask[i]: - if index_gpu.metric_type == faiss.METRIC_L2: - nv = (D[i, :] < r2).sum() + t0 = time.time() + + CombinerRangeKNN = ( + faiss.CombinerRangeKNNint16 if is_binary_index else + faiss.CombinerRangeKNNfloat + ) + + combiner = CombinerRangeKNN(nq, k, r2, keep_max) + if True: + sp = faiss.swig_ptr + combiner.I = sp(I) + combiner.D = sp(D) + # combiner.set_knn_result(sp(I), sp(D)) + if lim_remain is not None: + combiner.mask = sp(mask) + combiner.D_remain = sp(D_remain) + combiner.lim_remain = sp(lim_remain.view("int64")) + combiner.I_remain = sp(I_remain) + # combiner.set_range_result(sp(mask), sp(lim_remain.view("int64")), sp(D_remain), sp(I_remain)) + L_res = np.empty(nq + 1, dtype='int64') + combiner.compute_sizes(sp(L_res)) + nres = L_res[-1] + D_res = np.empty(nres, dtype=D.dtype) + I_res = np.empty(nres, dtype='int64') + combiner.write_result(sp(D_res), sp(I_res)) + else: + D_res, I_res = [], [] + nr = 0 + for i in range(nq): + if not mask[i]: + if index_gpu.metric_type == faiss.METRIC_L2: + nv = (D[i, :] < r2).sum() + else: + nv = (D[i, :] > r2).sum() + D_res.append(D[i, :nv]) + I_res.append(I[i, :nv]) else: - nv = (D[i, :] > r2).sum() - D_res.append(D[i, :nv]) - I_res.append(I[i, :nv]) - else: - l0, l1 = lim_remain[nr], lim_remain[nr + 1] - D_res.append(D_remain[l0:l1]) - I_res.append(I_remain[l0:l1]) - nr += 1 - lims = np.cumsum([0] + [len(di) for di in D_res]) - return lims, np.hstack(D_res), np.hstack(I_res) + l0, l1 = lim_remain[nr], lim_remain[nr + 1] + D_res.append(D_remain[l0:l1]) + I_res.append(I_remain[l0:l1]) + nr += 1 + L_res = np.cumsum([0] + [len(di) for di in D_res]) + D_res = np.hstack(D_res) + I_res = np.hstack(I_res) + t3 = time.time() - t0 + LOG.debug(f"times {t1:.3f}s {t2:.3f}s {t3:.3f}s") + return L_res, 
D_res, I_res def range_ground_truth(xq, db_iterator, threshold, metric_type=faiss.METRIC_L2, @@ -216,6 +267,7 @@ def range_search_max_results(index, query_iterator, radius, """ # TODO: all result manipulations are in python, should move to C++ if perf # critical + is_binary_index = isinstance(index, faiss.IndexBinary) if min_results is None: assert max_results is not None @@ -233,6 +285,8 @@ def range_search_max_results(index, query_iterator, radius, co = faiss.GpuMultipleClonerOptions() co.shard = shard index_gpu = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=ngpu) + else: + index_gpu = None t_start = time.time() t_search = t_post_process = 0 @@ -241,7 +295,8 @@ def range_search_max_results(index, query_iterator, radius, for xqi in query_iterator: t0 = time.time() - if ngpu > 0: + LOG.debug(f"searching {len(xqi)} vectors") + if index_gpu: lims_i, Di, Ii = range_search_gpu(xqi, radius, index_gpu, index) else: lims_i, Di, Ii = index.range_search(xqi, radius) @@ -251,8 +306,7 @@ def range_search_max_results(index, query_iterator, radius, qtot += len(xqi) t1 = time.time() - if xqi.dtype != np.float32: - # for binary indexes + if is_binary_index: # weird Faiss quirk that returns floats for Hamming distances Di = Di.astype('int16') diff --git a/thirdparty/faiss/contrib/inspect_tools.py b/thirdparty/faiss/contrib/inspect_tools.py index 1182156a8..cc22ff536 100644 --- a/thirdparty/faiss/contrib/inspect_tools.py +++ b/thirdparty/faiss/contrib/inspect_tools.py @@ -68,6 +68,20 @@ def get_LinearTransform_matrix(pca): return A, b +def make_LinearTransform_matrix(A, b=None): + """ make a linear transform from a matrix and a bias term (optional)""" + d_out, d_in = A.shape + if b is not None: + assert b.shape == (d_out, ) + lt = faiss.LinearTransform(d_in, d_out, b is not None) + faiss.copy_array_to_vector(A.ravel(), lt.A) + if b is not None: + faiss.copy_array_to_vector(b, lt.b) + lt.is_trained = True + lt.set_is_orthonormal() + return lt + + def get_additive_quantizer_codebooks(aq): """ return to codebooks of an additive quantizer """ codebooks = faiss.vector_to_array(aq.codebooks).reshape(-1, aq.d) @@ -82,3 +96,16 @@ def get_flat_data(index): """ copy and return the data matrix in an IndexFlat """ xb = faiss.vector_to_array(index.codes).view("float32") return xb.reshape(index.ntotal, index.d) + + +def get_NSG_neighbors(nsg): + """ get the neighbor list for the vectors stored in the NSG structure, as + a N-by-K matrix of indices """ + graph = nsg.get_final_graph() + neighbors = np.zeros((graph.N, graph.K), dtype='int32') + faiss.memcpy( + faiss.swig_ptr(neighbors), + graph.data, + neighbors.nbytes + ) + return neighbors diff --git a/thirdparty/faiss/contrib/ivf_tools.py b/thirdparty/faiss/contrib/ivf_tools.py index 8d1e2be6f..26ada886a 100644 --- a/thirdparty/faiss/contrib/ivf_tools.py +++ b/thirdparty/faiss/contrib/ivf_tools.py @@ -6,6 +6,9 @@ import numpy as np import faiss +from faiss.contrib.inspect_tools import get_invlist_sizes + + def add_preassigned(index_ivf, x, a, ids=None): """ Add elements to an IVF index, where the assignment is already computed @@ -25,7 +28,9 @@ def add_preassigned(index_ivf, x, a, ids=None): def search_preassigned(index_ivf, xq, k, list_nos, coarse_dis=None): """ - Perform a search in the IVF index, with predefined lists to search into + Perform a search in the IVF index, with predefined lists to search into. + Supports indexes with pretransforms (as opposed to the + IndexIVF.search_preassigned, that cannot be applied with pretransform). 
""" n, d = xq.shape if isinstance(index_ivf, faiss.IndexBinaryIVF): @@ -37,26 +42,20 @@ def search_preassigned(index_ivf, xq, k, list_nos, coarse_dis=None): assert d == index_ivf.d assert list_nos.shape == (n, index_ivf.nprobe) - # the coarse distances are used in IVFPQ with L2 distance and by_residual=True - # otherwise we provide dummy coarse_dis + # the coarse distances are used in IVFPQ with L2 distance and + # by_residual=True otherwise we provide dummy coarse_dis if coarse_dis is None: coarse_dis = np.zeros((n, index_ivf.nprobe), dtype=dis_type) else: assert coarse_dis.shape == (n, index_ivf.nprobe) - D = np.empty((n, k), dtype=dis_type) - I = np.empty((n, k), dtype='int64') - - sp = faiss.swig_ptr - index_ivf.search_preassigned( - n, sp(xq), k, - sp(list_nos), sp(coarse_dis), sp(D), sp(I), False) - return D, I + return index_ivf.search_preassigned(xq, k, list_nos, coarse_dis) def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None): """ - Perform a range search in the IVF index, with predefined lists to search into + Perform a range search in the IVF index, with predefined lists to + search into """ n, d = x.shape if isinstance(index_ivf, faiss.IndexBinaryIVF): @@ -65,8 +64,8 @@ def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None): else: dis_type = "float32" - # the coarse distances are used in IVFPQ with L2 distance and by_residual=True - # otherwise we provide dummy coarse_dis + # the coarse distances are used in IVFPQ with L2 distance and + # by_residual=True otherwise we provide dummy coarse_dis if coarse_dis is None: coarse_dis = np.empty((n, index_ivf.nprobe), dtype=dis_type) else: @@ -78,7 +77,7 @@ def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None): res = faiss.RangeSearchResult(n) sp = faiss.swig_ptr - index_ivf.range_search_preassigned( + index_ivf.range_search_preassigned_c( n, sp(x), radius, sp(list_nos), sp(coarse_dis), res @@ -89,3 +88,56 @@ def range_search_preassigned(index_ivf, x, radius, list_nos, coarse_dis=None): dist = faiss.rev_swig_ptr(res.distances, num_results).copy() indices = faiss.rev_swig_ptr(res.labels, num_results).copy() return lims, dist, indices + + +def replace_ivf_quantizer(index_ivf, new_quantizer): + """ replace the IVF quantizer with a flat quantizer and return the + old quantizer""" + if new_quantizer.ntotal == 0: + centroids = index_ivf.quantizer.reconstruct_n() + new_quantizer.train(centroids) + new_quantizer.add(centroids) + else: + assert new_quantizer.ntotal == index_ivf.nlist + + # cleanly dealloc old quantizer + old_own = index_ivf.own_fields + index_ivf.own_fields = False + old_quantizer = faiss.downcast_index(index_ivf.quantizer) + old_quantizer.this.own(old_own) + index_ivf.quantizer = new_quantizer + + if hasattr(index_ivf, "referenced_objects"): + index_ivf.referenced_objects.append(new_quantizer) + else: + index_ivf.referenced_objects = [new_quantizer] + return old_quantizer + + +def permute_invlists(index_ivf, perm): + """ Apply some permutation to the inverted lists, and modify the quantizer + entries accordingly. + Perm is an array of size nlist, where old_index = perm[new_index] + """ + nlist, = perm.shape + assert index_ivf.nlist == nlist + quantizer = faiss.downcast_index(index_ivf.quantizer) + assert quantizer.ntotal == index_ivf.nlist + perm = np.ascontiguousarray(perm, dtype='int64') + + # just make sure it's a permutation... 
+ bc = np.bincount(perm, minlength=nlist) + assert np.all(bc == np.ones(nlist, dtype=int)) + + # handle quantizer + quantizer.permute_entries(perm) + + # handle inverted lists + invlists = faiss.downcast_InvertedLists(index_ivf.invlists) + invlists.permute_invlists(faiss.swig_ptr(perm)) + + +def sort_invlists_by_size(index_ivf): + invlist_sizes = get_invlist_sizes(index_ivf.invlists) + perm = np.argsort(invlist_sizes) + permute_invlists(index_ivf, perm) diff --git a/thirdparty/faiss/contrib/ondisk.py b/thirdparty/faiss/contrib/ondisk.py index 37288a16d..26a95f44f 100644 --- a/thirdparty/faiss/contrib/ondisk.py +++ b/thirdparty/faiss/contrib/ondisk.py @@ -9,11 +9,15 @@ LOG = logging.getLogger(__name__) -def merge_ondisk(trained_index: faiss.Index, - shard_fnames: List[str], - ivfdata_fname: str) -> None: - """ Add the contents of the indexes stored in shard_fnames into the index - trained_index. The on-disk data is stored in ivfdata_fname """ + +def merge_ondisk( + trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str +) -> None: + """Add the contents of the indexes stored in shard_fnames into the index + trained_index. The on-disk data is stored in ivfdata_fname""" + assert not isinstance( + trained_index, faiss.IndexIVFPQR + ), "IndexIVFPQR is not supported as an on disk index." # merge the images into an on-disk index # first load the inverted lists ivfs = [] @@ -38,8 +42,8 @@ def merge_ondisk(trained_index: faiss.Index, # prepare the output inverted lists. They will be written # to merged_index.ivfdata invlists = faiss.OnDiskInvertedLists( - index_ivf.nlist, index_ivf.code_size, - ivfdata_fname) + index_ivf.nlist, index_ivf.code_size, ivfdata_fname + ) # merge all the inverted lists ivf_vector = faiss.InvertedListsPtrVector() diff --git a/thirdparty/faiss/contrib/rpc.py b/thirdparty/faiss/contrib/rpc.py index f9d7ee828..cf8986226 100755 --- a/thirdparty/faiss/contrib/rpc.py +++ b/thirdparty/faiss/contrib/rpc.py @@ -10,14 +10,18 @@ Uses pickle for serialization and the socket interface. 
""" -import os,pdb,pickle,time,errno,sys,_thread,traceback,socket,threading,gc - +import os +import pickle +import sys +import _thread +import traceback +import socket import logging LOG = logging.getLogger(__name__) # default -PORT=12032 +PORT = 12032 ######################################################################### @@ -30,6 +34,7 @@ def inline_send_handle(f, conn): pickle.dump(size, conn) conn.write(f.read(size)) + def inline_send_string(s, conn): size = len(s) pickle.dump(size, conn) @@ -128,7 +133,7 @@ def one_function(self): f=getattr(self,fname) except AttributeError: st = AttributeError("unknown method "+fname) - self.log("unknown method ") + self.log("unknown method") try: ret = f(*args) @@ -198,7 +203,7 @@ def __init__(self, HOST, port=PORT, v6=False): socktype = socket.AF_INET6 if v6 else socket.AF_INET sock = socket.socket(socktype, socket.SOCK_STREAM) - LOG.info("connecting", HOST, port, socktype) + LOG.info("connecting to %s:%d, socket type: %s", HOST, port, socktype) sock.connect((HOST, port)) self.sock = sock self.fs = FileSock(sock) @@ -226,13 +231,13 @@ def run_server(new_handler, port=PORT, report_to_file=None, v6=False): s = socket.socket(socktype, socket.SOCK_STREAM) s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - LOG.info("bind %s:%d" % (HOST, port)) + LOG.info("bind %s:%d", HOST, port) s.bind((HOST, port)) s.listen(5) LOG.info("accepting connections") if report_to_file is not None: - LOG.info('storing host+port in', report_to_file) + LOG.info('storing host+port in %s', report_to_file) open(report_to_file, 'w').write('%s:%d ' % (socket.gethostname(), port)) while True: @@ -242,10 +247,10 @@ def run_server(new_handler, port=PORT, report_to_file=None, v6=False): if e[1]=='Interrupted system call': continue raise - LOG.info('Connected by', addr, end=' ') + LOG.info('Connected to %s', addr) ibs = new_handler(conn) tid = _thread.start_new_thread(ibs.exec_loop,()) - LOG.info("tid",tid) + LOG.debug("Thread ID: %d", tid) diff --git a/thirdparty/faiss/contrib/torch_utils.py b/thirdparty/faiss/contrib/torch_utils.py index 9799c076b..790c295e4 100644 --- a/thirdparty/faiss/contrib/torch_utils.py +++ b/thirdparty/faiss/contrib/torch_utils.py @@ -41,7 +41,7 @@ def swig_ptr_from_HalfTensor(x): assert x.dtype == torch.float16 # no canonical half type in C/C++ return faiss.cast_integer_to_void_ptr( - x.storage().data_ptr() + x.storage_offset() * 4) + x.storage().data_ptr() + x.storage_offset() * 2) def swig_ptr_from_FloatTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -55,7 +55,7 @@ def swig_ptr_from_IntTensor(x): assert x.is_contiguous() assert x.dtype == torch.int32, 'dtype=%s' % x.dtype return faiss.cast_integer_to_int_ptr( - x.storage().data_ptr() + x.storage_offset() * 8) + x.storage().data_ptr() + x.storage_offset() * 4) def swig_ptr_from_IndicesTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -204,7 +204,7 @@ def torch_replacement_train(self, x): def torch_replacement_search(self, x, k, D=None, I=None): if type(x) is np.ndarray: # forward to faiss __init__.py base method - return self.search_numpy(x, k, D, I) + return self.search_numpy(x, k, D=D, I=I) assert type(x) is torch.Tensor n, d = x.shape @@ -240,7 +240,7 @@ def torch_replacement_search(self, x, k, D=None, I=None): def torch_replacement_search_and_reconstruct(self, x, k, D=None, I=None, R=None): if type(x) is np.ndarray: # Forward to faiss __init__.py base method - return self.search_and_reconstruct_numpy(x, k, D, I, R) + return 
self.search_and_reconstruct_numpy(x, k, D=D, I=I, R=R) assert type(x) is torch.Tensor n, d = x.shape @@ -319,7 +319,10 @@ def torch_replacement_reconstruct(self, key, x=None): return x - def torch_replacement_reconstruct_n(self, n0, ni, x=None): + def torch_replacement_reconstruct_n(self, n0=0, ni=-1, x=None): + if ni == -1: + ni = self.ntotal + # No tensor inputs are required, but with importing this module, we # assume that the default should be torch tensors. If we are passed a # numpy array, however, assume that the user is overriding this default @@ -490,10 +493,10 @@ def torch_replacement_sa_decode(self, codes, x=None): handle_torch_Index(the_class) # allows torch tensor usage with bfKnn -def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2): +def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1): if type(xb) is np.ndarray: # Forward to faiss __init__.py base method - return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric) + return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device) nb, d = xb.size() if xb.is_contiguous(): @@ -570,6 +573,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI args.outDistances = D_ptr args.outIndices = I_ptr args.outIndicesType = I_type + args.device = device with using_stream(res): faiss.bfKnn(res, args) @@ -579,7 +583,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI torch_replace_method(faiss_module, 'knn_gpu', torch_replacement_knn_gpu, True, True) # allows torch tensor usage with bfKnn for all pairwise distances -def torch_replacement_pairwise_distance_gpu(res, xq, xb, D=None, metric=faiss.METRIC_L2): +def torch_replacement_pairwise_distance_gpu(res, xq, xb, D=None, metric=faiss.METRIC_L2, device=-1): if type(xb) is np.ndarray: # Forward to faiss __init__.py base method return faiss.pairwise_distance_gpu_numpy(res, xq, xb, D, metric) @@ -643,6 +647,7 @@ def torch_replacement_pairwise_distance_gpu(res, xq, xb, D=None, metric=faiss.ME args.queryType = xq_type args.numQueries = nq args.outDistances = D_ptr + args.device = device with using_stream(res): faiss.bfKnn(res, args) diff --git a/thirdparty/faiss/demos/CMakeLists.txt b/thirdparty/faiss/demos/CMakeLists.txt index a2b8596dd..0e0bc4171 100644 --- a/thirdparty/faiss/demos/CMakeLists.txt +++ b/thirdparty/faiss/demos/CMakeLists.txt @@ -21,3 +21,6 @@ target_link_libraries(demo_sift1M PRIVATE faiss) add_executable(demo_weighted_kmeans EXCLUDE_FROM_ALL demo_weighted_kmeans.cpp) target_link_libraries(demo_weighted_kmeans PRIVATE faiss) + +add_executable(demo_residual_quantizer EXCLUDE_FROM_ALL demo_residual_quantizer.cpp) +target_link_libraries(demo_residual_quantizer PRIVATE faiss) diff --git a/thirdparty/faiss/demos/demo_imi_flat.cpp b/thirdparty/faiss/demos/demo_imi_flat.cpp index d35a4c597..1480422ac 100644 --- a/thirdparty/faiss/demos/demo_imi_flat.cpp +++ b/thirdparty/faiss/demos/demo_imi_flat.cpp @@ -43,7 +43,7 @@ int main() { // // We here assume that its lifespan of this coarse quantizer will cover the // lifespan of the inverted-file quantizer IndexIVFFlat below - // With dynamic allocation, one may give the responsability to free the + // With dynamic allocation, one may give the responsibility to free the // quantizer to the inverted-file index (with attribute do_delete_quantizer) // // Note: a regular clustering algorithm would be defined as: @@ -132,7 +132,7 @@ int main() { k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); 
std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff --git a/thirdparty/faiss/demos/demo_imi_pq.cpp b/thirdparty/faiss/demos/demo_imi_pq.cpp index a5af6798d..a2af65e79 100644 --- a/thirdparty/faiss/demos/demo_imi_pq.cpp +++ b/thirdparty/faiss/demos/demo_imi_pq.cpp @@ -44,7 +44,7 @@ int main() { // // We here assume that its lifespan of this coarse quantizer will cover the // lifespan of the inverted-file quantizer IndexIVFFlat below - // With dynamic allocation, one may give the responsability to free the + // With dynamic allocation, one may give the responsibility to free the // quantizer to the inverted-file index (with attribute do_delete_quantizer) // // Note: a regular clustering algorithm would be defined as: @@ -126,7 +126,7 @@ int main() { nb); std::vector database(nb * d); - std::vector ids(nb); + std::vector ids(nb); for (size_t i = 0; i < nb; i++) { for (size_t j = 0; j < d; j++) { database[i * d + j] = distrib(rng); @@ -169,7 +169,7 @@ int main() { // - given a vector float *x, finding which k centroids are // closest to it (ie to find the nearest neighbors) can be done with // - // faiss::Index::idx_t *centroid_ids = new faiss::Index::idx_t[k]; + // faiss::idx_t *centroid_ids = new faiss::idx_t[k]; // float *distances = new float[k]; // index.quantizer->search (1, x, k, dis, centroids_ids); // @@ -184,7 +184,7 @@ int main() { k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff --git a/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp b/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp index 15af713f9..9773cac29 100644 --- a/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp +++ b/thirdparty/faiss/demos/demo_ivfpq_indexing.cpp @@ -18,7 +18,7 @@ double elapsed() { struct timeval tv; - gettimeofday(&tv, NULL); + gettimeofday(&tv, nullptr); return tv.tv_sec + tv.tv_usec * 1e-6; } @@ -118,7 +118,7 @@ int main() { k, nq); - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff --git a/thirdparty/faiss/demos/demo_nndescent.cpp b/thirdparty/faiss/demos/demo_nndescent.cpp index 6b83256a4..34594b035 100644 --- a/thirdparty/faiss/demos/demo_nndescent.cpp +++ b/thirdparty/faiss/demos/demo_nndescent.cpp @@ -58,8 +58,8 @@ int main(void) { } int k = 5; - std::vector nns(k * nq); - std::vector gt_nns(k * nq); + std::vector nns(k * nq); + std::vector gt_nns(k * nq); std::vector dis(k * nq); auto start = high_resolution_clock::now(); diff --git a/thirdparty/faiss/demos/demo_residual_quantizer.cpp b/thirdparty/faiss/demos/demo_residual_quantizer.cpp new file mode 100644 index 000000000..6a0fe4e7f --- /dev/null +++ b/thirdparty/faiss/demos/demo_residual_quantizer.cpp @@ -0,0 +1,297 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* This demo file shows how to: + * - use a DistanceComputer to compute distances with encoded vectors + * - in the context of an IVF, how to split an additive quantizer into an + * AdditiveCoarseQuantizer and a ResidualQuantizer, in two different ways, with + * and without storing the prefix. 
+ */ + +int main() { + /****************************************** + * Generate a test dataset + ******************************************/ + using idx_t = faiss::idx_t; + size_t d = 128; + size_t nt = 10000; + size_t nb = 10000; + size_t nq = 100; + double t0 = faiss::getmillisecs(); + + auto tic = [t0]() { + printf("[%.3f s] ", (faiss::getmillisecs() - t0) / 1000); + }; + + tic(); + printf("samping dataset of %zd dim vectors, Q %zd B %zd T %zd\n", + d, + nq, + nb, + nt); + + std::vector buf(d * (nq + nt + nb)); + faiss::rand_smooth_vectors(nq + nt + nb, d, buf.data(), 1234); + const float* xt = buf.data(); + const float* xb = buf.data() + nt * d; + const float* xq = buf.data() + (nt + nb) * d; + + idx_t k = 10; + std::vector gt(k * nq); + std::vector unused(k * nq); + tic(); + printf("compute ground truth, k=%zd\n", k); + faiss::knn_L2sqr(xq, xb, d, nq, nb, k, unused.data(), gt.data()); + + // a function to compute the accuracy + auto accuracy = [&](const idx_t* I) { + idx_t accu = 0; + for (idx_t q = 0; q < nq; q++) { + accu += faiss::ranklist_intersection_size( + k, gt.data() + q * k, k, I + q * k); + } + return double(accu) / (k * nq); + }; + + /****************************************** + * Prepare the residual quantizer + ******************************************/ + + faiss::ResidualQuantizer rq( + d, 7, 6, faiss::AdditiveQuantizer::ST_norm_qint8); + // do cheap an inaccurate training + rq.cp.niter = 5; + rq.max_beam_size = 5; + rq.train_type = 0; + tic(); + printf("training the residual quantizer beam_size=%d\n", rq.max_beam_size); + rq.train(nt, xt); + + tic(); + printf("encoding the database, code_size=%zd\n", rq.code_size); + size_t code_size = rq.code_size; + std::vector raw_codes(nb * code_size); + rq.compute_codes(xb, raw_codes.data(), nb); + + /**************************************************************** + * Make an index that uses that residual quantizer + * Verify that a distance computer gives the same distances + ****************************************************************/ + { + faiss::IndexResidualQuantizer index( + rq.d, rq.nbits, faiss::METRIC_L2, rq.search_type); + + // override trained index + index.rq = rq; + index.is_trained = true; + + // override vectors + index.codes = raw_codes; + index.ntotal = nb; + + tic(); + printf("IndexResidualQuantizer ready, searching\n"); + + std::vector D(k * nq); + std::vector I(k * nq); + index.search(nq, xq, k, D.data(), I.data()); + + tic(); + printf("Accuracy (intersection @ %zd): %.3f\n", k, accuracy(I.data())); + std::unique_ptr dc( + index.get_FlatCodesDistanceComputer()); + + float max_diff12 = 0, max_diff13 = 0; + + for (idx_t q = 0; q < nq; q++) { + const float* query = xq + q * d; + dc->set_query(query); + for (int i = 0; i < k; i++) { + // 3 ways of computing the same distance + + // distance returned by the index + float dis1 = D[q * k + i]; + + // distance returned by the DistanceComputer that accesses the + // index + idx_t db_index = I[q * k + i]; + float dis2 = (*dc)(db_index); + + // distance computer from a code that does not belong to the + // index + const uint8_t* code = raw_codes.data() + code_size * db_index; + float dis3 = dc->distance_to_code(code); + + max_diff12 = std::max(std::abs(dis1 - dis2), max_diff12); + max_diff13 = std::max(std::abs(dis1 - dis3), max_diff13); + } + } + tic(); + printf("Max DistanceComputer discrepancy 1-2: %g 1-3: %g\n", + max_diff12, + max_diff13); + } + + /**************************************************************** + * Make an IVF index that uses the first 2 
levels as a coarse quantizer + * The IVF codes contain the full code (ie. redundant with the coarse + *quantizer code) + ****************************************************************/ + { + // build a coarse quantizer from the 2 first levels of the RQ + std::vector nbits(2); + std::copy(rq.nbits.begin(), rq.nbits.begin() + 2, nbits.begin()); + faiss::ResidualCoarseQuantizer rcq(rq.d, nbits); + + // set the coarse quantizer from the 2 first quantizers + rcq.rq.initialize_from(rq); + rcq.is_trained = true; + rcq.ntotal = (idx_t)1 << rcq.rq.tot_bits; + + // settings for exhaustive search in RCQ + rcq.centroid_norms.resize(rcq.ntotal); + rcq.aq->compute_centroid_norms(rcq.centroid_norms.data()); + rcq.beam_factor = -1.0; // use exact search + size_t nlist = rcq.ntotal; + tic(); + printf("RCQ nlist = %zd tot_bits=%zd\n", nlist, rcq.rq.tot_bits); + + // build a IVFResidualQuantizer from that + faiss::IndexIVFResidualQuantizer index( + &rcq, rcq.d, nlist, rq.nbits, faiss::METRIC_L2, rq.search_type); + index.by_residual = false; + index.rq = rq; + index.is_trained = true; + + // there are 3 ways of filling up the index... + for (std::string filled_with : {"add", "manual", "derived"}) { + tic(); + printf("filling up the index with %s, code_size=%zd\n", + filled_with.c_str(), + index.code_size); + + index.reset(); + + if (filled_with == "add") { + // standard add method + index.add(nb, xb); + } else if (filled_with == "manual") { + // compute inverted lists and add elements manually + // fill in the inverted index manually + faiss::InvertedLists& invlists = *index.invlists; + + // assign vectors to inverted lists + std::vector listnos(nb); + std::vector unused(nb); + rcq.search(nb, xb, 1, unused.data(), listnos.data()); + + // populate inverted lists + for (idx_t i = 0; i < nb; i++) { + invlists.add_entry( + listnos[i], i, &raw_codes[i * code_size]); + } + + index.ntotal = nb; + } else if (filled_with == "derived") { + // Since we have the raw codes precomputed, their prefix is the + // inverted list index, so let's use that. + faiss::InvertedLists& invlists = *index.invlists; + + // populate inverted lists + for (idx_t i = 0; i < nb; i++) { + const uint8_t* code = &raw_codes[i * code_size]; + faiss::BitstringReader rd(code, code_size); + idx_t list_no = + rd.read(rcq.rq.tot_bits); // read the list number + invlists.add_entry(list_no, i, code); + } + + index.ntotal = nb; + } + + tic(); + printf("Index filled in\n"); + + for (int nprobe : {1, 4, 16, 64, int(nlist)}) { + printf("setting nprobe=%-4d", nprobe); + + index.nprobe = nprobe; + std::vector D(k * nq); + std::vector I(k * nq); + index.search(nq, xq, k, D.data(), I.data()); + + tic(); + printf("Accuracy (intersection @ %zd): %.3f\n", + k, + accuracy(I.data())); + } + } + } + + /**************************************************************** + * Make an IVF index that uses the first 2 levels as a coarse + * quantizer, but this time does not store the code prefix from the index + ****************************************************************/ + + { + // build a coarse quantizer from the 2 first levels of the RQ + int nlevel = 2; + + std::unique_ptr index( + faiss::ivflib::ivf_residual_from_quantizer(rq, nlevel)); + + // there are 2 ways of filling up the index... 
+ for (std::string filled_with : {"add", "derived"}) { + tic(); + printf("filling up the IVF index with %s, code_size=%zd\n", + filled_with.c_str(), + index->code_size); + + index->reset(); + + if (filled_with == "add") { + // standard add method + index->add(nb, xb); + } else if (filled_with == "derived") { + faiss::ivflib::ivf_residual_add_from_flat_codes( + index.get(), nb, raw_codes.data(), rq.code_size); + } + + tic(); + printf("Index filled in\n"); + + for (int nprobe : {1, 4, 16, 64, int(index->nlist)}) { + printf("setting nprobe=%-4d", nprobe); + + index->nprobe = nprobe; + std::vector D(k * nq); + std::vector I(k * nq); + index->search(nq, xq, k, D.data(), I.data()); + + tic(); + printf("Accuracy (intersection @ %zd): %.3f\n", + k, + accuracy(I.data())); + } + } + } + + return 0; +} diff --git a/thirdparty/faiss/demos/demo_sift1M.cpp b/thirdparty/faiss/demos/demo_sift1M.cpp index aa4975f77..598565f85 100644 --- a/thirdparty/faiss/demos/demo_sift1M.cpp +++ b/thirdparty/faiss/demos/demo_sift1M.cpp @@ -140,8 +140,8 @@ int main() { assert(d == d2 || !"query does not have same dimension as train set"); } - size_t k; // nb of results per query in the GT - faiss::Index::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors + size_t k; // nb of results per query in the GT + faiss::idx_t* gt; // nq * k matrix of ground-truth nearest-neighbors { printf("[%.3f s] Loading ground truth for %ld queries\n", @@ -153,7 +153,7 @@ int main() { int* gt_int = ivecs_read("sift1M/sift_groundtruth.ivecs", &k, &nq2); assert(nq2 == nq || !"incorrect nb of ground truth entries"); - gt = new faiss::Index::idx_t[k * nq]; + gt = new faiss::idx_t[k * nq]; for (int i = 0; i < k * nq; i++) { gt[i] = gt_int[i]; } @@ -219,7 +219,7 @@ int main() { nq); // output buffers - faiss::Index::idx_t* I = new faiss::Index::idx_t[nq * k]; + faiss::idx_t* I = new faiss::idx_t[nq * k]; float* D = new float[nq * k]; index->search(nq, xq, k, D, I); diff --git a/thirdparty/faiss/demos/demo_weighted_kmeans.cpp b/thirdparty/faiss/demos/demo_weighted_kmeans.cpp index 9892fd08a..245029131 100644 --- a/thirdparty/faiss/demos/demo_weighted_kmeans.cpp +++ b/thirdparty/faiss/demos/demo_weighted_kmeans.cpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -39,13 +40,13 @@ float weighted_kmeans_clustering( switch (index_num) { case WKMT_FlatL2: - index.reset(new IndexFlatL2(d)); + index = std::make_unique(d); break; case WKMT_FlatIP: - index.reset(new IndexFlatIP(d)); + index = std::make_unique(d); break; case WKMT_FlatIP_spherical: - index.reset(new IndexFlatIP(d)); + index = std::make_unique(d); clus.spherical = true; break; case WKMT_HNSW: @@ -155,7 +156,7 @@ int main(int argc, char** argv) { faiss::IndexFlatL2 cent_index(d); cent_index.add(nc, centroids.data()); std::vector dis(n); - std::vector idx(n); + std::vector idx(n); cent_index.search( nc * 2, ccent.data(), 1, dis.data(), idx.data()); diff --git a/thirdparty/faiss/faiss/AutoTune.cpp b/thirdparty/faiss/faiss/AutoTune.cpp index 5e41aa0e0..7c663a1dd 100644 --- a/thirdparty/faiss/faiss/AutoTune.cpp +++ b/thirdparty/faiss/faiss/AutoTune.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -151,12 +152,10 @@ bool OperatingPoints::add( return false; } } - { // remove non-optimal points from array - int i = a.size() - 1; - while (i > 0) { - if (a[i].t < a[i - 1].t) - a.erase(a.begin() + (i - 1)); - i--; + // remove non-optimal points from array + for (int i = a.size() - 1; i > 0; --i) { + if (a[i].t < a[i - 1].t) { + a.erase(a.begin() + 
(i - 1)); } } return true; @@ -285,6 +284,8 @@ std::string ParameterSpace::combination_name(size_t cno) const { char buf[1000], *wp = buf; *wp = 0; for (int i = 0; i < parameter_ranges.size(); i++) { + FAISS_THROW_IF_NOT_MSG( + buf + 1000 - wp >= 0, "Overflow detected in snprintf"); const ParameterRange& pr = parameter_ranges[i]; size_t j = cno % pr.values.size(); cno /= pr.values.size(); @@ -333,7 +334,7 @@ ParameterRange& ParameterSpace::add_range(const std::string& name) { return pr; } } - parameter_ranges.push_back(ParameterRange()); + parameter_ranges.emplace_back(); parameter_ranges.back().name = name; return parameter_ranges.back(); } @@ -354,7 +355,7 @@ void ParameterSpace::initialize(const Index* index) { index = ix->index; } - if (DC(IndexIVF)) { + if (DC(IndexIVFInterface)) { { ParameterRange& pr = add_range("nprobe"); for (int i = 0; i < 13; i++) { @@ -461,6 +462,16 @@ void ParameterSpace::set_index_parameter( set_index_parameter(ix->index, name, val); return; } + if (DC(IndexShardsIVF)) { + // special handling because the nprobe is set at the sub-class level + // but other params are set on the class itself + if (name.find("quantizer_") == 0 && name != "nprobe" && + name != "quantizer_nprobe") { + std::string sub_name = name.substr(strlen("quantizer_")); + set_index_parameter(ix->quantizer, sub_name, val); + return; + } + } if (DC(ThreadedIndex)) { // call on all sub-indexes auto fn = [this, name, val](int /* no */, Index* subIndex) { @@ -523,6 +534,19 @@ void ParameterSpace::set_index_parameter( } } + if (name == "efConstruction") { + if (DC(IndexHNSW)) { + ix->hnsw.efConstruction = int(val); + return; + } + if (DC(IndexIVF)) { + if (IndexHNSW* cq = dynamic_cast(ix->quantizer)) { + cq->hnsw.efConstruction = int(val); + return; + } + } + } + if (name == "efSearch") { if (DC(IndexHNSW)) { ix->hnsw.efSearch = int(val); @@ -595,7 +619,7 @@ void ParameterSpace::explore( if (n_experiments == 0) { for (size_t cno = 0; cno < n_comb; cno++) { set_index_parameters(index, cno); - std::vector I(nq * crit.nnn); + std::vector I(nq * crit.nnn); std::vector D(nq * crit.nnn); double t0 = getmillisecs(); @@ -664,7 +688,7 @@ void ParameterSpace::explore( } set_index_parameters(index, cno); - std::vector I(nq * crit.nnn); + std::vector I(nq * crit.nnn); std::vector D(nq * crit.nnn); double t0 = getmillisecs(); @@ -675,7 +699,7 @@ void ParameterSpace::explore( do { if (thread_over_batches) { #pragma omp parallel for - for (Index::idx_t q0 = 0; q0 < nq; q0 += batchsize) { + for (idx_t q0 = 0; q0 < nq; q0 += batchsize) { size_t q1 = q0 + batchsize; if (q1 > nq) q1 = nq; diff --git a/thirdparty/faiss/faiss/AutoTune.h b/thirdparty/faiss/faiss/AutoTune.h index 58be60f96..d8c80e84a 100644 --- a/thirdparty/faiss/faiss/AutoTune.h +++ b/thirdparty/faiss/faiss/AutoTune.h @@ -11,7 +11,6 @@ #define FAISS_AUTO_TUNE_H #include -#include #include #include @@ -24,7 +23,6 @@ namespace faiss { * higher is better. 
*/ struct AutoTuneCriterion { - typedef Index::idx_t idx_t; idx_t nq; ///< nb of queries this criterion is evaluated on idx_t nnn; ///< nb of NNs that the query should request idx_t gt_nnn; ///< nb of GT NNs required to evaluate criterion diff --git a/thirdparty/faiss/faiss/Clustering.cpp b/thirdparty/faiss/faiss/Clustering.cpp index 6e15865b1..c47200294 100644 --- a/thirdparty/faiss/faiss/Clustering.cpp +++ b/thirdparty/faiss/faiss/Clustering.cpp @@ -28,20 +28,6 @@ namespace faiss { -ClusteringParameters::ClusteringParameters() - : niter(25), - nredo(1), - verbose(false), - spherical(false), - int_centroids(false), - update_index(false), - frozen_centroids(false), - min_points_per_centroid(39), - max_points_per_centroid(256), - seed(1234), - decode_block_size(32768) {} -// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k - Clustering::Clustering(int d, int k) : d(d), k(k) {} Clustering::Clustering(int d, int k, const ClusteringParameters& cp) @@ -89,8 +75,6 @@ void Clustering::train( namespace { -using idx_t = Clustering::idx_t; - idx_t subsample_training_set( const Clustering& clus, idx_t nx, @@ -234,7 +218,7 @@ int split_clusters( for (size_t ci = 0; ci < k; ci++) { if (hassign[ci] == 0) { /* need to redefine a centroid */ size_t cj; - for (cj = 0; 1; cj = (cj + 1) % k) { + for (cj = 0; true; cj = (cj + 1) % k) { /* probability to pick this cluster for split */ float p = (hassign[cj] - 1.0) / (float)(n - k); float r = rng.rand_float(); @@ -492,7 +476,7 @@ void Clustering::train_encoded( std::unique_ptr dis(new float[nx]); // remember best iteration for redo - bool lower_is_better = index.metric_type != METRIC_INNER_PRODUCT; + bool lower_is_better = !is_similarity_metric(index.metric_type); float best_obj = lower_is_better ? HUGE_VALF : -HUGE_VALF; std::vector best_iteration_stats; std::vector best_centroids; @@ -573,8 +557,12 @@ void Clustering::train_encoded( double t0s = getmillisecs(); if (!codec) { - index.assign(nx, reinterpret_cast(x), - assign.get(), dis.get()); + index.search( + nx, + reinterpret_cast(x), + 1, + dis.get(), + assign.get()); } else { // search by blocks of decode_block_size vectors size_t code_size = codec->sa_code_size(); @@ -726,7 +714,7 @@ float kmeans_clustering( const float* x, float* centroids) { Clustering clus(d, k); - clus.verbose = d * n * k > (1L << 30); + clus.verbose = d * n * k > (size_t(1) << 30); // display logs if > 1Gflop per iteration IndexFlatL2 index(d); clus.train(n, x, index); @@ -758,8 +746,6 @@ ProgressiveDimClustering::ProgressiveDimClustering( namespace { -using idx_t = Index::idx_t; - void copy_columns(idx_t n, idx_t d1, const float* src, idx_t d2, float* dest) { idx_t d = std::min(d1, d2); for (idx_t i = 0; i < n; i++) { diff --git a/thirdparty/faiss/faiss/Clustering.h b/thirdparty/faiss/faiss/Clustering.h index dbf11296f..1c171b18c 100644 --- a/thirdparty/faiss/faiss/Clustering.h +++ b/thirdparty/faiss/faiss/Clustering.h @@ -5,7 +5,7 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- +/** Implementation of k-means clustering with many variants. */ #ifndef FAISS_CLUSTERING_H #define FAISS_CLUSTERING_H @@ -35,25 +35,35 @@ extern double early_stop_threshold; * constructor of the Clustering object. */ struct ClusteringParameters { - int niter; ///< clustering iterations - int nredo; ///< redo clustering this many times and keep best - - bool verbose; - bool spherical; ///< do we want normalized centroids? 
- bool int_centroids; ///< round centroids coordinates to integer - bool update_index; ///< re-train index after each iteration? - bool frozen_centroids; ///< use the centroids provided as input and do not - ///< change them during iterations - - int min_points_per_centroid; ///< otherwise you get a warning - int max_points_per_centroid; ///< to limit size of dataset - - int seed; ///< seed for the random number generator - - size_t decode_block_size; ///< how many vectors at a time to decode - - /// sets reasonable defaults - ClusteringParameters(); + /// number of clustering iterations + int niter = 25; + /// redo clustering this many times and keep the clusters with the best + /// objective + int nredo = 1; + + bool verbose = false; + /// whether to normalize centroids after each iteration (useful for inner + /// product clustering) + bool spherical = false; + /// round centroids coordinates to integer after each iteration? + bool int_centroids = false; + /// re-train index after each iteration? + bool update_index = false; + + /// Use the subset of centroids provided as input and do not change them + /// during iterations + bool frozen_centroids = false; + /// If fewer than this number of training vectors per centroid are provided, + /// writes a warning. Note that fewer than 1 point per centroid raises an + /// exception. + int min_points_per_centroid = 39; + /// to limit size of dataset, otherwise the training set is subsampled + int max_points_per_centroid = 256; + /// seed for the random number generator + int seed = 1234; + + /// when the training set is encoded, batch size of the codec decoder + size_t decode_block_size = 32768; }; struct ClusteringIterationStats { @@ -77,7 +87,6 @@ struct ClusteringIterationStats { * */ struct Clustering : ClusteringParameters { - typedef Index::idx_t idx_t; size_t d; ///< dimension of the vectors size_t k; ///< nb of centroids @@ -140,7 +149,7 @@ struct Clustering : ClusteringParameters { * to decode the input vectors. 
* * @param codec codec used to decode the vectors (nullptr = - * vectors are in fact floats) * + * vectors are in fact floats) */ void train_encoded( idx_t nx, @@ -199,7 +208,6 @@ struct ProgressiveDimIndexFactory { * https://arxiv.org/abs/1509.05195 */ struct ProgressiveDimClustering : ProgressiveDimClusteringParameters { - using idx_t = Index::idx_t; size_t d; ///< dimension of the vectors size_t k; ///< nb of centroids diff --git a/thirdparty/faiss/faiss/FaissHook.h b/thirdparty/faiss/faiss/FaissHook.h index c01ea2a0e..5f6ad4574 100644 --- a/thirdparty/faiss/faiss/FaissHook.h +++ b/thirdparty/faiss/faiss/FaissHook.h @@ -9,13 +9,18 @@ #include #include "simd/hook.h" namespace faiss { -typedef SQDistanceComputer* (*sq_get_distance_computer_func_ptr)( + +// todo aguzhva: replace FaissHook.h with simd/hook.h + +struct IDSelector; + +typedef ScalarQuantizer::SQDistanceComputer* (*sq_get_distance_computer_func_ptr)( MetricType, - QuantizerType, + ScalarQuantizer::QuantizerType, size_t, const std::vector&); -typedef Quantizer* (*sq_sel_quantizer_func_ptr)( - QuantizerType, +typedef ScalarQuantizer::SQuantizer* (*sq_sel_quantizer_func_ptr)( + ScalarQuantizer::QuantizerType, size_t, const std::vector&); typedef InvertedListScanner* (*sq_sel_inv_list_scanner_func_ptr)( @@ -24,6 +29,7 @@ typedef InvertedListScanner* (*sq_sel_inv_list_scanner_func_ptr)( const Index*, size_t, bool, + const IDSelector*, bool); extern sq_get_distance_computer_func_ptr sq_get_distance_computer; diff --git a/thirdparty/faiss/faiss/IVFlib.cpp b/thirdparty/faiss/faiss/IVFlib.cpp index a38cecd66..d75b34531 100644 --- a/thirdparty/faiss/faiss/IVFlib.cpp +++ b/thirdparty/faiss/faiss/IVFlib.cpp @@ -5,15 +5,20 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include +#include #include +#include +#include +#include +#include #include #include #include +#include +#include #include namespace faiss { @@ -64,6 +69,10 @@ const IndexIVF* try_extract_index_ivf(const Index* index) { if (auto* idmap = dynamic_cast(index)) { index = idmap->index; } + if (auto* indep = + dynamic_cast(index)) { + index = indep->index_ivf; + } auto* ivf = dynamic_cast(index); @@ -349,6 +358,7 @@ void search_with_parameters( index_ivf->search_preassigned( n, x, k, Iq.data(), Dq.data(), distances, labels, false, params); double t3 = getmillisecs(); + if (ms_per_stage) { ms_per_stage[0] = t1 - t0; ms_per_stage[1] = t2 - t1; @@ -406,5 +416,100 @@ void range_search_with_parameters( } } +IndexIVFResidualQuantizer* ivf_residual_from_quantizer( + const ResidualQuantizer& rq, + int nlevel) { + FAISS_THROW_IF_NOT(nlevel > 0 && nlevel + 1 < rq.M); + + std::vector nbits(nlevel); + std::copy(rq.nbits.begin(), rq.nbits.begin() + nlevel, nbits.begin()); + std::unique_ptr rcq( + new ResidualCoarseQuantizer(rq.d, nbits)); + + // set the coarse quantizer from the 2 first quantizers + rcq->rq.initialize_from(rq); + rcq->is_trained = true; + rcq->ntotal = (idx_t)1 << rcq->rq.tot_bits; + + // settings for exhaustive search in RCQ + rcq->centroid_norms.resize(rcq->ntotal); + rcq->aq->compute_centroid_norms(rcq->centroid_norms.data()); + rcq->beam_factor = -1.0; // use exact search + size_t nlist = rcq->ntotal; + + // build a IVFResidualQuantizer from that + std::vector nbits_refined; + for (int i = nlevel; i < rq.M; i++) { + nbits_refined.push_back(rq.nbits[i]); + } + std::unique_ptr index( + new IndexIVFResidualQuantizer( + rcq.get(), + rq.d, + nlist, + nbits_refined, + faiss::METRIC_L2, + rq.search_type)); + index->own_fields = true; + 
rcq.release(); + index->by_residual = true; + index->rq.initialize_from(rq, nlevel); + index->is_trained = true; + + return index.release(); +} + +void ivf_residual_add_from_flat_codes( + IndexIVFResidualQuantizer* index, + size_t nb, + const uint8_t* raw_codes, + int64_t code_size) { + const ResidualCoarseQuantizer* rcq = + dynamic_cast( + index->quantizer); + FAISS_THROW_IF_NOT_MSG(rcq, "the coarse quantizer must be a RCQ"); + if (code_size < 0) { + code_size = index->code_size; + } + InvertedLists& invlists = *index->invlists; + const ResidualQuantizer& rq = index->rq; + + // populate inverted lists +#pragma omp parallel if (nb > 10000) + { + std::vector tmp_code(index->code_size); + std::vector tmp(rq.d); + int nt = omp_get_num_threads(); + int rank = omp_get_thread_num(); + +#pragma omp for + for (idx_t i = 0; i < nb; i++) { + const uint8_t* code = &raw_codes[i * code_size]; + BitstringReader rd(code, code_size); + idx_t list_no = rd.read(rcq->rq.tot_bits); + + if (list_no % nt == + rank) { // each thread takes care of 1/nt of the invlists + // copy AQ indexes one by one + BitstringWriter wr(tmp_code.data(), tmp_code.size()); + for (int j = 0; j < rq.M; j++) { + int nbit = rq.nbits[j]; + wr.write(rd.read(nbit), nbit); + } + // we need to recompute the norm + // decode first, does not use the norm component, so that's + // ok + index->rq.decode(tmp_code.data(), tmp.data(), 1); + float norm = fvec_norm_L2sqr(tmp.data(), rq.d); + wr.write(rq.encode_norm(norm), rq.norm_bits); + + // add code to the inverted list + invlists.add_entry(list_no, i, tmp_code.data()); + } + } + } + index->ntotal += nb; +} + } // namespace ivflib } // namespace faiss diff --git a/thirdparty/faiss/faiss/IVFlib.h b/thirdparty/faiss/faiss/IVFlib.h index 5375c5fe9..5524b41e2 100644 --- a/thirdparty/faiss/faiss/IVFlib.h +++ b/thirdparty/faiss/faiss/IVFlib.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #ifndef FAISS_IVFLIB_H #define FAISS_IVFLIB_H @@ -20,6 +18,11 @@ #include namespace faiss { + +struct IndexIVFResidualQuantizer; +struct IndexResidualQuantizer; +struct ResidualQuantizer; + namespace ivflib { /** check if two indexes have the same parameters and are trained in @@ -45,8 +48,6 @@ IndexIVF* try_extract_index_ivf(Index* index); */ void merge_into(Index* index0, Index* index1, bool shift_ids); -typedef Index::idx_t idx_t; - /* Returns the cluster the embeddings belong to. * * @param index Index, which should be an IVF index @@ -145,6 +146,27 @@ void range_search_with_parameters( size_t* nb_dis = nullptr, double* ms_per_stage = nullptr); +/** Build an IndexIVFResidualQuantizer from an ResidualQuantizer, using the + * nlevel first components as coarse quantizer and the rest as codes in invlists + */ +IndexIVFResidualQuantizer* ivf_residual_from_quantizer( + const ResidualQuantizer&, + int nlevel); + +/** add from codes. NB that the norm component is not used, so the code_size can + * be provided. 
+ * + * @param ivfrq index to populate with the codes + * @param codes codes to add, size (ncode, code_size) + * @param code_size override the ivfrq's code_size, useful if the norm encoding + * is different + */ +void ivf_residual_add_from_flat_codes( + IndexIVFResidualQuantizer* ivfrq, + size_t ncode, + const uint8_t* codes, + int64_t code_size = -1); + } // namespace ivflib } // namespace faiss diff --git a/thirdparty/faiss/faiss/Index.cpp b/thirdparty/faiss/faiss/Index.cpp index 737d9d540..123ec5e02 100644 --- a/thirdparty/faiss/faiss/Index.cpp +++ b/thirdparty/faiss/faiss/Index.cpp @@ -10,7 +10,9 @@ #include #include + #include +#include #include #include @@ -18,7 +20,7 @@ namespace faiss { -Index::~Index() {} +Index::~Index() = default; void Index::train(idx_t /*n*/, const float* /*x*/) { // does nothing by default @@ -29,17 +31,13 @@ void Index::range_search( const float*, float, RangeSearchResult*, - const BitsetView) const { + const SearchParameters* params) const { FAISS_THROW_MSG("range search not implemented"); } -void Index::assign(idx_t n, const float* x, idx_t* labels, float* distances) - const { - float* dis_inner = (distances == nullptr) ? new float[n] : distances; - search(n, x, 1, dis_inner, labels); - if (distances == nullptr) { - delete[] dis_inner; - } +void Index::assign(idx_t n, const float* x, idx_t* labels, idx_t k) const { + std::vector distances(n * k); + search(n, x, k, distances.data(), labels); } void Index::add_with_ids( @@ -58,7 +56,25 @@ void Index::reconstruct(idx_t, float*) const { FAISS_THROW_MSG("reconstruct not implemented for this type of index"); } +void Index::reconstruct_batch(idx_t n, const idx_t* keys, float* recons) const { + std::mutex exception_mutex; + std::string exception_string; +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + try { + reconstruct(keys[i], &recons[i * d]); + } catch (const std::exception& e) { + std::lock_guard lock(exception_mutex); + exception_string = e.what(); + } + } + if (!exception_string.empty()) { + FAISS_THROW_MSG(exception_string.c_str()); + } +} + void Index::reconstruct_n(idx_t i0, idx_t ni, float* recons) const { +#pragma omp parallel for if (ni > 1000) for (idx_t i = 0; i < ni; i++) { reconstruct(i0 + i, recons + i * d); } @@ -70,10 +86,11 @@ void Index::search_and_reconstruct( idx_t k, float* distances, idx_t* labels, - float* recons) const { + float* recons, + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); - search(n, x, k, distances, labels); + search(n, x, k, distances, labels, params); for (idx_t i = 0; i < n; ++i) { for (idx_t j = 0; j < k; ++j) { idx_t ij = i * k + j; @@ -159,4 +176,12 @@ DistanceComputer* Index::get_distance_computer() const { } } +void Index::merge_from(Index& /* otherIndex */, idx_t /* add_id */) { + FAISS_THROW_MSG("merge_from() not implemented"); +} + +void Index::check_compatible_for_merge(const Index& /* otherIndex */) const { + FAISS_THROW_MSG("check_compatible_for_merge() not implemented"); +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/Index.h b/thirdparty/faiss/faiss/Index.h index 5da0950f6..ba9b6ba7d 100644 --- a/thirdparty/faiss/faiss/Index.h +++ b/thirdparty/faiss/faiss/Index.h @@ -10,17 +10,18 @@ #ifndef FAISS_INDEX_H #define FAISS_INDEX_H +#include #include #include #include #include -#include -#include -using knowhere::BitsetView; +// #include +// using knowhere::BitsetView; + #define FAISS_VERSION_MAJOR 1 #define FAISS_VERSION_MINOR 7 -#define FAISS_VERSION_PATCH 2 +#define FAISS_VERSION_PATCH 4 /** * 
@namespace faiss @@ -40,18 +41,34 @@ using knowhere::BitsetView; namespace faiss { -/// Forward declarations see AuxIndexStructures.h +/// Forward declarations see impl/AuxIndexStructures.h, impl/IDSelector.h and +/// impl/DistanceComputer.h struct IDSelector; struct RangeSearchResult; struct DistanceComputer; +/** Parent class for the optional search paramenters. + * + * Sub-classes with additional search parameters should inherit this class. + * Ownership of the object fields is always to the caller. + */ +struct SearchParameters { + // BitsetView bitset = nullptr; + + // // Disabled for Knowhere. + /// if non-null, only these IDs will be considered during search. + IDSelector* sel = nullptr; + + /// make sure we can dynamic_cast this + virtual ~SearchParameters() {} +}; + /** Abstract structure for an index, supports adding vectors and searching them. * * All vectors provided at add or search time are 32-bit float arrays, * although the internal representation may vary. */ struct Index { - using idx_t = int64_t; ///< all indices are this type using component_t = float; using distance_t = float; @@ -113,7 +130,6 @@ struct Index { * @param x input vectors to search, size n * d * @param labels output labels of the NNs, size n*k * @param distances output pairwise distances, size n*k - * @param bitset flags to check the validity of vectors */ virtual void search( idx_t n, @@ -121,7 +137,7 @@ struct Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const = 0; + const SearchParameters* params = nullptr) const = 0; /** query n vectors of dimension d to the index. * @@ -138,7 +154,7 @@ struct Index { const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const; + const SearchParameters* params = nullptr) const; /** return the indexes of the k vectors closest to the query x. * @@ -146,11 +162,8 @@ struct Index { * @param x input vectors to search, size n * d * @param labels output labels of the NNs, size n*k */ - virtual void assign( - idx_t n, - const float* x, - idx_t* labels, - float* distances = nullptr) const; + virtual void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1) + const; /// removes all elements from the database. virtual void reset() = 0; @@ -168,6 +181,16 @@ struct Index { */ virtual void reconstruct(idx_t key, float* recons) const; + /** Reconstruct several stored vectors (or an approximation if lossy coding) + * + * this function may not be defined for some indexes + * @param n number of vectors to reconstruct + * @param keys ids of the vectors to reconstruct (size n) + * @param recons reconstucted vector (size n * d) + */ + virtual void reconstruct_batch(idx_t n, const idx_t* keys, float* recons) + const; + /** Reconstruct vectors i0 to i0 + ni - 1 * * this function may not be defined for some indexes @@ -189,7 +212,8 @@ struct Index { idx_t k, float* distances, idx_t* labels, - float* recons) const; + float* recons, + const SearchParameters* params = nullptr) const; /** Computes a residual vector after indexing encoding. * @@ -245,13 +269,24 @@ struct Index { */ virtual void sa_encode(idx_t n, const float* x, uint8_t* bytes) const; - /** encode a set of vectors + /** decode a set of vectors * * @param n number of vectors * @param bytes input encoded vectors, size n * sa_code_size() * @param x output vectors, size n * d */ virtual void sa_decode(idx_t n, const uint8_t* bytes, float* x) const; + + /** moves the entries from another dataset to self. + * On output, other is empty. 
+ * add_id is added to all moved ids + * (for sequential ids, this would be this->ntotal) */ + virtual void merge_from(Index& otherIndex, idx_t add_id = 0); + + /** check that the two indexes are compatible (ie, they are + * trained in the same way and have the same + * parameters). Otherwise throw. */ + virtual void check_compatible_for_merge(const Index& otherIndex) const; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/Index2Layer.cpp b/thirdparty/faiss/faiss/Index2Layer.cpp index aa1c34fbb..45c6680e6 100644 --- a/thirdparty/faiss/faiss/Index2Layer.cpp +++ b/thirdparty/faiss/faiss/Index2Layer.cpp @@ -10,10 +10,10 @@ #include #include -#include #include #include #include +#include #include #ifdef __SSE3__ @@ -24,6 +24,7 @@ #include #include + #include #include #include @@ -47,7 +48,7 @@ Index2Layer::Index2Layer( pq(quantizer->d, M, nbit) { is_trained = false; for (int nbyte = 0; nbyte < 7; nbyte++) { - if ((1L << (8 * nbyte)) >= nlist) { + if (((size_t)1 << (8 * nbyte)) >= nlist) { code_size_1 = nbyte; break; } @@ -60,7 +61,7 @@ Index2Layer::Index2Layer() { code_size = code_size_1 = code_size_2 = 0; } -Index2Layer::~Index2Layer() {} +Index2Layer::~Index2Layer() = default; void Index2Layer::train(idx_t n, const float* x) { if (verbose) { @@ -112,7 +113,7 @@ void Index2Layer::search( idx_t /*k*/, float* /*distances*/, idx_t* /*labels*/, - const BitsetView /*bitset*/) const { + const SearchParameters* /* params */) const { FAISS_THROW_MSG("not implemented"); } @@ -134,6 +135,10 @@ void Index2Layer::transfer_to_IVFPQ(IndexIVFPQ& other) const { other.ntotal = ntotal; } +size_t Index2Layer::cal_size() const { + return sizeof(*this) + codes.size() * sizeof(uint8_t) + pq.cal_size(); +} + namespace { struct Distance2Level : DistanceComputer { @@ -179,7 +184,7 @@ struct DistanceXPQ4 : Distance2Level { float operator()(idx_t i) override { #ifdef __SSE3__ const uint8_t* code = storage.codes.data() + i * storage.code_size; - long key = 0; + idx_t key = 0; memcpy(&key, code, storage.code_size_1); code += storage.code_size_1; @@ -225,7 +230,7 @@ struct Distance2xXPQ4 : Distance2Level { float operator()(idx_t i) override { const uint8_t* code = storage.codes.data() + i * storage.code_size; - long key01 = 0; + int64_t key01 = 0; memcpy(&key01, code, storage.code_size_1); code += storage.code_size_1; #ifdef __SSE3__ @@ -237,7 +242,7 @@ struct Distance2xXPQ4 : Distance2Level { __m128 accu = _mm_setzero_ps(); for (int mi_m = 0; mi_m < 2; mi_m++) { - long l1_idx = key01 & ((1L << mi_nbits) - 1); + int64_t l1_idx = key01 & (((int64_t)1 << mi_nbits) - 1); const __m128* pq_l1 = pq_l1_t + M_2 * l1_idx; for (int m = 0; m < M_2; m++) { @@ -283,10 +288,13 @@ DistanceComputer* Index2Layer::get_distance_computer() const { /* The standalone codec interface */ +// block size used in Index2Layer::sa_encode +int index2layer_sa_encode_bs = 32768; + void Index2Layer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const { FAISS_THROW_IF_NOT(is_trained); - idx_t bs = 32768; + idx_t bs = index2layer_sa_encode_bs; if (n > bs) { for (idx_t i0 = 0; i0 < n; i0 += bs) { idx_t i1 = std::min(i0 + bs, n); diff --git a/thirdparty/faiss/faiss/Index2Layer.h b/thirdparty/faiss/faiss/Index2Layer.h index eea02fbab..3b0499908 100644 --- a/thirdparty/faiss/faiss/Index2Layer.h +++ b/thirdparty/faiss/faiss/Index2Layer.h @@ -14,6 +14,7 @@ #include #include #include +#include namespace faiss { @@ -57,7 +58,7 @@ struct Index2Layer : IndexFlatCodes { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) 
const override; + const SearchParameters* params = nullptr) const override; DistanceComputer* get_distance_computer() const override; @@ -68,9 +69,10 @@ struct Index2Layer : IndexFlatCodes { void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; - size_t cal_size() { - return sizeof(*this) + codes.size() * sizeof(uint8_t) + pq.cal_size(); - } + size_t cal_size() const; }; +// block size used in Index2Layer::sa_encode +FAISS_API extern int index2layer_sa_encode_bs; + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp b/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp index 278c5d4df..5bf06c4a4 100644 --- a/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizer.cpp @@ -5,9 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// quiet the noise -// clang-format off - #include #include @@ -21,7 +18,6 @@ #include #include - namespace faiss { /************************************************************************************** @@ -29,17 +25,95 @@ namespace faiss { **************************************************************************************/ IndexAdditiveQuantizer::IndexAdditiveQuantizer( - idx_t d, - AdditiveQuantizer* aq, - MetricType metric): - IndexFlatCodes(aq->code_size, d, metric), aq(aq) -{ + idx_t d, + AdditiveQuantizer* aq, + MetricType metric) + : IndexFlatCodes(aq->code_size, d, metric), aq(aq) { FAISS_THROW_IF_NOT(metric == METRIC_INNER_PRODUCT || metric == METRIC_L2); } - namespace { +/************************************************************ + * DistanceComputer implementation + ************************************************************/ + +template +struct AQDistanceComputerDecompress : FlatCodesDistanceComputer { + std::vector tmp; + const AdditiveQuantizer& aq; + VectorDistance vd; + size_t d; + + AQDistanceComputerDecompress( + const IndexAdditiveQuantizer& iaq, + VectorDistance vd) + : FlatCodesDistanceComputer(iaq.codes.data(), iaq.code_size), + tmp(iaq.d * 2), + aq(*iaq.aq), + vd(vd), + d(iaq.d) {} + + const float* q; + void set_query(const float* x) final { + q = x; + } + + float symmetric_dis(idx_t i, idx_t j) final { + aq.decode(codes + i * d, tmp.data(), 1); + aq.decode(codes + j * d, tmp.data() + d, 1); + return vd(tmp.data(), tmp.data() + d); + } + + float distance_to_code(const uint8_t* code) final { + aq.decode(code, tmp.data(), 1); + return vd(q, tmp.data()); + } + + virtual ~AQDistanceComputerDecompress() = default; +}; + +template +struct AQDistanceComputerLUT : FlatCodesDistanceComputer { + std::vector LUT; + const AdditiveQuantizer& aq; + size_t d; + + explicit AQDistanceComputerLUT(const IndexAdditiveQuantizer& iaq) + : FlatCodesDistanceComputer(iaq.codes.data(), iaq.code_size), + LUT(iaq.aq->total_codebook_size + iaq.d * 2), + aq(*iaq.aq), + d(iaq.d) {} + + float bias; + void set_query(const float* x) final { + // this is quite sub-optimal for multiple queries + aq.compute_LUT(1, x, LUT.data()); + if (is_IP) { + bias = 0; + } else { + bias = fvec_norm_L2sqr(x, d); + } + } + + float symmetric_dis(idx_t i, idx_t j) final { + float* tmp = LUT.data(); + aq.decode(codes + i * d, tmp, 1); + aq.decode(codes + j * d, tmp + d, 1); + return fvec_L2sqr(tmp, tmp + d, d); + } + + float distance_to_code(const uint8_t* code) final { + return bias + aq.compute_1_distance_LUT(code, LUT.data()); + } + + virtual ~AQDistanceComputerLUT() = default; +}; + 
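// A minimal usage sketch for the two distance computers above, assuming a
// trained IndexAdditiveQuantizer `iaq` and a query vector `q` of dimension
// iaq.d; `iaq` and `q` are hypothetical names, the calls are the ones this
// diff introduces or relies on:
//
//   std::unique_ptr<faiss::FlatCodesDistanceComputer> dc(
//           iaq.get_FlatCodesDistanceComputer());
//   dc->set_query(q);                     // register the query vector
//   float d0 = (*dc)(0);                  // distance from q to stored vector 0
//   float d01 = dc->symmetric_dis(0, 1);  // distance between two stored codes
//
// Whether the decompress or the LUT variant is returned depends on
// aq->search_type, as dispatched in get_FlatCodesDistanceComputer() below.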
+/************************************************************ + * scanning implementation for search + ************************************************************/ + template void search_with_decompress( const IndexAdditiveQuantizer& ir, @@ -49,11 +123,11 @@ void search_with_decompress( const uint8_t* codes = ir.codes.data(); size_t ntotal = ir.ntotal; size_t code_size = ir.code_size; - const AdditiveQuantizer *aq = ir.aq; + const AdditiveQuantizer* aq = ir.aq; using SingleResultHandler = typename ResultHandler::SingleResultHandler; -#pragma omp parallel for if(res.nq > 100) +#pragma omp parallel for if (res.nq > 100) for (int64_t q = 0; q < res.nq; q++) { SingleResultHandler resi(res); resi.begin(q); @@ -68,13 +142,12 @@ void search_with_decompress( } } -template +template void search_with_LUT( const IndexAdditiveQuantizer& ir, const float* xq, - ResultHandler& res) -{ - const AdditiveQuantizer & aq = *ir.aq; + ResultHandler& res) { + const AdditiveQuantizer& aq = *ir.aq; const uint8_t* codes = ir.codes.data(); size_t ntotal = ir.ntotal; size_t code_size = aq.code_size; @@ -82,42 +155,88 @@ void search_with_LUT( size_t d = ir.d; using SingleResultHandler = typename ResultHandler::SingleResultHandler; - std::unique_ptr LUT(new float[nq * aq.total_codebook_size]); + std::unique_ptr LUT(new float[nq * aq.total_codebook_size]); aq.compute_LUT(nq, xq, LUT.get()); -#pragma omp parallel for if(nq > 100) +#pragma omp parallel for if (nq > 100) for (int64_t q = 0; q < nq; q++) { SingleResultHandler resi(res); resi.begin(q); std::vector tmp(aq.d); - const float *LUT_q = LUT.get() + aq.total_codebook_size * q; + const float* LUT_q = LUT.get() + aq.total_codebook_size * q; float bias = 0; - if (!is_IP) { // the LUT function returns ||y||^2 - 2 * , need to add ||x||^2 + if (!is_IP) { // the LUT function returns ||y||^2 - 2 * , need to + // add ||x||^2 bias = fvec_norm_L2sqr(xq + q * d, d); } for (size_t i = 0; i < ntotal; i++) { float dis = aq.compute_1_distance_LUT( - codes + i * code_size, - LUT_q - ); + codes + i * code_size, LUT_q); resi.add_result(dis + bias, i); } resi.end(); } - } - } // anonymous namespace +FlatCodesDistanceComputer* IndexAdditiveQuantizer:: + get_FlatCodesDistanceComputer() const { + if (aq->search_type == AdditiveQuantizer::ST_decompress) { + if (metric_type == METRIC_L2) { + using VD = VectorDistance; + VD vd = {size_t(d), metric_arg}; + return new AQDistanceComputerDecompress(*this, vd); + } else if (metric_type == METRIC_INNER_PRODUCT) { + using VD = VectorDistance; + VD vd = {size_t(d), metric_arg}; + return new AQDistanceComputerDecompress(*this, vd); + } else { + FAISS_THROW_MSG("unsupported metric"); + } + } else { + if (metric_type == METRIC_INNER_PRODUCT) { + return new AQDistanceComputerLUT< + true, + AdditiveQuantizer::ST_LUT_nonorm>(*this); + } else { + switch (aq->search_type) { +#define DISPATCH(st) \ + case AdditiveQuantizer::st: \ + return new AQDistanceComputerLUT(*this); \ + break; + DISPATCH(ST_norm_float) + DISPATCH(ST_LUT_nonorm) + DISPATCH(ST_norm_qint8) + DISPATCH(ST_norm_qint4) + DISPATCH(ST_norm_cqint4) + case AdditiveQuantizer::ST_norm_cqint8: + case AdditiveQuantizer::ST_norm_lsq2x4: + case AdditiveQuantizer::ST_norm_rq2x4: + return new AQDistanceComputerLUT< + false, + AdditiveQuantizer::ST_norm_cqint8>(*this); + break; +#undef DISPATCH + default: + FAISS_THROW_FMT( + "search type %d not supported", aq->search_type); + } + } + } +} + void IndexAdditiveQuantizer::search( idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - const 
BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + if (aq->search_type == AdditiveQuantizer::ST_decompress) { if (metric_type == METRIC_L2) { using VD = VectorDistance; @@ -132,42 +251,46 @@ void IndexAdditiveQuantizer::search( } } else { if (metric_type == METRIC_INNER_PRODUCT) { - HeapResultHandler > rh(n, distances, labels, k); - search_with_LUT (*this, x, rh); + HeapResultHandler> rh(n, distances, labels, k); + search_with_LUT( + *this, x, rh); } else { - HeapResultHandler > rh(n, distances, labels, k); - - if (aq->search_type == AdditiveQuantizer::ST_norm_float) { - search_with_LUT (*this, x, rh); - } else if (aq->search_type == AdditiveQuantizer::ST_LUT_nonorm) { - search_with_LUT (*this, x, rh); - } else if (aq->search_type == AdditiveQuantizer::ST_norm_qint8) { - search_with_LUT (*this, x, rh); - } else if (aq->search_type == AdditiveQuantizer::ST_norm_qint4) { - search_with_LUT (*this, x, rh); - } else if (aq->search_type == AdditiveQuantizer::ST_norm_cqint8) { - search_with_LUT (*this, x, rh); - } else if (aq->search_type == AdditiveQuantizer::ST_norm_cqint4) { - search_with_LUT (*this, x, rh); - } else { - FAISS_THROW_FMT("search type %d not supported", aq->search_type); + HeapResultHandler> rh(n, distances, labels, k); + switch (aq->search_type) { +#define DISPATCH(st) \ + case AdditiveQuantizer::st: \ + search_with_LUT(*this, x, rh); \ + break; + DISPATCH(ST_norm_float) + DISPATCH(ST_LUT_nonorm) + DISPATCH(ST_norm_qint8) + DISPATCH(ST_norm_qint4) + DISPATCH(ST_norm_cqint4) + case AdditiveQuantizer::ST_norm_cqint8: + case AdditiveQuantizer::ST_norm_lsq2x4: + case AdditiveQuantizer::ST_norm_rq2x4: + search_with_LUT( + *this, x, rh); + break; +#undef DISPATCH + default: + FAISS_THROW_FMT( + "search type %d not supported", aq->search_type); } } - } } -void IndexAdditiveQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) const { +void IndexAdditiveQuantizer::sa_encode(idx_t n, const float* x, uint8_t* bytes) + const { return aq->compute_codes(x, bytes, n); } -void IndexAdditiveQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { +void IndexAdditiveQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) + const { return aq->decode(bytes, x, n); } - - - /************************************************************************************** * IndexResidualQuantizer **************************************************************************************/ @@ -178,8 +301,11 @@ IndexResidualQuantizer::IndexResidualQuantizer( size_t nbits, ///< number of bit per subvector index MetricType metric, Search_type_t search_type) - : IndexResidualQuantizer(d, std::vector(M, nbits), metric, search_type) { -} + : IndexResidualQuantizer( + d, + std::vector(M, nbits), + metric, + search_type) {} IndexResidualQuantizer::IndexResidualQuantizer( int d, @@ -191,14 +317,14 @@ IndexResidualQuantizer::IndexResidualQuantizer( is_trained = false; } -IndexResidualQuantizer::IndexResidualQuantizer() : IndexResidualQuantizer(0, 0, 0) {} +IndexResidualQuantizer::IndexResidualQuantizer() + : IndexResidualQuantizer(0, 0, 0) {} void IndexResidualQuantizer::train(idx_t n, const float* x) { rq.train(n, x); is_trained = true; } - /************************************************************************************** * IndexLocalSearchQuantizer **************************************************************************************/ @@ -209,28 +335,79 @@ 
IndexLocalSearchQuantizer::IndexLocalSearchQuantizer( size_t nbits, ///< number of bit per subvector index MetricType metric, Search_type_t search_type) - : IndexAdditiveQuantizer(d, &lsq, metric), lsq(d, M, nbits, search_type) { + : IndexAdditiveQuantizer(d, &lsq, metric), + lsq(d, M, nbits, search_type) { code_size = lsq.code_size; is_trained = false; } -IndexLocalSearchQuantizer::IndexLocalSearchQuantizer() : IndexLocalSearchQuantizer(0, 0, 0) {} +IndexLocalSearchQuantizer::IndexLocalSearchQuantizer() + : IndexLocalSearchQuantizer(0, 0, 0) {} void IndexLocalSearchQuantizer::train(idx_t n, const float* x) { lsq.train(n, x); is_trained = true; } +/************************************************************************************** + * IndexProductResidualQuantizer + **************************************************************************************/ + +IndexProductResidualQuantizer::IndexProductResidualQuantizer( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of residual quantizers + size_t Msub, ///< number of subquantizers per RQ + size_t nbits, ///< number of bit per subvector index + MetricType metric, + Search_type_t search_type) + : IndexAdditiveQuantizer(d, &prq, metric), + prq(d, nsplits, Msub, nbits, search_type) { + code_size = prq.code_size; + is_trained = false; +} + +IndexProductResidualQuantizer::IndexProductResidualQuantizer() + : IndexProductResidualQuantizer(0, 0, 0, 0) {} + +void IndexProductResidualQuantizer::train(idx_t n, const float* x) { + prq.train(n, x); + is_trained = true; +} + +/************************************************************************************** + * IndexProductLocalSearchQuantizer + **************************************************************************************/ + +IndexProductLocalSearchQuantizer::IndexProductLocalSearchQuantizer( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of local search quantizers + size_t Msub, ///< number of subquantizers per LSQ + size_t nbits, ///< number of bit per subvector index + MetricType metric, + Search_type_t search_type) + : IndexAdditiveQuantizer(d, &plsq, metric), + plsq(d, nsplits, Msub, nbits, search_type) { + code_size = plsq.code_size; + is_trained = false; +} + +IndexProductLocalSearchQuantizer::IndexProductLocalSearchQuantizer() + : IndexProductLocalSearchQuantizer(0, 0, 0, 0) {} + +void IndexProductLocalSearchQuantizer::train(idx_t n, const float* x) { + plsq.train(n, x); + is_trained = true; +} + /************************************************************************************** * AdditiveCoarseQuantizer **************************************************************************************/ AdditiveCoarseQuantizer::AdditiveCoarseQuantizer( - idx_t d, - AdditiveQuantizer* aq, - MetricType metric): - Index(d, metric), aq(aq) -{} + idx_t d, + AdditiveQuantizer* aq, + MetricType metric) + : Index(d, metric), aq(aq) {} void AdditiveCoarseQuantizer::add(idx_t, const float*) { FAISS_THROW_MSG("not applicable"); @@ -244,18 +421,25 @@ void AdditiveCoarseQuantizer::reset() { FAISS_THROW_MSG("not applicable"); } - void AdditiveCoarseQuantizer::train(idx_t n, const float* x) { if (verbose) { - printf("AdditiveCoarseQuantizer::train: training on %zd vectors\n", size_t(n)); + printf("AdditiveCoarseQuantizer::train: training on %zd vectors\n", + size_t(n)); } + size_t norms_size = sizeof(float) << aq->tot_bits; + + FAISS_THROW_IF_NOT_MSG( + norms_size <= aq->max_mem_distances, + "the RCQ norms matrix will become too large, 
please reduce the number of quantization steps"); + aq->train(n, x); is_trained = true; ntotal = (idx_t)1 << aq->tot_bits; if (metric_type == METRIC_L2) { if (verbose) { - printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", size_t(ntotal)); + printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", + size_t(ntotal)); } // this is not necessary for the residualcoarsequantizer when // using beam search. We'll see if the memory overhead is too high @@ -270,13 +454,15 @@ void AdditiveCoarseQuantizer::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + if (metric_type == METRIC_INNER_PRODUCT) { aq->knn_centroids_inner_product(n, x, k, distances, labels); } else if (metric_type == METRIC_L2) { FAISS_THROW_IF_NOT(centroid_norms.size() == ntotal); - aq->knn_centroids_L2( - n, x, k, distances, labels, centroid_norms.data()); + aq->knn_centroids_L2(n, x, k, distances, labels, centroid_norms.data()); } } @@ -285,10 +471,10 @@ void AdditiveCoarseQuantizer::search( **************************************************************************************/ ResidualCoarseQuantizer::ResidualCoarseQuantizer( - int d, ///< dimensionality of the input vectors + int d, ///< dimensionality of the input vectors const std::vector& nbits, MetricType metric) - : AdditiveCoarseQuantizer(d, &rq, metric), rq(d, nbits), beam_factor(4.0) { + : AdditiveCoarseQuantizer(d, &rq, metric), rq(d, nbits) { FAISS_THROW_IF_NOT(rq.tot_bits <= 63); is_trained = false; } @@ -300,21 +486,30 @@ ResidualCoarseQuantizer::ResidualCoarseQuantizer( MetricType metric) : ResidualCoarseQuantizer(d, std::vector(M, nbits), metric) {} -ResidualCoarseQuantizer::ResidualCoarseQuantizer(): ResidualCoarseQuantizer(0, 0, 0) {} - - +ResidualCoarseQuantizer::ResidualCoarseQuantizer() + : ResidualCoarseQuantizer(0, 0, 0) {} void ResidualCoarseQuantizer::set_beam_factor(float new_beam_factor) { beam_factor = new_beam_factor; if (new_beam_factor > 0) { FAISS_THROW_IF_NOT(new_beam_factor >= 1.0); + if (rq.codebook_cross_products.size() == 0) { + rq.compute_codebook_tables(); + } return; - } else if (metric_type == METRIC_L2 && ntotal != centroid_norms.size()) { - if (verbose) { - printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", size_t(ntotal)); + } else { + // new_beam_factor = -1: exhaustive computation. + // Does not use the cross_products + rq.codebook_cross_products.resize(0); + // but the centroid norms are necessary! 
+ if (metric_type == METRIC_L2 && ntotal != centroid_norms.size()) { + if (verbose) { + printf("AdditiveCoarseQuantizer::train: computing centroid norms for %zd centroids\n", + size_t(ntotal)); + } + centroid_norms.resize(ntotal); + aq->compute_centroid_norms(centroid_norms.data()); } - centroid_norms.resize(ntotal); - aq->compute_centroid_norms(centroid_norms.data()); } } @@ -324,9 +519,20 @@ void ResidualCoarseQuantizer::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params_in) const { + float beam_factor = this->beam_factor; + if (params_in) { + auto params = + dynamic_cast( + params_in); + FAISS_THROW_IF_NOT_MSG( + params, + "need SearchParametersResidualCoarseQuantizer parameters"); + beam_factor = params->beam_factor; + } + if (beam_factor < 0) { - AdditiveCoarseQuantizer::search(n, x, k, distances, labels, bitset); + AdditiveCoarseQuantizer::search(n, x, k, distances, labels); return; } @@ -354,7 +560,12 @@ void ResidualCoarseQuantizer::search( } for (idx_t i0 = 0; i0 < n; i0 += bs) { idx_t i1 = std::min(n, i0 + bs); - search(i1 - i0, x + i0 * d, k, distances + i0 * k, labels + i0 * k, bitset); + search(i1 - i0, + x + i0 * d, + k, + distances + i0 * k, + labels + i0 * k, + params_in); InterruptCallback::check(); } return; @@ -366,6 +577,7 @@ void ResidualCoarseQuantizer::search( rq.refine_beam( n, 1, x, beam_size, codes.data(), nullptr, beam_distances.data()); + // pack int32 table #pragma omp parallel for if (n > 4000) for (idx_t i = 0; i < n; i++) { memcpy(distances + i * k, @@ -385,6 +597,15 @@ void ResidualCoarseQuantizer::search( } } +void ResidualCoarseQuantizer::initialize_from( + const ResidualCoarseQuantizer& other) { + FAISS_THROW_IF_NOT(rq.M <= other.rq.M); + rq.initialize_from(other.rq); + set_beam_factor(other.beam_factor); + is_trained = other.is_trained; + ntotal = (idx_t)1 << aq->tot_bits; +} + /************************************************************************************** * LocalSearchCoarseQuantizer **************************************************************************************/ @@ -399,12 +620,8 @@ LocalSearchCoarseQuantizer::LocalSearchCoarseQuantizer( is_trained = false; } - LocalSearchCoarseQuantizer::LocalSearchCoarseQuantizer() { aq = &lsq; } - - - } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizer.h b/thirdparty/faiss/faiss/IndexAdditiveQuantizer.h index de94c010e..48f54c8b3 100644 --- a/thirdparty/faiss/faiss/IndexAdditiveQuantizer.h +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizer.h @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -28,8 +29,8 @@ struct IndexAdditiveQuantizer : IndexFlatCodes { using Search_type_t = AdditiveQuantizer::Search_type_t; explicit IndexAdditiveQuantizer( - idx_t d = 0, - AdditiveQuantizer* aq = nullptr, + idx_t d, + AdditiveQuantizer* aq, MetricType metric = METRIC_L2); void search( @@ -38,12 +39,14 @@ struct IndexAdditiveQuantizer : IndexFlatCodes { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /* The standalone codec interface */ void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + + FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; }; /** Index based on a residual quantizer. 
Stored vectors are @@ -99,6 +102,58 @@ struct IndexLocalSearchQuantizer : IndexAdditiveQuantizer { void train(idx_t n, const float* x) override; }; +/** Index based on a product residual quantizer. + */ +struct IndexProductResidualQuantizer : IndexAdditiveQuantizer { + /// The product residual quantizer used to encode the vectors + ProductResidualQuantizer prq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param nsplits number of residual quantizers + * @param Msub number of subquantizers per RQ + * @param nbits number of bit per subvector index + */ + IndexProductResidualQuantizer( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of residual quantizers + size_t Msub, ///< number of subquantizers per RQ + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_decompress); + + IndexProductResidualQuantizer(); + + void train(idx_t n, const float* x) override; +}; + +/** Index based on a product local search quantizer. + */ +struct IndexProductLocalSearchQuantizer : IndexAdditiveQuantizer { + /// The product local search quantizer used to encode the vectors + ProductLocalSearchQuantizer plsq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param nsplits number of local search quantizers + * @param Msub number of subquantizers per LSQ + * @param nbits number of bit per subvector index + */ + IndexProductLocalSearchQuantizer( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of local search quantizers + size_t Msub, ///< number of subquantizers per LSQ + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_decompress); + + IndexProductLocalSearchQuantizer(); + + void train(idx_t n, const float* x) override; +}; + /** A "virtual" index where the elements are the residual quantizer centroids. * * Intended for use as a coarse quantizer in an IndexIVF. @@ -123,7 +178,7 @@ struct AdditiveCoarseQuantizer : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, float* recons) const override; void train(idx_t n, const float* x) override; @@ -132,6 +187,11 @@ struct AdditiveCoarseQuantizer : Index { void reset() override; }; +struct SearchParametersResidualCoarseQuantizer : SearchParameters { + float beam_factor = 4.0f; + ~SearchParametersResidualCoarseQuantizer() {} +}; + /** The ResidualCoarseQuantizer is a bit specialized compared to the * default AdditiveCoarseQuantizer because it can use a beam search * at search time (slow but may be useful for very large vocabularies) */ @@ -141,7 +201,7 @@ struct ResidualCoarseQuantizer : AdditiveCoarseQuantizer { /// factor between the beam size and the search k /// if negative, use exact search-to-centroid - float beam_factor; + float beam_factor = 4.0f; /// computes centroid norms if required void set_beam_factor(float new_beam_factor); @@ -169,7 +229,11 @@ struct ResidualCoarseQuantizer : AdditiveCoarseQuantizer { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; + + /** Copy the M first codebook levels from other. Useful to crop a + * ResidualQuantizer to its first M quantizers. 
*/ + void initialize_from(const ResidualCoarseQuantizer& other); ResidualCoarseQuantizer(); }; diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp new file mode 100644 index 000000000..709ccc87e --- /dev/null +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.cpp @@ -0,0 +1,299 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { + +inline size_t roundup(size_t a, size_t b) { + return (a + b - 1) / b * b; +} + +IndexAdditiveQuantizerFastScan::IndexAdditiveQuantizerFastScan( + AdditiveQuantizer* aq, + MetricType metric, + int bbs) { + init(aq, metric, bbs); +} + +void IndexAdditiveQuantizerFastScan::init( + AdditiveQuantizer* aq, + MetricType metric, + int bbs) { + FAISS_THROW_IF_NOT(aq != nullptr); + FAISS_THROW_IF_NOT(!aq->nbits.empty()); + FAISS_THROW_IF_NOT(aq->nbits[0] == 4); + if (metric == METRIC_INNER_PRODUCT) { + FAISS_THROW_IF_NOT_MSG( + aq->search_type == AdditiveQuantizer::ST_LUT_nonorm, + "Search type must be ST_LUT_nonorm for IP metric"); + } else { + FAISS_THROW_IF_NOT_MSG( + aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || + aq->search_type == AdditiveQuantizer::ST_norm_rq2x4, + "Search type must be lsq2x4 or rq2x4 for L2 metric"); + } + + this->aq = aq; + if (metric == METRIC_L2) { + M = aq->M + 2; // 2x4 bits AQ + } else { + M = aq->M; + } + init_fastscan(aq->d, M, 4, metric, bbs); + + max_train_points = 1024 * ksub * M; +} + +IndexAdditiveQuantizerFastScan::IndexAdditiveQuantizerFastScan() + : IndexFastScan() { + is_trained = false; + aq = nullptr; +} + +IndexAdditiveQuantizerFastScan::IndexAdditiveQuantizerFastScan( + const IndexAdditiveQuantizer& orig, + int bbs) { + init(orig.aq, orig.metric_type, bbs); + + ntotal = orig.ntotal; + is_trained = orig.is_trained; + orig_codes = orig.codes.data(); + + ntotal2 = roundup(ntotal, bbs); + codes.resize(ntotal2 * M2 / 2); + pq4_pack_codes(orig_codes, ntotal, M, ntotal2, bbs, M2, codes.get()); +} + +IndexAdditiveQuantizerFastScan::~IndexAdditiveQuantizerFastScan() = default; + +void IndexAdditiveQuantizerFastScan::train(idx_t n, const float* x_in) { + if (is_trained) { + return; + } + + const int seed = 0x12345; + size_t nt = n; + const float* x = fvecs_maybe_subsample( + d, &nt, max_train_points, x_in, verbose, seed); + n = nt; + if (verbose) { + printf("training additive quantizer on %zd vectors\n", nt); + } + + aq->verbose = verbose; + aq->train(n, x); + if (metric_type == METRIC_L2) { + estimate_norm_scale(n, x); + } + + is_trained = true; +} + +void IndexAdditiveQuantizerFastScan::estimate_norm_scale( + idx_t n, + const float* x_in) { + FAISS_THROW_IF_NOT(metric_type == METRIC_L2); + + constexpr int seed = 0x980903; + constexpr size_t max_points_estimated = 65536; + size_t ns = n; + const float* x = fvecs_maybe_subsample( + d, &ns, max_points_estimated, x_in, verbose, seed); + n = ns; + std::unique_ptr del_x; + if (x != x_in) { + del_x.reset((float*)x); + } + + std::vector dis_tables(n * M * ksub); + compute_float_LUT(dis_tables.data(), n, x); + + // here we compute the mean of scales for each query + // TODO: try max of scales + double scale = 0; + +#pragma omp parallel for reduction(+ : scale) + for (idx_t i = 0; i < n; i++) { + const float* 
lut = dis_tables.data() + i * M * ksub; + scale += quantize_lut::aq_estimate_norm_scale(M, ksub, 2, lut); + } + scale /= n; + norm_scale = (int)std::roundf(std::max(scale, 1.0)); + + if (verbose) { + printf("estimated norm scale: %lf\n", scale); + printf("rounded norm scale: %d\n", norm_scale); + } +} + +void IndexAdditiveQuantizerFastScan::compute_codes( + uint8_t* tmp_codes, + idx_t n, + const float* x) const { + aq->compute_codes(x, tmp_codes, n); +} + +void IndexAdditiveQuantizerFastScan::compute_float_LUT( + float* lut, + idx_t n, + const float* x) const { + if (metric_type == METRIC_INNER_PRODUCT) { + aq->compute_LUT(n, x, lut, 1.0f); + } else { + // compute inner product look-up tables + const size_t ip_dim12 = aq->M * ksub; + const size_t norm_dim12 = 2 * ksub; + std::vector ip_lut(n * ip_dim12); + aq->compute_LUT(n, x, ip_lut.data(), -2.0f); + + // copy and rescale norm look-up tables + auto norm_tabs = aq->norm_tabs; + if (rescale_norm && norm_scale > 1 && metric_type == METRIC_L2) { + for (size_t i = 0; i < norm_tabs.size(); i++) { + norm_tabs[i] /= norm_scale; + } + } + const float* norm_lut = norm_tabs.data(); + FAISS_THROW_IF_NOT(norm_tabs.size() == norm_dim12); + + // combine them + for (idx_t i = 0; i < n; i++) { + memcpy(lut, ip_lut.data() + i * ip_dim12, ip_dim12 * sizeof(*lut)); + lut += ip_dim12; + memcpy(lut, norm_lut, norm_dim12 * sizeof(*lut)); + lut += norm_dim12; + } + } +} + +void IndexAdditiveQuantizerFastScan::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + FAISS_THROW_IF_NOT(k > 0); + bool rescale = (rescale_norm && norm_scale > 1 && metric_type == METRIC_L2); + if (!rescale) { + IndexFastScan::search(n, x, k, distances, labels); + return; + } + + NormTableScaler scaler(norm_scale); + if (metric_type == METRIC_L2) { + search_dispatch_implem(n, x, k, distances, labels, scaler); + } else { + search_dispatch_implem(n, x, k, distances, labels, scaler); + } +} + +void IndexAdditiveQuantizerFastScan::sa_decode( + idx_t n, + const uint8_t* bytes, + float* x) const { + aq->decode(bytes, x, n); +} + +/************************************************************************************** + * IndexResidualQuantizerFastScan + **************************************************************************************/ + +IndexResidualQuantizerFastScan::IndexResidualQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t M, ///< number of subquantizers + size_t nbits, ///< number of bit per subvector index + MetricType metric, + Search_type_t search_type, + int bbs) + : rq(d, M, nbits, search_type) { + init(&rq, metric, bbs); +} + +IndexResidualQuantizerFastScan::IndexResidualQuantizerFastScan() { + aq = &rq; +} + +/************************************************************************************** + * IndexLocalSearchQuantizerFastScan + **************************************************************************************/ + +IndexLocalSearchQuantizerFastScan::IndexLocalSearchQuantizerFastScan( + int d, + size_t M, ///< number of subquantizers + size_t nbits, ///< number of bit per subvector index + MetricType metric, + Search_type_t search_type, + int bbs) + : lsq(d, M, nbits, search_type) { + init(&lsq, metric, bbs); +} + +IndexLocalSearchQuantizerFastScan::IndexLocalSearchQuantizerFastScan() { + aq = &lsq; +} + 
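// A minimal usage sketch for the 4-bit fast-scan indexes defined in this
// file, assuming hypothetical buffers xt/xb/xq holding nt/nb/nq vectors of
// dimension 64; nbits must be 4, which init() above enforces:
//
//   faiss::IndexResidualQuantizerFastScan index(64, /*M=*/8, /*nbits=*/4);
//   index.train(nt, xt);   // trains the RQ and estimates the norm scale
//   index.add(nb, xb);     // codes are packed in blocks of size bbs
//   std::vector<float> D(nq * 10);
//   std::vector<faiss::idx_t> I(nq * 10);
//   index.search(nq, xq, 10, D.data(), I.data());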
+/************************************************************************************** + * IndexProductResidualQuantizerFastScan + **************************************************************************************/ + +IndexProductResidualQuantizerFastScan::IndexProductResidualQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of residual quantizers + size_t Msub, ///< number of subquantizers per RQ + size_t nbits, ///< number of bit per subvector index + MetricType metric, + Search_type_t search_type, + int bbs) + : prq(d, nsplits, Msub, nbits, search_type) { + init(&prq, metric, bbs); +} + +IndexProductResidualQuantizerFastScan::IndexProductResidualQuantizerFastScan() { + aq = &prq; +} + +/************************************************************************************** + * IndexProductLocalSearchQuantizerFastScan + **************************************************************************************/ + +IndexProductLocalSearchQuantizerFastScan:: + IndexProductLocalSearchQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of local search quantizers + size_t Msub, ///< number of subquantizers per LSQ + size_t nbits, ///< number of bit per subvector index + MetricType metric, + Search_type_t search_type, + int bbs) + : plsq(d, nsplits, Msub, nbits, search_type) { + init(&plsq, metric, bbs); +} + +IndexProductLocalSearchQuantizerFastScan:: + IndexProductLocalSearchQuantizerFastScan() { + aq = &plsq; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h new file mode 100644 index 000000000..d7d23336f --- /dev/null +++ b/thirdparty/faiss/faiss/IndexAdditiveQuantizerFastScan.h @@ -0,0 +1,199 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace faiss { + +/** Fast scan version of IndexAQ. Works for 4-bit AQ for now. + * + * The codes are not stored sequentially but grouped in blocks of size bbs. + * This makes it possible to compute distances quickly with SIMD instructions. 
+ * + * Implementations: + * 12: blocked loop with internal loop on Q with qbs + * 13: same with reservoir accumulator to store results + * 14: no qbs with heap accumulator + * 15: no qbs with reservoir accumulator + */ + +struct IndexAdditiveQuantizerFastScan : IndexFastScan { + AdditiveQuantizer* aq; + using Search_type_t = AdditiveQuantizer::Search_type_t; + + bool rescale_norm = true; + int norm_scale = 1; + + // max number of training vectors + size_t max_train_points = 0; + + explicit IndexAdditiveQuantizerFastScan( + AdditiveQuantizer* aq, + MetricType metric = METRIC_L2, + int bbs = 32); + + void init( + AdditiveQuantizer* aq, + MetricType metric = METRIC_L2, + int bbs = 32); + + IndexAdditiveQuantizerFastScan(); + + ~IndexAdditiveQuantizerFastScan() override; + + /// build from an existing IndexAQ + explicit IndexAdditiveQuantizerFastScan( + const IndexAdditiveQuantizer& orig, + int bbs = 32); + + void train(idx_t n, const float* x) override; + + void estimate_norm_scale(idx_t n, const float* x); + + void compute_codes(uint8_t* codes, idx_t n, const float* x) const override; + + void compute_float_LUT(float* lut, idx_t n, const float* x) const override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + /** Decode a set of vectors. + * + * NOTE: The codes in the IndexAdditiveQuantizerFastScan object are non- + * contiguous. But this method requires a contiguous representation. + * + * @param n number of vectors + * @param bytes input encoded vectors, size n * code_size + * @param x output vectors, size n * d + */ + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; +}; + +/** Index based on a residual quantizer. Stored vectors are + * approximated by residual quantization codes. + * Can also be used as a codec + */ +struct IndexResidualQuantizerFastScan : IndexAdditiveQuantizerFastScan { + /// The residual quantizer used to encode the vectors + ResidualQuantizer rq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param M number of subquantizers + * @param nbits number of bit per subvector index + * @param metric metric type + * @param search_type AQ search type + */ + IndexResidualQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t M, ///< number of subquantizers + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_rq2x4, + int bbs = 32); + + IndexResidualQuantizerFastScan(); +}; + +/** Index based on a local search quantizer. Stored vectors are + * approximated by local search quantization codes. + * Can also be used as a codec + */ +struct IndexLocalSearchQuantizerFastScan : IndexAdditiveQuantizerFastScan { + LocalSearchQuantizer lsq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param M number of subquantizers + * @param nbits number of bit per subvector index + * @param metric metric type + * @param search_type AQ search type + */ + IndexLocalSearchQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t M, ///< number of subquantizers + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_lsq2x4, + int bbs = 32); + + IndexLocalSearchQuantizerFastScan(); +}; + +/** Index based on a product residual quantizer. 
Stored vectors are + * approximated by product residual quantization codes. + * Can also be used as a codec + */ +struct IndexProductResidualQuantizerFastScan : IndexAdditiveQuantizerFastScan { + /// The product residual quantizer used to encode the vectors + ProductResidualQuantizer prq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param nsplits number of residual quantizers + * @param Msub number of subquantizers per RQ + * @param nbits number of bit per subvector index + * @param metric metric type + * @param search_type AQ search type + */ + IndexProductResidualQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of residual quantizers + size_t Msub, ///< number of subquantizers per RQ + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_rq2x4, + int bbs = 32); + + IndexProductResidualQuantizerFastScan(); +}; + +/** Index based on a product local search quantizer. Stored vectors are + * approximated by product local search quantization codes. + * Can also be used as a codec + */ +struct IndexProductLocalSearchQuantizerFastScan + : IndexAdditiveQuantizerFastScan { + /// The product local search quantizer used to encode the vectors + ProductLocalSearchQuantizer plsq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param nsplits number of local search quantizers + * @param Msub number of subquantizers per LSQ + * @param nbits number of bit per subvector index + * @param metric metric type + * @param search_type AQ search type + */ + IndexProductLocalSearchQuantizerFastScan( + int d, ///< dimensionality of the input vectors + size_t nsplits, ///< number of local search quantizers + size_t Msub, ///< number of subquantizers per LSQ + size_t nbits, ///< number of bit per subvector index + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_rq2x4, + int bbs = 32); + + IndexProductLocalSearchQuantizerFastScan(); +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexBinary.cpp b/thirdparty/faiss/faiss/IndexBinary.cpp index 367237eab..9c1adf833 100644 --- a/thirdparty/faiss/faiss/IndexBinary.cpp +++ b/thirdparty/faiss/faiss/IndexBinary.cpp @@ -15,7 +15,12 @@ namespace faiss { -IndexBinary::~IndexBinary() {} +IndexBinary::IndexBinary(idx_t d, MetricType metric) + : d(d), code_size(d / 8), metric_type(metric) { + FAISS_THROW_IF_NOT(d % 8 == 0); +} + +IndexBinary::~IndexBinary() = default; void IndexBinary::train(idx_t, const uint8_t*) { // Does nothing by default. 
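// A minimal caller-side sketch, assuming the post-patch binary index API in
// which the trailing BitsetView argument is replaced by Faiss 1.7.4's
// SearchParameters / IDSelector mechanism (admissible ids travel through
// SearchParameters::sel and are tested with IDSelector::is_member()).
// The dimensions, ids and buffers below are illustrative only.
#include <faiss/IndexBinaryFlat.h>
#include <faiss/impl/IDSelector.h>
#include <vector>

void filtered_binary_search_sketch(const uint8_t* xb, size_t nb,
                                   const uint8_t* xq) {
    const int d = 128;                      // bits per vector, must be % 8 == 0
    faiss::IndexBinaryFlat index(d);
    index.add(nb, xb);

    // only these ids are allowed to appear in the result set
    std::vector<faiss::idx_t> allowed = {1, 5, 42};
    faiss::IDSelectorBatch sel(allowed.size(), allowed.data());

    faiss::SearchParameters params;
    params.sel = &sel;                      // carries the id filter per call

    const faiss::idx_t k = 3;
    std::vector<int32_t> distances(k);
    std::vector<faiss::idx_t> labels(k);
    index.search(/*n=*/1, xq, k, distances.data(), labels.data(), &params);
}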
@@ -26,7 +31,7 @@ void IndexBinary::range_search( const uint8_t*, float, RangeSearchResult*, - const BitsetView) const { + const SearchParameters*) const { FAISS_THROW_MSG("range search not implemented"); } @@ -51,7 +56,7 @@ void IndexBinary::reconstruct(idx_t, uint8_t*) const { void IndexBinary::reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const { for (idx_t i = 0; i < ni; i++) { - reconstruct(i0 + i, recons + i * d); + reconstruct(i0 + i, recons + i * code_size); } } @@ -61,18 +66,19 @@ void IndexBinary::search_and_reconstruct( idx_t k, int32_t* distances, idx_t* labels, - uint8_t* recons) const { + uint8_t* recons, + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); - search(n, x, k, distances, labels); + search(n, x, k, distances, labels, params); for (idx_t i = 0; i < n; ++i) { for (idx_t j = 0; j < k; ++j) { idx_t ij = i * k + j; idx_t key = labels[ij]; - uint8_t* reconstructed = recons + ij * d; + uint8_t* reconstructed = recons + ij * code_size; if (key < 0) { // Fill with NaNs - memset(reconstructed, -1, sizeof(*reconstructed) * d); + memset(reconstructed, -1, code_size); } else { reconstruct(key, reconstructed); } @@ -86,4 +92,15 @@ void IndexBinary::display() const { ntotal); } +void IndexBinary::merge_from( + IndexBinary& /* otherIndex */, + idx_t /* add_id */) { + FAISS_THROW_MSG("merge_from() not implemented"); +} + +void IndexBinary::check_compatible_for_merge( + const IndexBinary& /* otherIndex */) const { + FAISS_THROW_MSG("check_compatible_for_merge() not implemented"); +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexBinary.h b/thirdparty/faiss/faiss/IndexBinary.h index a39360f5f..a08c34dbc 100644 --- a/thirdparty/faiss/faiss/IndexBinary.h +++ b/thirdparty/faiss/faiss/IndexBinary.h @@ -16,7 +16,6 @@ #include #include -#include namespace faiss { @@ -32,31 +31,22 @@ struct RangeSearchResult; * vectors. */ struct IndexBinary { - using idx_t = Index::idx_t; ///< all indices are this type using component_t = uint8_t; using distance_t = int32_t; - int d; ///< vector dimension - int code_size; ///< number of bytes per vector ( = d / 8 ) - idx_t ntotal; ///< total nb of indexed vectors - bool verbose; ///< verbosity level + int d = 0; ///< vector dimension + int code_size = 0; ///< number of bytes per vector ( = d / 8 ) + idx_t ntotal = 0; ///< total nb of indexed vectors + bool verbose = false; ///< verbosity level /// set if the Index does not require training, or if training is done /// already - bool is_trained; + bool is_trained = true; /// type of metric this index uses for search - MetricType metric_type; - - explicit IndexBinary(idx_t d = 0, MetricType metric = METRIC_Hamming) - : d(d), - code_size(d / 8), - ntotal(0), - verbose(false), - is_trained(true), - metric_type(metric) { - FAISS_THROW_IF_NOT(d % 8 == 0); - } + MetricType metric_type = METRIC_Hamming; + + explicit IndexBinary(idx_t d = 0, MetricType metric = METRIC_Hamming); virtual ~IndexBinary(); @@ -91,7 +81,6 @@ struct IndexBinary { * @param x input vectors to search, size n * d / 8 * @param labels output labels of the NNs, size n*k * @param distances output pairwise distances, size n*k - * @param bitset flags to check the validity of vectors */ virtual void search( idx_t n, @@ -99,7 +88,7 @@ struct IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const = 0; + const SearchParameters* params = nullptr) const = 0; /** Query n vectors of dimension d to the index. 
* @@ -120,7 +109,7 @@ struct IndexBinary { const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const; + const SearchParameters* params = nullptr) const; /** Return the indexes of the k vectors closest to the query x. * @@ -167,10 +156,23 @@ struct IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - uint8_t* recons) const; + uint8_t* recons, + const SearchParameters* params = nullptr) const; /** Display the actual class name and some more info. */ void display() const; + + /** moves the entries from another dataset to self. + * On output, other is empty. + * add_id is added to all moved ids + * (for sequential ids, this would be this->ntotal) */ + virtual void merge_from(IndexBinary& otherIndex, idx_t add_id = 0); + + /** check that the two indexes are compatible (ie, they are + * trained in the same way and have the same + * parameters). Otherwise throw. */ + virtual void check_compatible_for_merge( + const IndexBinary& otherIndex) const; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexBinaryFlat.cpp b/thirdparty/faiss/faiss/IndexBinaryFlat.cpp index 82896a1ef..87400c317 100644 --- a/thirdparty/faiss/faiss/IndexBinaryFlat.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryFlat.cpp @@ -9,13 +9,15 @@ #include -#include -#include #include #include +#include #include + +// todo aguzhva: merge binary_distances with hamming_distances #include #include + #include #include @@ -42,16 +44,37 @@ void IndexBinaryFlat::search( idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); + IDSelector* sel = (params == nullptr) ? nullptr : params->sel; + + // ==================================================== + // The following piece of the code is Knowhere-specific. + // As a result, query_batch_size and + // use_heap variables are not used. + if (metric_type == METRIC_Jaccard) { float* D = reinterpret_cast(distances); float_maxheap_array_t res = {size_t(n), size_t(k), labels, D}; - binary_knn_hc(METRIC_Jaccard, &res, x, xb.data(), ntotal, code_size, bitset); + binary_knn_hc( + METRIC_Jaccard, + &res, + x, + xb.data(), + ntotal, + code_size, + sel); } else if (metric_type == METRIC_Hamming) { int_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; - binary_knn_hc(METRIC_Hamming, &res, x, xb.data(), ntotal, code_size, bitset); + binary_knn_hc( + METRIC_Hamming, + &res, + x, + xb.data(), + ntotal, + code_size, + sel); } else if ( metric_type == METRIC_Substructure || metric_type == METRIC_Superstructure) { @@ -68,10 +91,13 @@ void IndexBinaryFlat::search( code_size, D, labels, - bitset); + sel); } else { FAISS_ASSERT_FMT(false, "invalid metric type %d", (int)metric_type); } + + // The end of Knowhere-specific code. + // ==================================================== } size_t IndexBinaryFlat::remove_ids(const IDSelector& sel) { @@ -105,7 +131,13 @@ void IndexBinaryFlat::range_search( const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { + const SearchParameters* params) const { + + // ==================================================== + // The following piece of the code is Knowhere-specific. + + IDSelector* sel = (params == nullptr) ? 
nullptr : params->sel; + switch (metric_type) { case METRIC_Jaccard: { binary_range_search, float>( @@ -117,7 +149,7 @@ void IndexBinaryFlat::range_search( radius, code_size, result, - bitset); + sel); break; } case METRIC_Hamming: { @@ -130,7 +162,7 @@ void IndexBinaryFlat::range_search( static_cast(radius), code_size, result, - bitset); + sel); break; } case METRIC_Superstructure: @@ -143,6 +175,9 @@ void IndexBinaryFlat::range_search( FAISS_THROW_FMT("Invalid metric type %d\n", (int)metric_type); break; } + + // The end of Knowhere-specific code. + // ==================================================== } } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexBinaryFlat.h b/thirdparty/faiss/faiss/IndexBinaryFlat.h index 619c07110..b7026312d 100644 --- a/thirdparty/faiss/faiss/IndexBinaryFlat.h +++ b/thirdparty/faiss/faiss/IndexBinaryFlat.h @@ -13,7 +13,8 @@ #include #include -#include + +#include namespace faiss { @@ -29,6 +30,9 @@ struct IndexBinaryFlat : IndexBinary { size_t query_batch_size = 32; + // // todo aguzhva: disabled for Knowhere at this moment + // ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK; + explicit IndexBinaryFlat(idx_t d); IndexBinaryFlat(idx_t d, MetricType metric); @@ -43,14 +47,14 @@ struct IndexBinaryFlat : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void range_search( idx_t n, const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, uint8_t* recons) const override; diff --git a/thirdparty/faiss/faiss/IndexBinaryFromFloat.cpp b/thirdparty/faiss/faiss/IndexBinaryFromFloat.cpp index 6e0108990..407c91995 100644 --- a/thirdparty/faiss/faiss/IndexBinaryFromFloat.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryFromFloat.cpp @@ -9,13 +9,14 @@ #include +#include #include #include #include namespace faiss { -IndexBinaryFromFloat::IndexBinaryFromFloat() {} +IndexBinaryFromFloat::IndexBinaryFromFloat() = default; IndexBinaryFromFloat::IndexBinaryFromFloat(Index* index) : IndexBinary(index->d), index(index), own_fields(false) { @@ -53,7 +54,9 @@ void IndexBinaryFromFloat::search( idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); constexpr idx_t bs = 32768; diff --git a/thirdparty/faiss/faiss/IndexBinaryFromFloat.h b/thirdparty/faiss/faiss/IndexBinaryFromFloat.h index 68146ccd7..3f6c98c97 100644 --- a/thirdparty/faiss/faiss/IndexBinaryFromFloat.h +++ b/thirdparty/faiss/faiss/IndexBinaryFromFloat.h @@ -44,7 +44,7 @@ struct IndexBinaryFromFloat : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void train(idx_t n, const uint8_t* x) override; }; diff --git a/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp b/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp index f5a014a10..8f784d2a3 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryHNSW.cpp @@ -20,12 +20,13 @@ #include #include -#include #include #include +#include #include #include +#include #include #include #include @@ -195,7 +196,9 @@ void IndexBinaryHNSW::search( idx_t k, int32_t* distances, idx_t* 
labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); #pragma omp parallel @@ -278,31 +281,21 @@ struct FlatHammingDis : DistanceComputer { } }; +struct BuildDistanceComputer { + using T = DistanceComputer*; + template + DistanceComputer* f(IndexBinaryFlat* flat_storage) { + return new FlatHammingDis(*flat_storage); + } +}; + } // namespace DistanceComputer* IndexBinaryHNSW::get_distance_computer() const { IndexBinaryFlat* flat_storage = dynamic_cast(storage); - FAISS_ASSERT(flat_storage != nullptr); - - switch (code_size) { - case 4: - return new FlatHammingDis(*flat_storage); - case 8: - return new FlatHammingDis(*flat_storage); - case 16: - return new FlatHammingDis(*flat_storage); - case 20: - return new FlatHammingDis(*flat_storage); - case 32: - return new FlatHammingDis(*flat_storage); - case 64: - return new FlatHammingDis(*flat_storage); - default: - break; - } - - return new FlatHammingDis(*flat_storage); + BuildDistanceComputer bd; + return dispatch_HammingComputer(code_size, bd, flat_storage); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexBinaryHNSW.h b/thirdparty/faiss/faiss/IndexBinaryHNSW.h index 7848ce5f9..3ba919aba 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHNSW.h +++ b/thirdparty/faiss/faiss/IndexBinaryHNSW.h @@ -48,7 +48,7 @@ struct IndexBinaryHNSW : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, uint8_t* recons) const override; diff --git a/thirdparty/faiss/faiss/IndexBinaryHash.cpp b/thirdparty/faiss/faiss/IndexBinaryHash.cpp index 326b71ad0..d83a6822d 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHash.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryHash.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -107,8 +108,6 @@ struct FlipEnumerator { } }; -using idx_t = Index::idx_t; - struct RangeSearchResults { float radius; RangeQueryResult& qres; @@ -167,7 +166,7 @@ void search_single_query_template( } else { const uint8_t* codes = il.vecs.data(); for (size_t i = 0; i < nv; i++) { - int dis = hc.compute(codes); + auto dis = hc.compute(codes); res.add(dis, il.ids[i]); codes += code_size; } @@ -177,6 +176,14 @@ void search_single_query_template( } while (fe.next()); } +struct Run_search_single_query { + using T = void; + template + T f(Types... 
args) { + search_single_query_template(args...); + } +}; + template void search_single_query( const IndexBinaryHash& index, @@ -185,29 +192,9 @@ void search_single_query( size_t& n0, size_t& nlist, size_t& ndis) { -#define HC(name) \ - search_single_query_template(index, q, res, n0, nlist, ndis); - switch (index.code_size) { - case 4: - HC(HammingComputer4); - break; - case 8: - HC(HammingComputer8); - break; - case 16: - HC(HammingComputer16); - break; - case 20: - HC(HammingComputer20); - break; - case 32: - HC(HammingComputer32); - break; - default: - HC(HammingComputerDefault); - break; - } -#undef HC + Run_search_single_query r; + dispatch_HammingComputer( + index.code_size, r, index, q, res, n0, nlist, ndis); } } // anonymous namespace @@ -217,7 +204,9 @@ void IndexBinaryHash::range_search( const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); size_t nlist = 0, ndis = 0, n0 = 0; #pragma omp parallel if (n > 100) reduction(+ : ndis, n0, nlist) @@ -246,7 +235,9 @@ void IndexBinaryHash::search( idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); using HeapForL2 = CMax; @@ -346,22 +337,30 @@ namespace { template static void verify_shortlist( - const IndexBinaryFlat& index, + const IndexBinaryFlat* index, const uint8_t* q, - const std::unordered_set& shortlist, + const std::unordered_set& shortlist, SearchResults& res) { - size_t code_size = index.code_size; + size_t code_size = index->code_size; size_t nlist = 0, ndis = 0, n0 = 0; HammingComputer hc(q, code_size); - const uint8_t* codes = index.xb.data(); + const uint8_t* codes = index->xb.data(); for (auto i : shortlist) { - int dis = hc.compute(codes + i * code_size); + auto dis = hc.compute(codes + i * code_size); res.add(dis, i); } } +struct Run_verify_shortlist { + using T = void; + template + void f(Types... 
args) { + verify_shortlist(args...); + } +}; + template void search_1_query_multihash( const IndexBinaryMultiHash& index, @@ -402,29 +401,9 @@ void search_1_query_multihash( ndis += shortlist.size(); // verify shortlist - -#define HC(name) verify_shortlist(*index.storage, xi, shortlist, res) - switch (index.code_size) { - case 4: - HC(HammingComputer4); - break; - case 8: - HC(HammingComputer8); - break; - case 16: - HC(HammingComputer16); - break; - case 20: - HC(HammingComputer20); - break; - case 32: - HC(HammingComputer32); - break; - default: - HC(HammingComputerDefault); - break; - } -#undef HC + Run_verify_shortlist r; + dispatch_HammingComputer( + index.code_size, r, index.storage, xi, shortlist, res); } } // anonymous namespace @@ -434,7 +413,9 @@ void IndexBinaryMultiHash::range_search( const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); size_t nlist = 0, ndis = 0, n0 = 0; #pragma omp parallel if (n > 100) reduction(+ : ndis, n0, nlist) @@ -463,7 +444,9 @@ void IndexBinaryMultiHash::search( idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); using HeapForL2 = CMax; diff --git a/thirdparty/faiss/faiss/IndexBinaryHash.h b/thirdparty/faiss/faiss/IndexBinaryHash.h index ec65918ed..d39290ca9 100644 --- a/thirdparty/faiss/faiss/IndexBinaryHash.h +++ b/thirdparty/faiss/faiss/IndexBinaryHash.h @@ -51,7 +51,7 @@ struct IndexBinaryHash : IndexBinary { const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void search( idx_t n, @@ -59,7 +59,7 @@ struct IndexBinaryHash : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void display() const; size_t hashtable_size() const; @@ -110,7 +110,7 @@ struct IndexBinaryMultiHash : IndexBinary { const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void search( idx_t n, @@ -118,7 +118,7 @@ struct IndexBinaryMultiHash : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; size_t hashtable_size() const; }; diff --git a/thirdparty/faiss/faiss/IndexBinaryIVF.cpp b/thirdparty/faiss/faiss/IndexBinaryIVF.cpp index a5c50cb9c..f04ee97a1 100644 --- a/thirdparty/faiss/faiss/IndexBinaryIVF.cpp +++ b/thirdparty/faiss/faiss/IndexBinaryIVF.cpp @@ -16,64 +16,38 @@ #include #include -#include #include #include #include #include -#include -#include -#include #include +#include +#include #include -#include + namespace faiss { IndexBinaryIVF::IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist) : IndexBinary(d), invlists(new ArrayInvertedLists(nlist, code_size)), - own_invlists(true), - nprobe(1), - max_codes(0), quantizer(quantizer), - nlist(nlist), - own_fields(false), - clustering_index(nullptr) { + nlist(nlist) { FAISS_THROW_IF_NOT(d == quantizer->d); is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); - cp.niter = 10; } 
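// A minimal caller-side sketch, assuming the post-patch IndexBinaryIVF API:
// per-call options such as nprobe are now carried by IVFSearchParameters
// (checked via dynamic_cast inside search()) rather than by separate
// *_thread_safe entry points. Index sizes, data pointers and parameter
// values below are illustrative only.
#include <faiss/IndexBinaryFlat.h>
#include <faiss/IndexBinaryIVF.h>
#include <faiss/IndexIVF.h>   // IVFSearchParameters
#include <vector>

void ivf_binary_search_sketch(const uint8_t* xb, size_t nb,
                              const uint8_t* xq, size_t nq) {
    const int d = 256;                       // bits per vector
    const size_t nlist = 64;
    faiss::IndexBinaryFlat quantizer(d);     // coarse quantizer, not owned
    faiss::IndexBinaryIVF index(&quantizer, d, nlist);

    index.train(nb, xb);
    index.add(nb, xb);

    faiss::IVFSearchParameters params;
    params.nprobe = 8;                       // overrides index.nprobe for this call only

    const faiss::idx_t k = 10;
    std::vector<int32_t> distances(nq * k);
    std::vector<faiss::idx_t> labels(nq * k);
    index.search(nq, xq, k, distances.data(), labels.data(), &params);
}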
-IndexBinaryIVF::IndexBinaryIVF( - IndexBinary* quantizer, - size_t d, - size_t nlist, - MetricType metric) +IndexBinaryIVF::IndexBinaryIVF(IndexBinary* quantizer, size_t d, size_t nlist, MetricType metric) : IndexBinary(d, metric), invlists(new ArrayInvertedLists(nlist, code_size)), - own_invlists(true), - nprobe(1), - max_codes(0), quantizer(quantizer), - nlist(nlist), - own_fields(false), - clustering_index(nullptr) { + nlist(nlist) { FAISS_THROW_IF_NOT(d == quantizer->d); is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); - cp.niter = 10; } -IndexBinaryIVF::IndexBinaryIVF() - : invlists(nullptr), - own_invlists(false), - nprobe(1), - max_codes(0), - quantizer(nullptr), - nlist(0), - own_fields(false), - clustering_index(nullptr) {} +IndexBinaryIVF::IndexBinaryIVF() = default; void IndexBinaryIVF::add(idx_t n, const uint8_t* x) { add_with_ids(n, x, nullptr); @@ -151,32 +125,50 @@ void IndexBinaryIVF::search( idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params_in) const { + // todo aguzhva: this code is almost similar to IndexIVF::search(). + // find a way to merge. + FAISS_THROW_IF_NOT(k > 0); + const IVFSearchParameters* params = nullptr; + const SearchParameters* quantizer_params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexBinaryIVF params have incorrect type"); + quantizer_params = params->quantizer_params; + } + const size_t nprobe = + std::min(nlist, params ? params->nprobe : this->nprobe); FAISS_THROW_IF_NOT(nprobe > 0); - const size_t nprobe = std::min(nlist, this->nprobe); + std::unique_ptr idx(new idx_t[n * nprobe]); std::unique_ptr coarse_dis(new int32_t[n * nprobe]); double t0 = getmillisecs(); - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + quantizer->search( + n, + x, + nprobe, + coarse_dis.get(), + idx.get(), + quantizer_params); indexIVF_stats.quantization_time += getmillisecs() - t0; t0 = getmillisecs(); invlists->prefetch_lists(idx.get(), n * nprobe); search_preassigned( - n, - x, - k, - idx.get(), - coarse_dis.get(), - distances, - labels, + n, + x, + k, + idx.get(), + coarse_dis.get(), + distances, + labels, false, - nullptr, - bitset); + params, + &indexIVF_stats); indexIVF_stats.search_time += getmillisecs() - t0; } @@ -190,7 +182,7 @@ void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const { for (idx_t list_no = 0; list_no < nlist; list_no++) { size_t list_size = invlists->list_size(list_no); - const Index::idx_t* idlist = invlists->get_ids(list_no); + const idx_t* idlist = invlists->get_ids(list_no); for (idx_t offset = 0; offset < list_size; offset++) { idx_t id = idlist[offset]; @@ -206,19 +198,29 @@ void IndexBinaryIVF::reconstruct_n(idx_t i0, idx_t ni, uint8_t* recons) const { void IndexBinaryIVF::search_and_reconstruct( idx_t n, - const uint8_t* x, + const uint8_t* __restrict x, idx_t k, - int32_t* distances, - idx_t* labels, - uint8_t* recons) const { - const size_t nprobe = std::min(nlist, this->nprobe); - FAISS_THROW_IF_NOT(k > 0); + int32_t* __restrict distances, + idx_t* __restrict labels, + uint8_t* __restrict recons, + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + const SearchParameters* quantizer_params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexBinaryIVF params have incorrect type"); + quantizer_params = params->quantizer_params; + } + const size_t nprobe = + 
std::min(nlist, params ? params->nprobe : this->nprobe); FAISS_THROW_IF_NOT(nprobe > 0); + FAISS_THROW_IF_NOT(k > 0); + std::unique_ptr idx(new idx_t[n * nprobe]); std::unique_ptr coarse_dis(new int32_t[n * nprobe]); - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get(), quantizer_params); invlists->prefetch_lists(idx.get(), n * nprobe); @@ -232,7 +234,8 @@ void IndexBinaryIVF::search_and_reconstruct( coarse_dis.get(), distances, labels, - /* store_pairs */ true); + /* store_pairs */ true, + params); for (idx_t i = 0; i < n; ++i) { for (idx_t j = 0; j < k; ++j) { idx_t ij = i * k + j; @@ -242,8 +245,8 @@ void IndexBinaryIVF::search_and_reconstruct( // Fill with NaNs memset(reconstructed, -1, sizeof(*reconstructed) * d); } else { - int list_no = key >> 32; - int offset = key & 0xffffffff; + int list_no = lo_listno(key); + int offset = lo_offset(key); // Update label to the actual id labels[ij] = invlists->get_single_id(list_no, offset); @@ -325,22 +328,28 @@ void IndexBinaryIVF::train(idx_t n, const uint8_t* x) { is_trained = true; } -void IndexBinaryIVF::merge_from(IndexBinaryIVF& other, idx_t add_id) { - // minimal sanity checks - FAISS_THROW_IF_NOT(other.d == d); - FAISS_THROW_IF_NOT(other.nlist == nlist); - FAISS_THROW_IF_NOT(other.code_size == code_size); +void IndexBinaryIVF::check_compatible_for_merge( + const IndexBinary& otherIndex) const { + auto other = dynamic_cast(&otherIndex); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->d == d); + FAISS_THROW_IF_NOT(other->nlist == nlist); + FAISS_THROW_IF_NOT(other->code_size == code_size); FAISS_THROW_IF_NOT_MSG( - direct_map.no() && other.direct_map.no(), + direct_map.no() && other->direct_map.no(), "direct map copy not implemented"); FAISS_THROW_IF_NOT_MSG( typeid(*this) == typeid(other), "can only merge indexes of the same type"); +} - invlists->merge_from(other.invlists, add_id); - - ntotal += other.ntotal; - other.ntotal = 0; +void IndexBinaryIVF::merge_from(IndexBinary& otherIndex, idx_t add_id) { + // minimal sanity checks + check_compatible_for_merge(otherIndex); + auto other = static_cast(&otherIndex); + invlists->merge_from(other->invlists, add_id); + ntotal += other->ntotal; + other->ntotal = 0; } void IndexBinaryIVF::replace_invlists(InvertedLists* il, bool own) { @@ -354,16 +363,14 @@ void IndexBinaryIVF::replace_invlists(InvertedLists* il, bool own) { namespace { -using idx_t = Index::idx_t; - +// todo aguzhva: check whether templating store_pairs and use_sel makes sense template struct IVFBinaryScannerL2 : BinaryInvertedListScanner { HammingComputer hc; size_t code_size; - bool store_pairs; - IVFBinaryScannerL2(size_t code_size, bool store_pairs) - : code_size(code_size), store_pairs(store_pairs) {} + IVFBinaryScannerL2(size_t code_size, bool store_pairs, const IDSelector* sel) + : BinaryInvertedListScanner(store_pairs, sel), code_size(code_size) {} void set_query(const uint8_t* query_vector) override { hc.set(query_vector, code_size); @@ -380,18 +387,17 @@ struct IVFBinaryScannerL2 : BinaryInvertedListScanner { size_t scan_codes( size_t n, - const uint8_t* codes, - const idx_t* ids, - int32_t* simi, - idx_t* idxi, - size_t k, - const BitsetView bitset) const override { + const uint8_t* __restrict codes, + const idx_t* __restrict ids, + int32_t* __restrict simi, + idx_t* __restrict idxi, + size_t k) const override { using C = CMax; size_t nup = 0; for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = 
hc.compute(codes); + if (!this->sel || this->sel->is_member(ids[j])) { + auto dis = hc.compute(codes); if (dis < simi[0]) { idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; heap_replace_top(k, simi, idxi, dis, id); @@ -405,14 +411,14 @@ struct IVFBinaryScannerL2 : BinaryInvertedListScanner { void scan_codes_range( size_t n, - const uint8_t* codes, - const idx_t* ids, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, float radius, - RangeQueryResult& result, - const BitsetView bitset) const override { + RangeQueryResult& result) const override { + size_t nup = 0; for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = hc.compute(codes); + if (!this->sel || this->sel->is_member(ids[j])) { + auto dis = hc.compute(codes); if (dis < radius) { int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; result.add(dis, id); @@ -423,12 +429,14 @@ struct IVFBinaryScannerL2 : BinaryInvertedListScanner { } }; -template +// todo aguzhva: check whether templating store_pairs and use_sel makes sense +template struct IVFBinaryScannerJaccard : BinaryInvertedListScanner { DistanceComputer hc; size_t code_size; - IVFBinaryScannerJaccard(size_t code_size) : code_size(code_size) {} + IVFBinaryScannerJaccard(size_t code_size, bool store_pairs, const IDSelector* sel) + : BinaryInvertedListScanner(store_pairs, sel), code_size(code_size) {} void set_query(const uint8_t* query_vector) override { hc.set(query_vector, code_size); @@ -449,13 +457,13 @@ struct IVFBinaryScannerJaccard : BinaryInvertedListScanner { const idx_t* ids, int32_t* simi, idx_t* idxi, - size_t k, - const BitsetView bitset) const override { + size_t k) const override { using C = CMax; + // todo aguzhva: this is a dirty hack in the baseline float* psimi = (float*)simi; size_t nup = 0; for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { + if (!this->sel || this->sel->is_member(ids[j])) { float dis = hc.compute(codes); if (dis < psimi[0]) { idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; @@ -473,10 +481,9 @@ struct IVFBinaryScannerJaccard : BinaryInvertedListScanner { const uint8_t* codes, const idx_t* ids, float radius, - RangeQueryResult& result, - const BitsetView bitset) const override { + RangeQueryResult& result) const override { for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { + if (!this->sel || this->sel->is_member(ids[j])) { float dis = hc.compute(codes); if (dis < radius) { idx_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; @@ -488,65 +495,24 @@ struct IVFBinaryScannerJaccard : BinaryInvertedListScanner { } }; -template -BinaryInvertedListScanner* select_IVFBinaryScannerL2(size_t code_size) { -#define HC(name) return new IVFBinaryScannerL2(code_size, store_pairs) - switch (code_size) { - case 4: - HC(HammingComputer4); - case 8: - HC(HammingComputer8); - case 16: - HC(HammingComputer16); - case 20: - HC(HammingComputer20); - case 32: - HC(HammingComputer32); - case 64: - HC(HammingComputer64); - default: - HC(HammingComputerDefault); - } -#undef HC -} - -template -BinaryInvertedListScanner* select_IVFBinaryScannerJaccard(size_t code_size) { -#define HANDLE_CS(cs) \ - case cs: \ - return new IVFBinaryScannerJaccard( \ - cs); - switch (code_size) { - HANDLE_CS(16) - HANDLE_CS(32) - HANDLE_CS(64) - HANDLE_CS(128) - HANDLE_CS(256) - HANDLE_CS(512) - default: - return new IVFBinaryScannerJaccard< - JaccardComputerDefault, - store_pairs>(code_size); - } -#undef HANDLE_CS -} void search_knn_hamming_heap( - const IndexBinaryIVF& ivf, + const IndexBinaryIVF* ivf, size_t n, - const uint8_t* x, + const uint8_t* __restrict x, idx_t k, - const idx_t* keys, - const int32_t* coarse_dis, - int32_t* distances, - idx_t* labels, + const idx_t* __restrict keys, + const int32_t* __restrict coarse_dis, + int32_t* __restrict distances, + idx_t* __restrict labels, bool store_pairs, - const IVFSearchParameters* params, - const BitsetView bitset) { - idx_t nprobe = params ? params->nprobe : ivf.nprobe; - nprobe = std::min((idx_t)ivf.nlist, nprobe); - idx_t max_codes = params ? params->max_codes : ivf.max_codes; - MetricType metric_type = ivf.metric_type; + const IVFSearchParameters* params) { + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; + MetricType metric_type = ivf->metric_type; + + IDSelector* sel = params ? 
params->sel : nullptr; // almost verbatim copy from IndexIVF::search_preassigned @@ -557,11 +523,11 @@ void search_knn_hamming_heap( #pragma omp parallel if (n > 1) reduction(+ : nlistv, ndis, nheap) { std::unique_ptr scanner( - ivf.get_InvertedListScanner(store_pairs)); + ivf->get_InvertedListScanner(store_pairs, sel)); #pragma omp for for (idx_t i = 0; i < n; i++) { - const uint8_t* xi = x + i * ivf.code_size; + const uint8_t* xi = x + i * ivf->code_size; scanner->set_query(xi); const idx_t* keysi = keys + i * nprobe; @@ -583,28 +549,29 @@ void search_knn_hamming_heap( continue; } FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf.nlist, + key < (idx_t)ivf->nlist, "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", key, ik, - ivf.nlist); + ivf->nlist); scanner->set_list(key, coarse_dis[i * nprobe + ik]); nlistv++; - size_t list_size = ivf.invlists->list_size(key); - InvertedLists::ScopedCodes scodes(ivf.invlists, key); + size_t list_size = ivf->invlists->list_size(key); + InvertedLists::ScopedCodes scodes(ivf->invlists, key); std::unique_ptr sids; - const Index::idx_t* ids = nullptr; + const idx_t* ids = nullptr; if (!store_pairs) { - sids.reset(new InvertedLists::ScopedIds(ivf.invlists, key)); + sids = std::make_unique( + ivf->invlists, key); ids = sids->get(); } nheap += scanner->scan_codes( - list_size, scodes.get(), ids, simi, idxi, k, bitset); + list_size, scodes.get(), ids, simi, idxi, k); nscan += list_size; if (max_codes && nscan >= max_codes) @@ -628,7 +595,7 @@ void search_knn_hamming_heap( } void search_knn_binary_dis_heap( - const IndexBinaryIVF& ivf, + const IndexBinaryIVF* ivf, size_t n, const uint8_t* x, idx_t k, @@ -637,12 +604,13 @@ void search_knn_binary_dis_heap( float* distances, idx_t* labels, bool store_pairs, - const IVFSearchParameters* params, - const BitsetView bitset) { - idx_t nprobe = params ? params->nprobe : ivf.nprobe; - nprobe = std::min((idx_t)ivf.nlist, nprobe); - idx_t max_codes = params ? params->max_codes : ivf.max_codes; - MetricType metric_type = ivf.metric_type; + const IVFSearchParameters* params) { + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; + MetricType metric_type = ivf->metric_type; + + IDSelector* sel = params ? 
params->sel : nullptr; // almost verbatim copy from IndexIVF::search_preassigned @@ -652,11 +620,11 @@ void search_knn_binary_dis_heap( #pragma omp parallel if (n > 1) reduction(+ : nlistv, ndis, nheap) { std::unique_ptr scanner( - ivf.get_InvertedListScanner(store_pairs)); + ivf->get_InvertedListScanner(store_pairs, sel)); #pragma omp for for (size_t i = 0; i < n; i++) { - const uint8_t* xi = x + i * ivf.code_size; + const uint8_t* xi = x + i * ivf->code_size; scanner->set_query(xi); const idx_t* keysi = keys + i * nprobe; @@ -674,23 +642,23 @@ void search_knn_binary_dis_heap( continue; } FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf.nlist, + key < (idx_t)ivf->nlist, "Invalid key=%" SCNd64 " at ik=%ld nlist=%ld\n", key, ik, - ivf.nlist); + ivf->nlist); scanner->set_list(key, (int32_t)coarse_dis[i * nprobe + ik]); nlistv++; - size_t list_size = ivf.invlists->list_size(key); - InvertedLists::ScopedCodes scodes(ivf.invlists, key); + size_t list_size = ivf->invlists->list_size(key); + InvertedLists::ScopedCodes scodes(ivf->invlists, key); std::unique_ptr sids; - const Index::idx_t* ids = nullptr; + const idx_t* ids = nullptr; if (!store_pairs) { - sids.reset(new InvertedLists::ScopedIds(ivf.invlists, key)); + sids.reset(new InvertedLists::ScopedIds(ivf->invlists, key)); ids = sids->get(); } @@ -700,8 +668,7 @@ void search_knn_binary_dis_heap( ids, (int32_t*)simi, idxi, - k, - bitset); + k); nscan += list_size; if (max_codes && nscan >= max_codes) @@ -722,30 +689,31 @@ void search_knn_binary_dis_heap( template void search_knn_hamming_count( - const IndexBinaryIVF& ivf, + const IndexBinaryIVF* ivf, size_t nx, - const uint8_t* x, - const idx_t* keys, + const uint8_t* __restrict x, + const idx_t* __restrict keys, int k, - int32_t* distances, - idx_t* labels, - const IVFSearchParameters* params, - const BitsetView bitset) { - const int nBuckets = ivf.d + 1; + int32_t* __restrict distances, + idx_t* __restrict labels, + const IVFSearchParameters* params) { + const int nBuckets = ivf->d + 1; std::vector all_counters(nx * nBuckets, 0); std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); - idx_t nprobe = params ? params->nprobe : ivf.nprobe; - nprobe = std::min((idx_t)ivf.nlist, nprobe); - idx_t max_codes = params ? params->max_codes : ivf.max_codes; + IDSelector* sel = params ? params->sel : nullptr; + + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; std::vector> cs; for (size_t i = 0; i < nx; ++i) { cs.push_back(HCounterState( all_counters.data() + i * nBuckets, all_ids_per_dis.get() + i * nBuckets * k, - x + i * ivf.code_size, - ivf.d, + x + i * ivf->code_size, + ivf->d, k)); } @@ -765,29 +733,30 @@ void search_knn_hamming_count( continue; } FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf.nlist, + key < (idx_t)ivf->nlist, "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", key, ik, - ivf.nlist); + ivf->nlist); nlistv++; - size_t list_size = ivf.invlists->list_size(key); - InvertedLists::ScopedCodes scodes(ivf.invlists, key); + size_t list_size = ivf->invlists->list_size(key); + InvertedLists::ScopedCodes scodes(ivf->invlists, key); const uint8_t* list_vecs = scodes.get(); - const Index::idx_t* ids = - store_pairs ? nullptr : ivf.invlists->get_ids(key); + const idx_t* ids = + store_pairs ? 
nullptr : ivf->invlists->get_ids(key); for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - const uint8_t* yj = list_vecs + ivf.code_size * j; + if (!sel || sel->is_member(ids[j])) { + const uint8_t* yj = list_vecs + ivf->code_size * j; idx_t id = store_pairs ? (key << 32 | j) : ids[j]; csi.update_counter(yj, id); } } - if (ids) - ivf.invlists->release_ids(key, ids); + if (ids) { + ivf->invlists->release_ids(key, ids); + } nscan += list_size; if (max_codes && nscan >= max_codes) @@ -815,54 +784,246 @@ void search_knn_hamming_count( indexIVF_stats.ndis += ndis; } +/* Manages NQ queries at a time, stores results */ +template +struct BlockSearch { + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearch( + size_t code_size, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * K; + labels[q] = all_labels + qno * K; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].compute(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(K, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +struct BlockSearchVariableK { + int k; + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearchVariableK( + size_t code_size, + int k, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) + : k(k) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * k; + labels[q] = all_labels + qno * k; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].compute(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(k, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +void search_knn_hamming_per_invlist( + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* __restrict x, + idx_t k, + const idx_t* __restrict keys_in, + const int32_t* __restrict coarse_dis, + int32_t* __restrict distances, + idx_t* __restrict labels, + bool store_pairs, + const IVFSearchParameters* params) { + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? 
params->max_codes : ivf->max_codes; + FAISS_THROW_IF_NOT(max_codes == 0); + FAISS_THROW_IF_NOT(!store_pairs); + MetricType metric_type = ivf->metric_type; + + // reorder buckets + std::vector lims(n + 1); + int32_t* keys = new int32_t[n * nprobe]; + std::unique_ptr delete_keys(keys); + for (idx_t i = 0; i < n * nprobe; i++) { + keys[i] = keys_in[i]; + } + matrix_bucket_sort_inplace(n, nprobe, keys, ivf->nlist, lims.data(), 0); + + using C = CMax; + heap_heapify(n * k, distances, labels); + const size_t code_size = ivf->code_size; + + for (idx_t l = 0; l < ivf->nlist; l++) { + idx_t l0 = lims[l], nq = lims[l + 1] - l0; + + InvertedLists::ScopedCodes scodes(ivf->invlists, l); + InvertedLists::ScopedIds sidx(ivf->invlists, l); + idx_t nb = ivf->invlists->list_size(l); + const uint8_t* bcodes = scodes.get(); + const idx_t* ids = sidx.get(); + + idx_t i = 0; + + // process as much as possible by blocks + constexpr int BS = 4; + + if (k == 1) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 2) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 4) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else { + for (; i + BS <= nq; i += BS) { + BlockSearchVariableK bc( + code_size, k, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } + + // leftovers + for (; i < nq; i++) { + idx_t qno = keys[l0 + i]; + HammingComputer hc(x + qno * code_size, code_size); + idx_t* __restrict idxi = labels + qno * k; + int32_t* __restrict simi = distances + qno * k; + int32_t simi0 = simi[0]; + for (idx_t j = 0; j < nb; j++) { + int dis = hc.compute(bcodes + j * code_size); + + if (dis < simi0) { + idx_t id = store_pairs ? lo_build(l, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + simi0 = simi[0]; + } + } + } + } + for (idx_t i = 0; i < n; i++) { + heap_reorder(k, distances + i * k, labels + i * k); + } +} + +struct Run_search_knn_hamming_per_invlist { + using T = void; + + template + void f(Types... args) { + search_knn_hamming_per_invlist(args...); + } +}; + template -void search_knn_hamming_count_1( - const IndexBinaryIVF& ivf, - size_t nx, - const uint8_t* x, - const idx_t* keys, - int k, - int32_t* distances, - idx_t* labels, - const IVFSearchParameters* params, - const BitsetView bitset) { - switch (ivf.code_size) { -#define HANDLE_CS(cs) \ - case cs: \ - search_knn_hamming_count( \ - ivf, nx, x, keys, k, distances, labels, params, bitset); \ - break; - HANDLE_CS(4); - HANDLE_CS(8); - HANDLE_CS(16); - HANDLE_CS(20); - HANDLE_CS(32); - HANDLE_CS(64); -#undef HANDLE_CS +struct Run_search_knn_hamming_count { + using T = void; + + template + void f(Types... 
args) { + search_knn_hamming_count(args...); + } +}; + +struct BuildScanner { + using T = BinaryInvertedListScanner*; + + template + T f(size_t code_size, bool store_pairs, const IDSelector* sel) { + return new IVFBinaryScannerL2(code_size, store_pairs, sel); + } +}; + +BinaryInvertedListScanner* select_IVFBinaryScannerJaccard( + size_t code_size, bool store_pairs, const IDSelector* sel) { +#define HANDLE_CS(cs) \ + case cs: \ + return new IVFBinaryScannerJaccard( \ + cs, store_pairs, sel); + switch (code_size) { + HANDLE_CS(16) + HANDLE_CS(32) + HANDLE_CS(64) + HANDLE_CS(128) + HANDLE_CS(256) + HANDLE_CS(512) default: - search_knn_hamming_count( - ivf, nx, x, keys, k, distances, labels, params, bitset); - break; + return new IVFBinaryScannerJaccard< + JaccardComputerDefault>(code_size, store_pairs, sel); } +#undef HANDLE_CS } -} // namespace +} // anonymous namespace BinaryInvertedListScanner* IndexBinaryIVF::get_InvertedListScanner( - bool store_pairs) const { + bool store_pairs, + const IDSelector* sel) const { switch (metric_type) { case METRIC_Hamming: - if (store_pairs) { - return select_IVFBinaryScannerL2(code_size); - } else { - return select_IVFBinaryScannerL2(code_size); - } + // todo aguzhva: replaced with Faiss facility + BuildScanner bs; + return dispatch_HammingComputer(code_size, bs, code_size, store_pairs, sel); case METRIC_Jaccard: - if (store_pairs) { - return select_IVFBinaryScannerJaccard(code_size); - } else { - return select_IVFBinaryScannerJaccard(code_size); - } + // todo aguzhva: check whether store_pairs makes sense + return select_IVFBinaryScannerJaccard(code_size, store_pairs, sel); case METRIC_Substructure: case METRIC_Superstructure: // unsupported @@ -870,112 +1031,134 @@ BinaryInvertedListScanner* IndexBinaryIVF::get_InvertedListScanner( default: return nullptr; } + + BuildScanner bs; + return dispatch_HammingComputer(code_size, bs, code_size, store_pairs, sel); } void IndexBinaryIVF::search_preassigned( idx_t n, const uint8_t* x, idx_t k, - const idx_t* idx, - const int32_t* coarse_dis, - int32_t* distances, - idx_t* labels, + const idx_t* cidx, + const int32_t* cdis, + int32_t* dis, + idx_t* idx, bool store_pairs, const IVFSearchParameters* params, - const BitsetView bitset) const { + IndexIVFStats* stats) const { + + idx_t nprobe = params ? 
params->nprobe : this->nprobe; + nprobe = std::min((idx_t)nlist, nprobe); + if (metric_type == METRIC_Jaccard) { if (use_heap) { - float* D = new float[k * n]; - float* c_dis = new float[n * nprobe]; - memcpy(c_dis, coarse_dis, sizeof(float) * n * nprobe); search_knn_binary_dis_heap( - *this, + this, n, x, k, + cidx, + reinterpret_cast(cdis), + reinterpret_cast(dis), idx, - c_dis, - D, - labels, store_pairs, - params, - bitset); - memcpy(distances, D, sizeof(float) * n * k); - delete[] D; - delete[] c_dis; + params); } else { - // not implemented + FAISS_THROW_MSG("a search with !use_heap is not implemented for METRIC_Jaccard"); } } else if ( metric_type == METRIC_Substructure || - metric_type == METRIC_Superstructure) { - // unsupported + metric_type == METRIC_Superstructure) { + FAISS_THROW_MSG("a search is not implemented for METRIC_Substructure / METRIC_Superstructure"); } else { - if (use_heap) { + // METRIC_Hamming + if (per_invlist_search) { + Run_search_knn_hamming_per_invlist r; + // clang-format off + dispatch_HammingComputer( + code_size, r, this, n, x, k, + cidx, cdis, dis, idx, store_pairs, params); + // clang-format on + } else if (use_heap) { search_knn_hamming_heap( - *this, - n, - x, - k, - idx, - coarse_dis, - distances, - labels, - store_pairs, - params, - bitset); - } else { - if (store_pairs) { - search_knn_hamming_count_1( - *this, n, x, idx, k, distances, labels, params, bitset); - } else { - search_knn_hamming_count_1( - *this, n, x, idx, k, distances, labels, params, bitset); - } + this, n, x, k, cidx, cdis, dis, idx, store_pairs, params); + } else if (store_pairs) { // !use_heap && store_pairs + Run_search_knn_hamming_count r; + dispatch_HammingComputer( + code_size, r, this, n, x, cidx, k, dis, idx, params); + } else { // !use_heap && !store_pairs + Run_search_knn_hamming_count r; + dispatch_HammingComputer( + code_size, r, this, n, x, cidx, k, dis, idx, params); } } } void IndexBinaryIVF::range_search( idx_t n, - const uint8_t* x, + const uint8_t* __restrict x, float radius, - RangeSearchResult* res, - const BitsetView bitset) const { - const size_t nprobe = std::min(nlist, this->nprobe); + RangeSearchResult* __restrict res, + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + const SearchParameters* quantizer_params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexBinaryIVF params have incorrect type"); + quantizer_params = params->quantizer_params; + } + const size_t nprobe = + std::min(nlist, params ? 
params->nprobe : this->nprobe); std::unique_ptr idx(new idx_t[n * nprobe]); std::unique_ptr coarse_dis(new int32_t[n * nprobe]); double t0 = getmillisecs(); - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + quantizer->search( + n, x, nprobe, coarse_dis.get(), idx.get(), quantizer_params); indexIVF_stats.quantization_time += getmillisecs() - t0; t0 = getmillisecs(); invlists->prefetch_lists(idx.get(), n * nprobe); + range_search_preassigned( - n, x, radius, idx.get(), coarse_dis.get(), res, bitset); + n, + x, + radius, + idx.get(), + coarse_dis.get(), + res, + params, + &indexIVF_stats); + indexIVF_stats.search_time += getmillisecs() - t0; } void IndexBinaryIVF::range_search_preassigned( idx_t n, - const uint8_t* x, + const uint8_t* __restrict x, float radius, - const idx_t* assign, - const int32_t* centroid_dis, - RangeSearchResult* res, - const BitsetView bitset) const { - const size_t nprobe = std::min(nlist, this->nprobe); + const idx_t* __restrict assign, + const int32_t* __restrict centroid_dis, + RangeSearchResult* __restrict res, + const IVFSearchParameters* params, + IndexIVFStats* stats) const { + idx_t nprobe = params ? params->nprobe : this->nprobe; + nprobe = std::min((idx_t)nlist, nprobe); + FAISS_THROW_IF_NOT(nprobe > 0); + bool store_pairs = false; size_t nlistv = 0, ndis = 0; + IDSelector* sel = params ? params->sel : nullptr; + std::vector all_pres(omp_get_max_threads()); #pragma omp parallel reduction(+ : nlistv, ndis) { RangeSearchPartialResult pres(res); std::unique_ptr scanner( - get_InvertedListScanner(store_pairs)); + get_InvertedListScanner(store_pairs, sel)); FAISS_THROW_IF_NOT(scanner.get()); all_pres[omp_get_thread_num()] = &pres; @@ -1002,7 +1185,7 @@ void IndexBinaryIVF::range_search_preassigned( nlistv++; ndis += list_size; scanner->scan_codes_range( - list_size, scodes.get(), ids.get(), radius, qres, bitset); + list_size, scodes.get(), ids.get(), radius, qres); }; #pragma omp for @@ -1011,9 +1194,22 @@ void IndexBinaryIVF::range_search_preassigned( RangeQueryResult& qres = pres.new_result(i); + // ==================================================== + // The following piece of the code is Knowhere-specific. + // + // cbe86cf716dc1969fc716c29ccf8ea63e82a2b4c: + // Adopt new strategy for faiss IVF range search + + size_t prev_nres = qres.nres; + for (size_t ik = 0; ik < nprobe; ik++) { scan_list_func(i, ik, qres); + if (qres.nres == prev_nres) break; + prev_nres = qres.nres; } + + // The end of Knowhere-specific code. + // ==================================================== } pres.finalize(); diff --git a/thirdparty/faiss/faiss/IndexBinaryIVF.h b/thirdparty/faiss/faiss/IndexBinaryIVF.h index 3772a94f5..bbc05b6a1 100644 --- a/thirdparty/faiss/faiss/IndexBinaryIVF.h +++ b/thirdparty/faiss/faiss/IndexBinaryIVF.h @@ -32,29 +32,36 @@ struct BinaryInvertedListScanner; */ struct IndexBinaryIVF : IndexBinary { /// Access to the actual data - InvertedLists* invlists; - bool own_invlists; + InvertedLists* invlists = nullptr; + bool own_invlists = true; - size_t nprobe; ///< number of probes at query time - size_t max_codes; ///< max nb of codes to visit to do a query + size_t nprobe = 1; ///< number of probes at query time + size_t max_codes = 0; ///< max nb of codes to visit to do a query /** Select between using a heap or counting to select the k smallest values * when scanning inverted lists. */ bool use_heap = true; + /** collect computations per batch */ + bool per_invlist_search = false; + /// map for direct access to the elements. 
Enables reconstruct(). DirectMap direct_map; - IndexBinary* quantizer; ///< quantizer that maps vectors to inverted lists - size_t nlist; ///< number of possible key values + /// quantizer that maps vectors to inverted lists + IndexBinary* quantizer = nullptr; + + /// number of possible key values + size_t nlist = 0; - bool own_fields; ///< whether object owns the quantizer + /// whether object owns the quantizer + bool own_fields = false; ClusteringParameters cp; ///< to override default clustering params - Index* clustering_index; ///< to override index used during clustering - // mutable std::vector nprobe_statistics; - // mutable IndexIVFStats index_ivf_stats; + + /// to override index used during clustering + Index* clustering_index = nullptr; /** The Inverted file takes a quantizer (an IndexBinary) on input, * which implements the function mapping a vector to a list @@ -121,10 +128,11 @@ struct IndexBinaryIVF : IndexBinary { idx_t* labels, bool store_pairs, const IVFSearchParameters* params = nullptr, - const BitsetView bitset = nullptr) const; + IndexIVFStats* stats = nullptr) const; virtual BinaryInvertedListScanner* get_InvertedListScanner( - bool store_pairs = false) const; + bool store_pairs = false, + const IDSelector* sel = nullptr) const; /** assign the vectors, then call search_preassign */ void search( @@ -133,33 +141,17 @@ struct IndexBinaryIVF : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; - - void search_thread_safe( - idx_t n, - const uint8_t* x, - idx_t k, - int32_t* distances, - idx_t* labels, - size_t nprobe, - const BitsetView bitset) const; + const SearchParameters* params = nullptr) const override; + // Knowhere-specific: radius became float because of Jaccard distance void range_search( idx_t n, const uint8_t* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; - - void search_and_reconstruct_thread_safe( - idx_t n, - const uint8_t* x, - idx_t k, - int32_t* distances, - idx_t* labels, - uint8_t* recons, - size_t nprobe) const; + const SearchParameters* params = nullptr) const override; + // Knowhere-specific: radius became float because of Jaccard distance void range_search_preassigned( idx_t n, const uint8_t* x, @@ -167,7 +159,8 @@ struct IndexBinaryIVF : IndexBinary { const idx_t* assign, const int32_t* centroid_dis, RangeSearchResult* result, - const BitsetView bitset = nullptr) const; + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const; void reconstruct(idx_t key, uint8_t* recons) const override; @@ -197,7 +190,8 @@ struct IndexBinaryIVF : IndexBinary { idx_t k, int32_t* distances, idx_t* labels, - uint8_t* recons) const override; + uint8_t* recons, + const SearchParameters* params = nullptr) const override; /** Reconstruct a vector given the location in terms of (inv list index + * inv list offset) instead of the id. @@ -214,16 +208,16 @@ struct IndexBinaryIVF : IndexBinary { /// Dataset manipulation functions size_t remove_ids(const IDSelector& sel) override; - /** moves the entries from another dataset to self. On output, - * other is empty. 
add_id is added to all moved ids (for - * sequential ids, this would be this->ntotal */ - virtual void merge_from(IndexBinaryIVF& other, idx_t add_id); + void merge_from(IndexBinary& other, idx_t add_id) override; + + void check_compatible_for_merge( + const IndexBinary& otherIndex) const override; size_t get_list_size(size_t list_no) const { return invlists->list_size(list_no); } - /** intialize a direct map + /** initialize a direct map * * @param new_maintain_direct_map if true, create a direct map, * else clear it @@ -234,38 +228,17 @@ struct IndexBinaryIVF : IndexBinary { void replace_invlists(InvertedLists* il, bool own = false); - void range_search_thread_safe( - idx_t n, - const uint8_t* x, - float radius, - RangeSearchResult* res, - size_t nprobe, - const BitsetView bitset) const; - void range_search_preassigned_thread_safe( - idx_t n, - const uint8_t* x, - float radius, - const idx_t* assign, - const int32_t* centroid_dis, - RangeSearchResult* res, - size_t nprobe, - const BitsetView bitset) const; - void search_preassigned_thread_safe( - idx_t n, - const uint8_t* x, - idx_t k, - const idx_t* idx, - const int32_t* coarse_dis, - int32_t* distances, - idx_t* labels, - bool store_pairs, - const IVFSearchParameters* params, - const size_t nprobe, - const BitsetView bitset) const; }; struct BinaryInvertedListScanner { - using idx_t = Index::idx_t; + bool store_pairs; + /// search in this subset of ids + const IDSelector* sel; + + BinaryInvertedListScanner( + bool store_pairs = false, + const IDSelector* sel = nullptr) + : store_pairs(store_pairs), sel(sel) {} /// from now on we handle this query. virtual void set_query(const uint8_t* query_vector) = 0; @@ -292,16 +265,15 @@ struct BinaryInvertedListScanner { const idx_t* ids, int32_t* distances, idx_t* labels, - size_t k, - const BitsetView bitset = nullptr) const = 0; + size_t k) const = 0; + // Knowhere-specific: radius became float because of Jaccard distance virtual void scan_codes_range( size_t n, const uint8_t* codes, const idx_t* ids, float radius, - RangeQueryResult& result, - const BitsetView bitset = nullptr) const = 0; + RangeQueryResult& result) const = 0; virtual ~BinaryInvertedListScanner() {} }; diff --git a/thirdparty/faiss/faiss/IndexBinaryIVFThreadSafe.cpp b/thirdparty/faiss/faiss/IndexBinaryIVFThreadSafe.cpp deleted file mode 100644 index e356a6327..000000000 --- a/thirdparty/faiss/faiss/IndexBinaryIVFThreadSafe.cpp +++ /dev/null @@ -1,819 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// -*- c++ -*- - -#include - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -namespace faiss { - -void IndexBinaryIVF::search_thread_safe( - idx_t n, - const uint8_t* x, - idx_t k, - int32_t* distances, - idx_t* labels, - size_t nprobe, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(nprobe > 0); - - nprobe = std::min(nlist, nprobe); - std::unique_ptr idx(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new int32_t[n * nprobe]); - - double t0 = getmillisecs(); - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); - indexIVF_stats.quantization_time += getmillisecs() - t0; - - t0 = getmillisecs(); - invlists->prefetch_lists(idx.get(), n * nprobe); - - search_preassigned_thread_safe( - n, - x, - k, - idx.get(), - coarse_dis.get(), - distances, - labels, - false, - nullptr, - nprobe, - bitset); - indexIVF_stats.search_time += getmillisecs() - t0; -} - -void IndexBinaryIVF::search_and_reconstruct_thread_safe( - idx_t n, - const uint8_t* x, - idx_t k, - int32_t* distances, - idx_t* labels, - uint8_t* recons, - size_t nprobe) const { - nprobe = std::min(nlist, nprobe); - FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(nprobe > 0); - - std::unique_ptr idx(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new int32_t[n * nprobe]); - - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); - - invlists->prefetch_lists(idx.get(), n * nprobe); - - // search_preassigned() with `store_pairs` enabled to obtain the list_no - // and offset into `codes` for reconstruction - search_preassigned( - n, - x, - k, - idx.get(), - coarse_dis.get(), - distances, - labels, - /* store_pairs */ true); - for (idx_t i = 0; i < n; ++i) { - for (idx_t j = 0; j < k; ++j) { - idx_t ij = i * k + j; - idx_t key = labels[ij]; - uint8_t* reconstructed = recons + ij * d; - if (key < 0) { - // Fill with NaNs - memset(reconstructed, -1, sizeof(*reconstructed) * d); - } else { - int list_no = key >> 32; - int offset = key & 0xffffffff; - - // Update label to the actual id - labels[ij] = invlists->get_single_id(list_no, offset); - - reconstruct_from_offset(list_no, offset, reconstructed); - } - } - } -} - -namespace { - -using idx_t = Index::idx_t; - -template -struct IVFBinaryScannerL2 : BinaryInvertedListScanner { - HammingComputer hc; - size_t code_size; - bool store_pairs; - - IVFBinaryScannerL2(size_t code_size, bool store_pairs) - : code_size(code_size), store_pairs(store_pairs) {} - - void set_query(const uint8_t* query_vector) override { - hc.set(query_vector, code_size); - } - - idx_t list_no; - void set_list(idx_t list_no, uint8_t /* coarse_dis */) override { - this->list_no = list_no; - } - - float distance_to_code(const uint8_t* code) const override { - return hc.compute(code); - } - - size_t scan_codes( - size_t n, - const uint8_t* codes, - const idx_t* ids, - int32_t* simi, - idx_t* idxi, - size_t k, - const BitsetView bitset) const override { - using C = CMax; - - size_t nup = 0; - for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = hc.compute(codes); - if (dis < simi[0]) { - idx_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; - heap_replace_top(k, simi, idxi, dis, id); - nup++; - } - } - codes += code_size; - } - return nup; - } - - void scan_codes_range( - size_t n, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& result, - const BitsetView bitset) const override { - for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = hc.compute(codes); - if (dis < radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - result.add(dis, id); - } - } - codes += code_size; - } - } -}; - -template -struct IVFBinaryScannerJaccard : BinaryInvertedListScanner { - DistanceComputer hc; - size_t code_size; - - IVFBinaryScannerJaccard(size_t code_size) : code_size(code_size) {} - - void set_query(const uint8_t* query_vector) override { - hc.set(query_vector, code_size); - } - - idx_t list_no; - void set_list(idx_t list_no, uint8_t /* coarse_dis */) override { - this->list_no = list_no; - } - - float distance_to_code(const uint8_t* code) const override { - return hc.compute(code); - } - - size_t scan_codes( - size_t n, - const uint8_t* codes, - const idx_t* ids, - int32_t* simi, - idx_t* idxi, - size_t k, - const BitsetView bitset) const override { - using C = CMax; - float* psimi = (float*)simi; - size_t nup = 0; - for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = hc.compute(codes); - if (dis < psimi[0]) { - idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - heap_replace_top(k, psimi, idxi, dis, id); - nup++; - } - } - codes += code_size; - } - return nup; - } - - void scan_codes_range( - size_t n, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& result, - const BitsetView bitset) const override { - for (size_t j = 0; j < n; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = hc.compute(codes); - if (dis < radius) { - idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - result.add(dis, id); - } - } - codes += code_size; - } - } -}; - -template -BinaryInvertedListScanner* select_IVFBinaryScannerL2(size_t code_size) { -#define HC(name) return new IVFBinaryScannerL2(code_size, store_pairs) - switch (code_size) { - case 4: - HC(HammingComputer4); - case 8: - HC(HammingComputer8); - case 16: - HC(HammingComputer16); - case 20: - HC(HammingComputer20); - case 32: - HC(HammingComputer32); - case 64: - HC(HammingComputer64); - default: - HC(HammingComputerDefault); - } -#undef HC -} - -template -BinaryInvertedListScanner* select_IVFBinaryScannerJaccard(size_t code_size) { -#define HANDLE_CS(cs) \ - case cs: \ - return new IVFBinaryScannerJaccard( \ - cs); - switch (code_size) { - HANDLE_CS(16) - HANDLE_CS(32) - HANDLE_CS(64) - HANDLE_CS(128) - HANDLE_CS(256) - HANDLE_CS(512) - default: - return new IVFBinaryScannerJaccard< - JaccardComputerDefault, - store_pairs>(code_size); - } -#undef HANDLE_CS -} - -void search_knn_hamming_heap_thread_safe( - const IndexBinaryIVF& ivf, - size_t n, - const uint8_t* x, - idx_t k, - const idx_t* keys, - const int32_t* coarse_dis, - int32_t* distances, - idx_t* labels, - bool store_pairs, - const IVFSearchParameters* params, - idx_t nprobe, - const BitsetView bitset) { - nprobe = params ? params->nprobe : nprobe; - nprobe = std::min((idx_t)ivf.nlist, nprobe); - idx_t max_codes = params ? 
params->max_codes : ivf.max_codes; - MetricType metric_type = ivf.metric_type; - - // almost verbatim copy from IndexIVF::search_preassigned - - size_t nlistv = 0, ndis = 0, nheap = 0; - using HeapForIP = CMin; - using HeapForL2 = CMax; - -#pragma omp parallel if (n > 1) reduction(+ : nlistv, ndis, nheap) - { - std::unique_ptr scanner( - ivf.get_InvertedListScanner(store_pairs)); - -#pragma omp for - for (idx_t i = 0; i < n; i++) { - const uint8_t* xi = x + i * ivf.code_size; - scanner->set_query(xi); - - const idx_t* keysi = keys + i * nprobe; - int32_t* simi = distances + k * i; - idx_t* idxi = labels + k * i; - - if (metric_type == METRIC_INNER_PRODUCT) { - heap_heapify(k, simi, idxi); - } else { - heap_heapify(k, simi, idxi); - } - - size_t nscan = 0; - - for (size_t ik = 0; ik < nprobe; ik++) { - idx_t key = keysi[ik]; /* select the list */ - if (key < 0) { - // not enough centroids for multiprobe - continue; - } - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf.nlist, - "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", - key, - ik, - ivf.nlist); - - scanner->set_list(key, coarse_dis[i * nprobe + ik]); - - nlistv++; - - size_t list_size = ivf.invlists->list_size(key); - InvertedLists::ScopedCodes scodes(ivf.invlists, key); - std::unique_ptr sids; - const Index::idx_t* ids = nullptr; - - if (!store_pairs) { - sids.reset(new InvertedLists::ScopedIds(ivf.invlists, key)); - ids = sids->get(); - } - - nheap += scanner->scan_codes( - list_size, scodes.get(), ids, simi, idxi, k, bitset); - - nscan += list_size; - if (max_codes && nscan >= max_codes) - break; - } - - ndis += nscan; - if (metric_type == METRIC_INNER_PRODUCT) { - heap_reorder(k, simi, idxi); - } else { - heap_reorder(k, simi, idxi); - } - - } // parallel for - } // parallel - - indexIVF_stats.nq += n; - indexIVF_stats.nlist += nlistv; - indexIVF_stats.ndis += ndis; - indexIVF_stats.nheap_updates += nheap; -} - -void search_knn_binary_dis_heap_thread_safe( - const IndexBinaryIVF& ivf, - size_t n, - const uint8_t* x, - idx_t k, - const idx_t* keys, - const float* coarse_dis, - float* distances, - idx_t* labels, - bool store_pairs, - const IVFSearchParameters* params, - idx_t nprobe, - const BitsetView bitset) { - nprobe = params ? params->nprobe : nprobe; - nprobe = std::min((idx_t)ivf.nlist, nprobe); - idx_t max_codes = params ? 
params->max_codes : ivf.max_codes; - MetricType metric_type = ivf.metric_type; - - // almost verbatim copy from IndexIVF::search_preassigned - - size_t nlistv = 0, ndis = 0, nheap = 0; - using HeapForJaccard = CMax; - -#pragma omp parallel if (n > 1) reduction(+ : nlistv, ndis, nheap) - { - std::unique_ptr scanner( - ivf.get_InvertedListScanner(store_pairs)); - -#pragma omp for - for (size_t i = 0; i < n; i++) { - const uint8_t* xi = x + i * ivf.code_size; - scanner->set_query(xi); - - const idx_t* keysi = keys + i * nprobe; - float* simi = distances + k * i; - idx_t* idxi = labels + k * i; - - heap_heapify(k, simi, idxi); - - size_t nscan = 0; - - for (size_t ik = 0; ik < nprobe; ik++) { - idx_t key = keysi[ik]; /* select the list */ - if (key < 0) { - // not enough centroids for multiprobe - continue; - } - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf.nlist, - "Invalid key=%" SCNd64 " at ik=%ld nlist=%ld\n", - key, - ik, - ivf.nlist); - - scanner->set_list(key, (int32_t)coarse_dis[i * nprobe + ik]); - - nlistv++; - - size_t list_size = ivf.invlists->list_size(key); - InvertedLists::ScopedCodes scodes(ivf.invlists, key); - std::unique_ptr sids; - const Index::idx_t* ids = nullptr; - - if (!store_pairs) { - sids.reset(new InvertedLists::ScopedIds(ivf.invlists, key)); - ids = sids->get(); - } - - nheap += scanner->scan_codes( - list_size, - scodes.get(), - ids, - (int32_t*)simi, - idxi, - k, - bitset); - - nscan += list_size; - if (max_codes && nscan >= max_codes) - break; - } - - ndis += nscan; - heap_reorder(k, simi, idxi); - - } // parallel for - } // parallel - - indexIVF_stats.nq += n; - indexIVF_stats.nlist += nlistv; - indexIVF_stats.ndis += ndis; - indexIVF_stats.nheap_updates += nheap; -} - -template -void search_knn_hamming_count_thread_safe( - const IndexBinaryIVF& ivf, - size_t nx, - const uint8_t* x, - const idx_t* keys, - int k, - int32_t* distances, - idx_t* labels, - const IVFSearchParameters* params, - idx_t nprobe, - const BitsetView bitset) { - const int nBuckets = ivf.d + 1; - std::vector all_counters(nx * nBuckets, 0); - std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); - - nprobe = params ? params->nprobe : nprobe; - nprobe = std::min((idx_t)ivf.nlist, nprobe); - idx_t max_codes = params ? params->max_codes : ivf.max_codes; - - std::vector> cs; - for (size_t i = 0; i < nx; ++i) { - cs.push_back(HCounterState( - all_counters.data() + i * nBuckets, - all_ids_per_dis.get() + i * nBuckets * k, - x + i * ivf.code_size, - ivf.d, - k)); - } - - size_t nlistv = 0, ndis = 0; - -#pragma omp parallel for reduction(+ : nlistv, ndis) - for (int64_t i = 0; i < nx; i++) { - const idx_t* keysi = keys + i * nprobe; - HCounterState& csi = cs[i]; - - size_t nscan = 0; - - for (size_t ik = 0; ik < nprobe; ik++) { - idx_t key = keysi[ik]; /* select the list */ - if (key < 0) { - // not enough centroids for multiprobe - continue; - } - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf.nlist, - "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", - key, - ik, - ivf.nlist); - - nlistv++; - size_t list_size = ivf.invlists->list_size(key); - InvertedLists::ScopedCodes scodes(ivf.invlists, key); - const uint8_t* list_vecs = scodes.get(); - const Index::idx_t* ids = - store_pairs ? nullptr : ivf.invlists->get_ids(key); - - for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - const uint8_t* yj = list_vecs + ivf.code_size * j; - - idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; - csi.update_counter(yj, id); - } - } - if (ids) - ivf.invlists->release_ids(key, ids); - - nscan += list_size; - if (max_codes && nscan >= max_codes) - break; - } - ndis += nscan; - - int nres = 0; - for (int b = 0; b < nBuckets && nres < k; b++) { - for (int l = 0; l < csi.counters[b] && nres < k; l++) { - labels[i * k + nres] = csi.ids_per_dis[b * k + l]; - distances[i * k + nres] = b; - nres++; - } - } - while (nres < k) { - labels[i * k + nres] = -1; - distances[i * k + nres] = std::numeric_limits::max(); - ++nres; - } - } - - indexIVF_stats.nq += nx; - indexIVF_stats.nlist += nlistv; - indexIVF_stats.ndis += ndis; -} - -template -void search_knn_hamming_count_1( - const IndexBinaryIVF& ivf, - size_t nx, - const uint8_t* x, - const idx_t* keys, - int k, - int32_t* distances, - idx_t* labels, - const IVFSearchParameters* params, - const size_t nprobe, - const BitsetView bitset) { - switch (ivf.code_size) { -#define HANDLE_CS(cs) \ - case cs: \ - search_knn_hamming_count_thread_safe< \ - HammingComputer##cs, \ - store_pairs>( \ - ivf, \ - nx, \ - x, \ - keys, \ - k, \ - distances, \ - labels, \ - params, \ - nprobe, \ - bitset); \ - break; - HANDLE_CS(4); - HANDLE_CS(8); - HANDLE_CS(16); - HANDLE_CS(20); - HANDLE_CS(32); - HANDLE_CS(64); -#undef HANDLE_CS - default: - search_knn_hamming_count_thread_safe< - HammingComputerDefault, - store_pairs>( - ivf, - nx, - x, - keys, - k, - distances, - labels, - params, - nprobe, - bitset); - break; - } -} - -} // namespace - -void IndexBinaryIVF::search_preassigned_thread_safe( - idx_t n, - const uint8_t* x, - idx_t k, - const idx_t* idx, - const int32_t* coarse_dis, - int32_t* distances, - idx_t* labels, - bool store_pairs, - const IVFSearchParameters* params, - const size_t nprobe, - const BitsetView bitset) const { - if (metric_type == METRIC_Jaccard) { - if (use_heap) { - float* D = new float[k * n]; - float* c_dis = new float[n * nprobe]; - memcpy(c_dis, coarse_dis, sizeof(float) * n * nprobe); - search_knn_binary_dis_heap_thread_safe( - *this, - n, - x, - k, - idx, - c_dis, - D, - labels, - store_pairs, - params, - nprobe, - bitset); - memcpy(distances, D, sizeof(float) * n * k); - delete[] D; - delete[] c_dis; - } else { - // not implemented - } - } else if ( - metric_type == METRIC_Substructure || - metric_type == METRIC_Superstructure) { - // unsupported - } else { - if (use_heap) { - search_knn_hamming_heap_thread_safe( - *this, - n, - x, - k, - idx, - coarse_dis, - distances, - labels, - store_pairs, - params, - nprobe, - bitset); - } else { - if (store_pairs) { - search_knn_hamming_count_1( - *this, - n, - x, - idx, - k, - distances, - labels, - params, - nprobe, - bitset); - } else { - search_knn_hamming_count_1( - *this, - n, - x, - idx, - k, - distances, - labels, - params, - nprobe, - bitset); - } - } - } -} - -void IndexBinaryIVF::range_search_thread_safe( - idx_t n, - const uint8_t* x, - float radius, - RangeSearchResult* res, - size_t nprobe, - const BitsetView bitset) const { - nprobe = std::min(nlist, nprobe); - std::unique_ptr idx(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new int32_t[n * nprobe]); - - double t0 = getmillisecs(); - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); - indexIVF_stats.quantization_time += getmillisecs() - t0; - - t0 = getmillisecs(); - invlists->prefetch_lists(idx.get(), n * nprobe); - range_search_preassigned_thread_safe( - n, x, radius, idx.get(), coarse_dis.get(), res, nprobe, bitset); - indexIVF_stats.search_time += getmillisecs() - t0; -} - 
-void IndexBinaryIVF::range_search_preassigned_thread_safe( - idx_t n, - const uint8_t* x, - float radius, - const idx_t* assign, - const int32_t* centroid_dis, - RangeSearchResult* res, - size_t nprobe, - const BitsetView bitset) const { - nprobe = std::min(nlist, nprobe); - bool store_pairs = false; - size_t nlistv = 0, ndis = 0; - - std::vector all_pres(omp_get_max_threads()); - -#pragma omp parallel reduction(+ : nlistv, ndis) - { - RangeSearchPartialResult pres(res); - std::unique_ptr scanner( - get_InvertedListScanner(store_pairs)); - FAISS_THROW_IF_NOT(scanner.get()); - - all_pres[omp_get_thread_num()] = &pres; - - auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult& qres) { - idx_t key = assign[i * nprobe + ik]; /* select the list */ - if (key < 0) - return; - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)nlist, - "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", - key, - ik, - nlist); - const size_t list_size = invlists->list_size(key); - - if (list_size == 0) - return; - - InvertedLists::ScopedCodes scodes(invlists, key); - InvertedLists::ScopedIds ids(invlists, key); - - scanner->set_list(key, assign[i * nprobe + ik]); - nlistv++; - ndis += list_size; - scanner->scan_codes_range( - list_size, scodes.get(), ids.get(), radius, qres, bitset); - }; - -#pragma omp for - for (idx_t i = 0; i < n; i++) { - scanner->set_query(x + i * code_size); - - RangeQueryResult& qres = pres.new_result(i); - size_t prev_nres = qres.nres; - - for (size_t ik = 0; ik < nprobe; ik++) { - scan_list_func(i, ik, qres); - if (qres.nres == prev_nres) break; - prev_nres = qres.nres; - } - } - - pres.finalize(); - } - indexIVF_stats.nq += n; - indexIVF_stats.nlist += nlistv; - indexIVF_stats.ndis += ndis; -} - -} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexFastScan.cpp b/thirdparty/faiss/faiss/IndexFastScan.cpp new file mode 100644 index 000000000..ef8b0f847 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexFastScan.cpp @@ -0,0 +1,629 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { + +using namespace simd_result_handlers; + +inline size_t roundup(size_t a, size_t b) { + return (a + b - 1) / b * b; +} + +void IndexFastScan::init_fastscan( + int d, + size_t M, + size_t nbits, + MetricType metric, + int bbs) { + FAISS_THROW_IF_NOT(nbits == 4); + FAISS_THROW_IF_NOT(bbs % 32 == 0); + this->d = d; + this->M = M; + this->nbits = nbits; + this->metric_type = metric; + this->bbs = bbs; + ksub = (1 << nbits); + + code_size = (M * nbits + 7) / 8; + ntotal = ntotal2 = 0; + M2 = roundup(M, 2); + is_trained = false; +} + +IndexFastScan::IndexFastScan() + : bbs(0), M(0), code_size(0), ntotal2(0), M2(0) {} + +void IndexFastScan::reset() { + codes.resize(0); + ntotal = 0; +} + +void IndexFastScan::add(idx_t n, const float* x) { + FAISS_THROW_IF_NOT(is_trained); + + // do some blocking to avoid excessive allocs + constexpr idx_t bs = 65536; + if (n > bs) { + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min(n, i0 + bs); + if (verbose) { + printf("IndexFastScan::add %zd/%zd\n", size_t(i1), size_t(n)); + } + add(i1 - i0, x + i0 * d); + } + return; + } + InterruptCallback::check(); + + AlignedTable tmp_codes(n * code_size); + compute_codes(tmp_codes.get(), n, x); + + ntotal2 = roundup(ntotal + n, bbs); + size_t new_size = ntotal2 * M2 / 2; // assume nbits = 4 + size_t old_size = codes.size(); + if (new_size > old_size) { + codes.resize(new_size); + memset(codes.get() + old_size, 0, new_size - old_size); + } + + pq4_pack_codes_range( + tmp_codes.get(), M, ntotal, ntotal + n, bbs, M2, codes.get()); + + ntotal += n; +} + +CodePacker* IndexFastScan::get_CodePacker() const { + return new CodePackerPQ4(M, bbs); +} + +size_t IndexFastScan::remove_ids(const IDSelector& sel) { + idx_t j = 0; + std::vector buffer(code_size); + CodePackerPQ4 packer(M, bbs); + for (idx_t i = 0; i < ntotal; i++) { + if (sel.is_member(i)) { + // should be removed + } else { + if (i > j) { + packer.unpack_1(codes.data(), i, buffer.data()); + packer.pack_1(buffer.data(), j, codes.data()); + } + j++; + } + } + size_t nremove = ntotal - j; + if (nremove > 0) { + ntotal = j; + ntotal2 = roundup(ntotal, bbs); + size_t new_size = ntotal2 * M2 / 2; + codes.resize(new_size); + } + return nremove; +} + +void IndexFastScan::check_compatible_for_merge(const Index& otherIndex) const { + const IndexFastScan* other = + dynamic_cast(&otherIndex); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->M == M); + FAISS_THROW_IF_NOT(other->bbs == bbs); + FAISS_THROW_IF_NOT(other->d == d); + FAISS_THROW_IF_NOT(other->code_size == code_size); + FAISS_THROW_IF_NOT_MSG( + typeid(*this) == typeid(*other), + "can only merge indexes of the same type"); +} + +void IndexFastScan::merge_from(Index& otherIndex, idx_t add_id) { + check_compatible_for_merge(otherIndex); + IndexFastScan* other = static_cast(&otherIndex); + ntotal2 = roundup(ntotal + other->ntotal, bbs); + codes.resize(ntotal2 * M2 / 2); + std::vector buffer(code_size); + CodePackerPQ4 packer(M, bbs); + + for (int i = 0; i < other->ntotal; i++) { + packer.unpack_1(other->codes.data(), i, buffer.data()); + packer.pack_1(buffer.data(), ntotal + i, codes.data()); + } + ntotal += other->ntotal; + other->reset(); +} + +namespace { + +template +void estimators_from_tables_generic( + const IndexFastScan& index, + const uint8_t* codes, + size_t ncodes, + const dis_t* dis_table, 
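The block bookkeeping in init_fastscan() and add() above is easier to follow with concrete numbers. A small standalone sketch (the values M = 8, nbits = 4, bbs = 32 and ntotal = 100 are assumptions chosen only for illustration):

    #include <cstddef>
    #include <cstdio>

    static size_t roundup(size_t a, size_t b) {
        return (a + b - 1) / b * b;
    }

    int main() {
        size_t M = 8, nbits = 4, bbs = 32;         // assumed build parameters
        size_t code_size = (M * nbits + 7) / 8;    // 4 bytes per unpacked code
        size_t M2 = roundup(M, 2);                 // sub-quantizers padded to even
        size_t ntotal = 100;                       // vectors added so far
        size_t ntotal2 = roundup(ntotal, bbs);     // padded to a full block: 128
        size_t packed_bytes = ntotal2 * M2 / 2;    // 4-bit codes -> M2 / 2 bytes each
        printf("code_size=%zu M2=%zu ntotal2=%zu packed=%zu\n",
               code_size, M2, ntotal2, packed_bytes);   // 4 8 128 512
        return 0;
    }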
+ size_t k, + typename C::T* heap_dis, + int64_t* heap_ids, + const Scaler& scaler) { + using accu_t = typename C::T; + + for (size_t j = 0; j < ncodes; ++j) { + BitstringReader bsr(codes + j * index.code_size, index.code_size); + accu_t dis = 0; + const dis_t* dt = dis_table; + for (size_t m = 0; m < index.M - scaler.nscale; m++) { + uint64_t c = bsr.read(index.nbits); + dis += dt[c]; + dt += index.ksub; + } + + for (size_t m = 0; m < scaler.nscale; m++) { + uint64_t c = bsr.read(index.nbits); + dis += scaler.scale_one(dt[c]); + dt += index.ksub; + } + + if (C::cmp(heap_dis[0], dis)) { + heap_pop(k, heap_dis, heap_ids); + heap_push(k, heap_dis, heap_ids, dis, j); + } + } +} + +} // anonymous namespace + +using namespace quantize_lut; + +void IndexFastScan::compute_quantized_LUT( + idx_t n, + const float* x, + uint8_t* lut, + float* normalizers) const { + size_t dim12 = ksub * M; + std::unique_ptr dis_tables(new float[n * dim12]); + compute_float_LUT(dis_tables.get(), n, x); + + for (uint64_t i = 0; i < n; i++) { + round_uint8_per_column( + dis_tables.get() + i * dim12, + M, + ksub, + &normalizers[2 * i], + &normalizers[2 * i + 1]); + } + + for (uint64_t i = 0; i < n; i++) { + const float* t_in = dis_tables.get() + i * dim12; + uint8_t* t_out = lut + i * M2 * ksub; + + for (int j = 0; j < dim12; j++) { + t_out[j] = int(t_in[j]); + } + memset(t_out + dim12, 0, (M2 - M) * ksub); + } +} + +/****************************************************************************** + * Search driver routine + ******************************************************************************/ + +void IndexFastScan::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + FAISS_THROW_IF_NOT(k > 0); + + DummyScaler scaler; + if (metric_type == METRIC_L2) { + search_dispatch_implem(n, x, k, distances, labels, scaler); + } else { + search_dispatch_implem(n, x, k, distances, labels, scaler); + } +} + +template +void IndexFastScan::search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler) const { + using Cfloat = typename std::conditional< + is_max, + CMax, + CMin>::type; + + using C = typename std:: + conditional, CMin>::type; + + if (n == 0) { + return; + } + + // actual implementation used + int impl = implem; + + if (impl == 0) { + if (bbs == 32) { + impl = 12; + } else { + impl = 14; + } + if (k > 20) { + impl++; + } + } + + if (implem == 1) { + FAISS_THROW_MSG("not implemented"); + } else if (implem == 2 || implem == 3 || implem == 4) { + FAISS_THROW_IF_NOT(orig_codes != nullptr); + search_implem_234(n, x, k, distances, labels, scaler); + } else if (impl >= 12 && impl <= 15) { + FAISS_THROW_IF_NOT(ntotal < INT_MAX); + int nt = std::min(omp_get_max_threads(), int(n)); + if (nt < 2) { + if (impl == 12 || impl == 13) { + search_implem_12(n, x, k, distances, labels, impl, scaler); + } else { + search_implem_14(n, x, k, distances, labels, impl, scaler); + } + } else { + // explicitly slice over threads +#pragma omp parallel for num_threads(nt) + for (int slice = 0; slice < nt; slice++) { + idx_t i0 = n * slice / nt; + idx_t i1 = n * (slice + 1) / nt; + float* dis_i = distances + i0 * k; + idx_t* lab_i = labels + i0 * k; + if (impl == 12 || impl == 13) { + search_implem_12( + i1 - i0, x + i0 * d, k, dis_i, lab_i, impl, scaler); + } else { + search_implem_14( + i1 - i0, x + i0 * d, k, dis_i, lab_i, 
impl, scaler); + } + } + } + } else { + FAISS_THROW_FMT("invalid implem %d impl=%d", implem, impl); + } +} + +template +void IndexFastScan::search_implem_234( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler) const { + FAISS_THROW_IF_NOT(implem == 2 || implem == 3 || implem == 4); + + const size_t dim12 = ksub * M; + std::unique_ptr dis_tables(new float[n * dim12]); + compute_float_LUT(dis_tables.get(), n, x); + + std::vector normalizers(n * 2); + + if (implem == 2) { + // default float + } else if (implem == 3 || implem == 4) { + for (uint64_t i = 0; i < n; i++) { + round_uint8_per_column( + dis_tables.get() + i * dim12, + M, + ksub, + &normalizers[2 * i], + &normalizers[2 * i + 1]); + } + } + +#pragma omp parallel for if (n > 1000) + for (int64_t i = 0; i < n; i++) { + int64_t* heap_ids = labels + i * k; + float* heap_dis = distances + i * k; + + heap_heapify(k, heap_dis, heap_ids); + + estimators_from_tables_generic( + *this, + orig_codes, + ntotal, + dis_tables.get() + i * dim12, + k, + heap_dis, + heap_ids, + scaler); + + heap_reorder(k, heap_dis, heap_ids); + + if (implem == 4) { + float a = normalizers[2 * i]; + float b = normalizers[2 * i + 1]; + + for (int j = 0; j < k; j++) { + heap_dis[j] = heap_dis[j] / a + b; + } + } + } +} + +template +void IndexFastScan::search_implem_12( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + const Scaler& scaler) const { + FAISS_THROW_IF_NOT(bbs == 32); + + // handle qbs2 blocking by recursive call + int64_t qbs2 = this->qbs == 0 ? 11 : pq4_qbs_to_nq(this->qbs); + if (n > qbs2) { + for (int64_t i0 = 0; i0 < n; i0 += qbs2) { + int64_t i1 = std::min(i0 + qbs2, n); + search_implem_12( + i1 - i0, + x + d * i0, + k, + distances + i0 * k, + labels + i0 * k, + impl, + scaler); + } + return; + } + + size_t dim12 = ksub * M2; + AlignedTable quantized_dis_tables(n * dim12); + std::unique_ptr normalizers(new float[2 * n]); + + if (skip & 1) { + quantized_dis_tables.clear(); + } else { + compute_quantized_LUT( + n, x, quantized_dis_tables.get(), normalizers.get()); + } + + AlignedTable LUT(n * dim12); + + // block sizes are encoded in qbs, 4 bits at a time + + // caution: we override an object field + int qbs = this->qbs; + + if (n != pq4_qbs_to_nq(qbs)) { + qbs = pq4_preferred_qbs(n); + } + + int LUT_nq = + pq4_pack_LUT_qbs(qbs, M2, quantized_dis_tables.get(), LUT.get()); + FAISS_THROW_IF_NOT(LUT_nq == n); + + if (k == 1) { + SingleResultHandler handler(n, ntotal); + if (skip & 4) { + // pass + } else { + handler.disable = bool(skip & 2); + pq4_accumulate_loop_qbs( + qbs, ntotal2, M2, codes.get(), LUT.get(), handler, scaler); + } + + handler.to_flat_arrays(distances, labels, normalizers.get()); + + } else if (impl == 12) { + std::vector tmp_dis(n * k); + std::vector tmp_ids(n * k); + + if (skip & 4) { + // skip + } else { + HeapHandler handler( + n, tmp_dis.data(), tmp_ids.data(), k, ntotal); + handler.disable = bool(skip & 2); + + pq4_accumulate_loop_qbs( + qbs, ntotal2, M2, codes.get(), LUT.get(), handler, scaler); + + if (!(skip & 8)) { + handler.to_flat_arrays(distances, labels, normalizers.get()); + } + } + + } else { // impl == 13 + + ReservoirHandler handler(n, ntotal, k, 2 * k); + handler.disable = bool(skip & 2); + + if (skip & 4) { + // skip + } else { + pq4_accumulate_loop_qbs( + qbs, ntotal2, M2, codes.get(), LUT.get(), handler, scaler); + } + + if (!(skip & 8)) { + handler.to_flat_arrays(distances, labels, normalizers.get()); + } + + // 
FastScan_stats.t0 += handler.times[0]; + // FastScan_stats.t1 += handler.times[1]; + // FastScan_stats.t2 += handler.times[2]; + // FastScan_stats.t3 += handler.times[3]; + } +} + +// FastScanStats FastScan_stats; + +template +void IndexFastScan::search_implem_14( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + const Scaler& scaler) const { + FAISS_THROW_IF_NOT(bbs % 32 == 0); + + int qbs2 = qbs == 0 ? 4 : qbs; + + // handle qbs2 blocking by recursive call + if (n > qbs2) { + for (int64_t i0 = 0; i0 < n; i0 += qbs2) { + int64_t i1 = std::min(i0 + qbs2, n); + search_implem_14( + i1 - i0, + x + d * i0, + k, + distances + i0 * k, + labels + i0 * k, + impl, + scaler); + } + return; + } + + size_t dim12 = ksub * M2; + AlignedTable quantized_dis_tables(n * dim12); + std::unique_ptr normalizers(new float[2 * n]); + + if (skip & 1) { + quantized_dis_tables.clear(); + } else { + compute_quantized_LUT( + n, x, quantized_dis_tables.get(), normalizers.get()); + } + + AlignedTable LUT(n * dim12); + pq4_pack_LUT(n, M2, quantized_dis_tables.get(), LUT.get()); + + if (k == 1) { + SingleResultHandler handler(n, ntotal); + if (skip & 4) { + // pass + } else { + handler.disable = bool(skip & 2); + pq4_accumulate_loop( + n, + ntotal2, + bbs, + M2, + codes.get(), + LUT.get(), + handler, + scaler); + } + handler.to_flat_arrays(distances, labels, normalizers.get()); + + } else if (impl == 14) { + std::vector tmp_dis(n * k); + std::vector tmp_ids(n * k); + + if (skip & 4) { + // skip + } else if (k > 1) { + HeapHandler handler( + n, tmp_dis.data(), tmp_ids.data(), k, ntotal); + handler.disable = bool(skip & 2); + + pq4_accumulate_loop( + n, + ntotal2, + bbs, + M2, + codes.get(), + LUT.get(), + handler, + scaler); + + if (!(skip & 8)) { + handler.to_flat_arrays(distances, labels, normalizers.get()); + } + } + + } else { // impl == 15 + + ReservoirHandler handler(n, ntotal, k, 2 * k); + handler.disable = bool(skip & 2); + + if (skip & 4) { + // skip + } else { + pq4_accumulate_loop( + n, + ntotal2, + bbs, + M2, + codes.get(), + LUT.get(), + handler, + scaler); + } + + if (!(skip & 8)) { + handler.to_flat_arrays(distances, labels, normalizers.get()); + } + } +} + +template void IndexFastScan::search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const NormTableScaler& scaler) const; + +template void IndexFastScan::search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const NormTableScaler& scaler) const; + +void IndexFastScan::reconstruct(idx_t key, float* recons) const { + std::vector code(code_size, 0); + BitstringWriter bsw(code.data(), code_size); + for (size_t m = 0; m < M; m++) { + uint8_t c = pq4_get_packed_element(codes.data(), bbs, M2, key, m); + bsw.write(c, nbits); + } + sa_decode(1, code.data(), recons); +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexFastScan.h b/thirdparty/faiss/faiss/IndexFastScan.h new file mode 100644 index 000000000..28b73adb3 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexFastScan.h @@ -0,0 +1,152 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { + +struct CodePacker; + +/** Fast scan version of IndexPQ and IndexAQ. Works for 4-bit PQ and AQ for now. 
+ * + * The codes are not stored sequentially but grouped in blocks of size bbs. + * This makes it possible to compute distances quickly with SIMD instructions. + * The trailing codes (padding codes that are added to complete the last code) + * are garbage. + * + * Implementations: + * 12: blocked loop with internal loop on Q with qbs + * 13: same with reservoir accumulator to store results + * 14: no qbs with heap accumulator + * 15: no qbs with reservoir accumulator + */ +struct IndexFastScan : Index { + // implementation to select + int implem = 0; + // skip some parts of the computation (for timing) + int skip = 0; + + // size of the kernel + int bbs; // set at build time + int qbs = 0; // query block size 0 = use default + + // vector quantizer + size_t M; + size_t nbits; + size_t ksub; + size_t code_size; + + // packed version of the codes + size_t ntotal2; + size_t M2; + + AlignedTable codes; + + // this is for testing purposes only + // (set when initialized by IndexPQ or IndexAQ) + const uint8_t* orig_codes = nullptr; + + void init_fastscan( + int d, + size_t M, + size_t nbits, + MetricType metric, + int bbs); + + IndexFastScan(); + + void reset() override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void add(idx_t n, const float* x) override; + + virtual void compute_codes(uint8_t* codes, idx_t n, const float* x) + const = 0; + + virtual void compute_float_LUT(float* lut, idx_t n, const float* x) + const = 0; + + // called by search function + void compute_quantized_LUT( + idx_t n, + const float* x, + uint8_t* lut, + float* normalizers) const; + + template + void search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler) const; + + template + void search_implem_234( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler) const; + + template + void search_implem_12( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + const Scaler& scaler) const; + + template + void search_implem_14( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + const Scaler& scaler) const; + + void reconstruct(idx_t key, float* recons) const override; + size_t remove_ids(const IDSelector& sel) override; + + CodePacker* get_CodePacker() const; + + void merge_from(Index& otherIndex, idx_t add_id = 0) override; + void check_compatible_for_merge(const Index& otherIndex) const override; +}; + +// // todo aguzhva: removed in https://github.com/zilliztech/knowhere/pull/180, +// // but commented out here +// +// struct FastScanStats { +// uint64_t t0, t1, t2, t3; +// FastScanStats() { +// reset(); +// } +// void reset() { +// memset(this, 0, sizeof(*this)); +// } +// }; +// +// FAISS_API extern FastScanStats FastScan_stats; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexFlat.cpp b/thirdparty/faiss/faiss/IndexFlat.cpp index fae29e9f7..e2a150f12 100644 --- a/thirdparty/faiss/faiss/IndexFlat.cpp +++ b/thirdparty/faiss/faiss/IndexFlat.cpp @@ -7,14 +7,16 @@ // -*- c++ -*- +#include #include -#include #include #include #include #include #include +#include +#include #include #include @@ -47,60 +49,30 @@ void IndexFlat::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + IDSelector* sel = params ? 
params->sel : nullptr; FAISS_THROW_IF_NOT(k > 0); // we see the distances and labels as heaps - if (metric_type == METRIC_INNER_PRODUCT) { float_minheap_array_t res = {size_t(n), size_t(k), labels, distances}; if (is_cosine) { - knn_cosine(x, get_xb(), get_norms(), d, n, ntotal, &res, bitset); + knn_cosine(x, get_xb(), get_norms(), d, n, ntotal, &res, sel); } else { - knn_inner_product(x, get_xb(), d, n, ntotal, &res, bitset); + knn_inner_product(x, get_xb(), d, n, ntotal, &res, sel); } } else if (metric_type == METRIC_L2) { float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; - knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, bitset); - } else if (metric_type == METRIC_Jaccard) { - float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; - knn_jaccard(x, get_xb(), d, n, ntotal, &res, bitset); + knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, sel); + } else if (is_similarity_metric(metric_type)) { + float_minheap_array_t res = {size_t(n), size_t(k), labels, distances}; + knn_extra_metrics( + x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); } else { + FAISS_THROW_IF_NOT(!sel); float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; knn_extra_metrics( - x, - get_xb(), - d, - n, - ntotal, - metric_type, - metric_arg, - &res, - bitset); - } -} - -void IndexFlat::assign(idx_t n, const float* x, idx_t* labels, float* distances) - const { - // usually used in IVF k-means algorithm - float* dis_inner = (distances == nullptr) ? new float[n] : distances; - switch (metric_type) { - case METRIC_INNER_PRODUCT: - case METRIC_L2: { - // ignore the metric_type, both use L2 - elkan_L2_sse(x, get_xb(), d, n, ntotal, labels, dis_inner); - break; - } - default: { - // binary metrics - // There may be something wrong, but maintain the original logic - // now. - Index::assign(n, x, labels, dis_inner); - break; - } - } - if (distances == nullptr) { - delete[] dis_inner; + x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); } } @@ -109,20 +81,21 @@ void IndexFlat::range_search( const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { + const SearchParameters* params) const { + IDSelector* sel = params ? 
params->sel : nullptr; + switch (metric_type) { case METRIC_INNER_PRODUCT: if (is_cosine) { range_search_cosine(x, get_xb(), get_norms(), d, n, ntotal, - radius, result, bitset); + radius, result, sel); } else { range_search_inner_product( - x, get_xb(), d, n, ntotal, radius, result, bitset); + x, get_xb(), d, n, ntotal, radius, result, sel); } break; case METRIC_L2: - range_search_L2sqr( - x, get_xb(), d, n, ntotal, radius, result, bitset); + range_search_L2sqr(x, get_xb(), d, n, ntotal, radius, result, sel); break; default: FAISS_THROW_MSG("metric type not supported"); @@ -147,18 +120,22 @@ void IndexFlat::compute_distance_subset( } } +size_t IndexFlat::cal_size() const { + return this->sa_code_size(); +} + namespace { -struct FlatL2Dis : DistanceComputer { +struct FlatL2Dis : FlatCodesDistanceComputer { size_t d; - Index::idx_t nb; + idx_t nb; const float* q; const float* b; size_t ndis; - float operator()(idx_t i) override { + float distance_to_code(const uint8_t* code) final { ndis++; - return fvec_L2sqr(q, b + i * d, d); + return fvec_L2sqr(q, (float*)code, d); } float symmetric_dis(idx_t i, idx_t j) override { @@ -166,7 +143,10 @@ struct FlatL2Dis : DistanceComputer { } explicit FlatL2Dis(const IndexFlat& storage, const float* q = nullptr) - : d(storage.d), + : FlatCodesDistanceComputer( + storage.codes.data(), + storage.code_size), + d(storage.d), nb(storage.ntotal), q(q), b(storage.get_xb()), @@ -175,26 +155,62 @@ struct FlatL2Dis : DistanceComputer { void set_query(const float* x) override { q = x; } + + // compute four distances + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) final override { + ndis += 4; + + // compute first, assign next + const float* __restrict y0 = + reinterpret_cast(codes + idx0 * code_size); + const float* __restrict y1 = + reinterpret_cast(codes + idx1 * code_size); + const float* __restrict y2 = + reinterpret_cast(codes + idx2 * code_size); + const float* __restrict y3 = + reinterpret_cast(codes + idx3 * code_size); + + float dp0 = 0; + float dp1 = 0; + float dp2 = 0; + float dp3 = 0; + fvec_L2sqr_batch_4(q, y0, y1, y2, y3, d, dp0, dp1, dp2, dp3); + dis0 = dp0; + dis1 = dp1; + dis2 = dp2; + dis3 = dp3; + } }; -struct FlatIPDis : DistanceComputer { +struct FlatIPDis : FlatCodesDistanceComputer { size_t d; - Index::idx_t nb; + idx_t nb; const float* q; const float* b; size_t ndis; - float operator()(idx_t i) override { - ndis++; - return fvec_inner_product(q, b + i * d, d); + float symmetric_dis(idx_t i, idx_t j) final override { + return fvec_inner_product(b + j * d, b + i * d, d); } - float symmetric_dis(idx_t i, idx_t j) override { - return fvec_inner_product(b + j * d, b + i * d, d); + float distance_to_code(const uint8_t* code) final override { + ndis++; + return fvec_inner_product(q, (const float*)code, d); } explicit FlatIPDis(const IndexFlat& storage, const float* q = nullptr) - : d(storage.d), + : FlatCodesDistanceComputer( + storage.codes.data(), + storage.code_size), + d(storage.d), nb(storage.ntotal), q(q), b(storage.get_xb()), @@ -203,11 +219,44 @@ struct FlatIPDis : DistanceComputer { void set_query(const float* x) override { q = x; } + + // compute four distances + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) final override { + ndis += 4; + + // compute first, assign next + const float* __restrict y0 = + 
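After the rewrite above, IndexFlat::search() and range_search() take the optional selector from SearchParameters instead of a BitsetView. A usage sketch with the stock IDSelectorRange (function name and values below are assumed for illustration):

    #include <faiss/IndexFlat.h>
    #include <faiss/impl/IDSelector.h>

    void search_first_half(const faiss::IndexFlat& index, const float* query,
                           faiss::idx_t k, float* distances, faiss::idx_t* labels) {
        // restrict candidates to ids in [0, ntotal / 2)
        faiss::IDSelectorRange sel(0, index.ntotal / 2);
        faiss::SearchParameters params;
        params.sel = &sel;
        index.search(1, query, k, distances, labels, &params);
    }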
reinterpret_cast(codes + idx0 * code_size); + const float* __restrict y1 = + reinterpret_cast(codes + idx1 * code_size); + const float* __restrict y2 = + reinterpret_cast(codes + idx2 * code_size); + const float* __restrict y3 = + reinterpret_cast(codes + idx3 * code_size); + + float dp0 = 0; + float dp1 = 0; + float dp2 = 0; + float dp3 = 0; + fvec_inner_product_batch_4(q, y0, y1, y2, y3, d, dp0, dp1, dp2, dp3); + dis0 = dp0; + dis1 = dp1; + dis2 = dp2; + dis3 = dp3; + } }; } // namespace -DistanceComputer* IndexFlat::get_distance_computer() const { +FlatCodesDistanceComputer* IndexFlat::get_FlatCodesDistanceComputer() const { if (metric_type == METRIC_L2) { return new FlatL2Dis(*this); } else if (metric_type == METRIC_INNER_PRODUCT) { @@ -234,6 +283,131 @@ void IndexFlat::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { } } +/*************************************************** + * IndexFlatL2 + ***************************************************/ + +namespace { +struct FlatL2WithNormsDis : FlatCodesDistanceComputer { + size_t d; + idx_t nb; + const float* q; + const float* b; + size_t ndis; + + const float* l2norms; + float query_l2norm; + + float distance_to_code(const uint8_t* code) final override { + ndis++; + return fvec_L2sqr(q, (float*)code, d); + } + + float operator()(const idx_t i) final override { + const float* __restrict y = + reinterpret_cast(codes + i * code_size); + + prefetch_L2(l2norms + i); + const float dp0 = fvec_inner_product(q, y, d); + return query_l2norm + l2norms[i] - 2 * dp0; + } + + float symmetric_dis(idx_t i, idx_t j) final override { + const float* __restrict yi = + reinterpret_cast(codes + i * code_size); + const float* __restrict yj = + reinterpret_cast(codes + j * code_size); + + prefetch_L2(l2norms + i); + prefetch_L2(l2norms + j); + const float dp0 = fvec_inner_product(yi, yj, d); + return l2norms[i] + l2norms[j] - 2 * dp0; + } + + explicit FlatL2WithNormsDis( + const IndexFlatL2& storage, + const float* q = nullptr) + : FlatCodesDistanceComputer( + storage.codes.data(), + storage.code_size), + d(storage.d), + nb(storage.ntotal), + q(q), + b(storage.get_xb()), + ndis(0), + l2norms(storage.cached_l2norms.data()), + query_l2norm(0) {} + + void set_query(const float* x) override { + q = x; + query_l2norm = fvec_norm_L2sqr(q, d); + } + + // compute four distances + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) final override { + ndis += 4; + + // compute first, assign next + const float* __restrict y0 = + reinterpret_cast(codes + idx0 * code_size); + const float* __restrict y1 = + reinterpret_cast(codes + idx1 * code_size); + const float* __restrict y2 = + reinterpret_cast(codes + idx2 * code_size); + const float* __restrict y3 = + reinterpret_cast(codes + idx3 * code_size); + + prefetch_L2(l2norms + idx0); + prefetch_L2(l2norms + idx1); + prefetch_L2(l2norms + idx2); + prefetch_L2(l2norms + idx3); + + float dp0 = 0; + float dp1 = 0; + float dp2 = 0; + float dp3 = 0; + fvec_inner_product_batch_4(q, y0, y1, y2, y3, d, dp0, dp1, dp2, dp3); + dis0 = query_l2norm + l2norms[idx0] - 2 * dp0; + dis1 = query_l2norm + l2norms[idx1] - 2 * dp1; + dis2 = query_l2norm + l2norms[idx2] - 2 * dp2; + dis3 = query_l2norm + l2norms[idx3] - 2 * dp3; + } +}; + +} // namespace + +void IndexFlatL2::sync_l2norms() { + cached_l2norms.resize(ntotal); + fvec_norms_L2sqr( + cached_l2norms.data(), + reinterpret_cast(codes.data()), + d, + ntotal); +} + +void 
IndexFlatL2::clear_l2norms() { + cached_l2norms.clear(); + cached_l2norms.shrink_to_fit(); +} + +FlatCodesDistanceComputer* IndexFlatL2::get_FlatCodesDistanceComputer() const { + if (metric_type == METRIC_L2) { + if (!cached_l2norms.empty()) { + return new FlatL2WithNormsDis(*this); + } + } + + return IndexFlat::get_FlatCodesDistanceComputer(); +} + /*************************************************** * IndexFlat1D ***************************************************/ @@ -269,14 +443,15 @@ void IndexFlat1D::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT_MSG( perm.size() == ntotal, "Call update_permutation before search"); const float* xb = get_xb(); -#pragma omp parallel for +#pragma omp parallel for if (n > 10000) for (idx_t i = 0; i < n; i++) { float q = x[i]; // query float* D = distances + i * k; @@ -286,6 +461,14 @@ void IndexFlat1D::search( idx_t i0 = 0, i1 = ntotal; idx_t wp = 0; + if (ntotal == 0) { + for (idx_t j = 0; j < k; j++) { + I[j] = -1; + D[j] = HUGE_VAL; + } + goto done; + } + if (xb[perm[i0]] > q) { i1 = 0; goto finish_right; diff --git a/thirdparty/faiss/faiss/IndexFlat.h b/thirdparty/faiss/faiss/IndexFlat.h index d220db2e6..c045f2779 100644 --- a/thirdparty/faiss/faiss/IndexFlat.h +++ b/thirdparty/faiss/faiss/IndexFlat.h @@ -13,7 +13,6 @@ #include #include -#include namespace faiss { @@ -25,6 +24,9 @@ struct IndexFlat : IndexFlatCodes { explicit IndexFlat(idx_t d, MetricType metric = METRIC_L2, bool is_cosine = false); + // Be careful with overriding this function, because + // renormalized x may be used inside. + // Overridden by IndexFlat1D. void add(idx_t n, const float* x) override; void search( @@ -33,17 +35,14 @@ struct IndexFlat : IndexFlatCodes { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; - - void assign(idx_t n, const float* x, idx_t* labels, float* distances) - const override; + const SearchParameters* params = nullptr) const override; void range_search( idx_t n, const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, float* recons) const override; @@ -79,16 +78,14 @@ struct IndexFlat : IndexFlatCodes { IndexFlat() {} - DistanceComputer* get_distance_computer() const override; + FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; /* The stanadlone codec interface (just memcopies in this case) */ void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; - size_t cal_size() { - return this->sa_code_size(); - } + size_t cal_size() const; }; struct IndexFlatIP : IndexFlat { @@ -97,13 +94,27 @@ struct IndexFlatIP : IndexFlat { }; struct IndexFlatL2 : IndexFlat { + // Special cache for L2 norms. + // If this cache is set, then get_distance_computer() returns + // a special version that computes the distance using dot products + // and l2 norms. + std::vector cached_l2norms; + explicit IndexFlatL2(idx_t d) : IndexFlat(d, METRIC_L2) {} IndexFlatL2() {} + + // override for l2 norms cache. 
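The cached norms above are consumed through get_FlatCodesDistanceComputer(), for example when an IndexFlatL2 serves as storage behind a graph index; the brute-force search() path does not use them. A sketch of that flow, assuming FlatCodesDistanceComputer is available from faiss/impl/DistanceComputer.h:

    #include <faiss/IndexFlat.h>
    #include <faiss/impl/DistanceComputer.h>
    #include <memory>

    void demo_cached_norms(int d, size_t n, const float* xb, const float* query) {
        faiss::IndexFlatL2 index(d);
        index.add(n, xb);
        index.sync_l2norms();   // cache ||y||^2 for every stored vector

        // returns the norm-aware computer while the cache is non-empty
        std::unique_ptr<faiss::FlatCodesDistanceComputer> dc(
                index.get_FlatCodesDistanceComputer());
        dc->set_query(query);
        float d0 = (*dc)(0);    // ||q||^2 + ||y_0||^2 - 2 * <q, y_0>
        (void)d0;

        index.clear_l2norms();  // drop the cache when it is no longer needed
    }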
+ FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; + + // compute L2 norms + void sync_l2norms(); + // clear L2 norms + void clear_l2norms(); }; /// optimized version for 1D "vectors". struct IndexFlat1D : IndexFlatL2 { - bool continuous_update; ///< is the permutation updated continuously? + bool continuous_update = true; ///< is the permutation updated continuously? std::vector perm; ///< sorted database indices @@ -124,7 +135,7 @@ struct IndexFlat1D : IndexFlatL2 { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexFlatCodes.cpp b/thirdparty/faiss/faiss/IndexFlatCodes.cpp index c9cb84955..caff90ff9 100644 --- a/thirdparty/faiss/faiss/IndexFlatCodes.cpp +++ b/thirdparty/faiss/faiss/IndexFlatCodes.cpp @@ -8,7 +8,10 @@ #include #include +#include +#include #include +#include namespace faiss { @@ -19,8 +22,11 @@ IndexFlatCodes::IndexFlatCodes() : code_size(0) {} void IndexFlatCodes::add(idx_t n, const float* x) { FAISS_THROW_IF_NOT(is_trained); + if (n == 0) { + return; + } codes.resize((ntotal + n) * code_size); - sa_encode(n, x, &codes[ntotal * code_size]); + sa_encode(n, x, codes.data() + (ntotal * code_size)); ntotal += n; } @@ -64,4 +70,48 @@ void IndexFlatCodes::reconstruct(idx_t key, float* recons) const { reconstruct_n(key, 1, recons); } +FlatCodesDistanceComputer* IndexFlatCodes::get_FlatCodesDistanceComputer() + const { + FAISS_THROW_MSG("not implemented"); +} + +void IndexFlatCodes::check_compatible_for_merge(const Index& otherIndex) const { + // minimal sanity checks + const IndexFlatCodes* other = + dynamic_cast(&otherIndex); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->d == d); + FAISS_THROW_IF_NOT(other->code_size == code_size); + FAISS_THROW_IF_NOT_MSG( + typeid(*this) == typeid(*other), + "can only merge indexes of the same type"); +} + +void IndexFlatCodes::merge_from(Index& otherIndex, idx_t add_id) { + FAISS_THROW_IF_NOT_MSG(add_id == 0, "cannot set ids in FlatCodes index"); + check_compatible_for_merge(otherIndex); + IndexFlatCodes* other = static_cast(&otherIndex); + codes.resize((ntotal + other->ntotal) * code_size); + memcpy(codes.data() + (ntotal * code_size), + other->codes.data(), + other->ntotal * code_size); + ntotal += other->ntotal; + other->reset(); +} + +CodePacker* IndexFlatCodes::get_CodePacker() const { + return new CodePackerFlat(code_size); +} + +void IndexFlatCodes::permute_entries(const idx_t* perm) { + std::vector new_codes(codes.size()); + + for (idx_t i = 0; i < ntotal; i++) { + memcpy(new_codes.data() + i * code_size, + codes.data() + perm[i] * code_size, + code_size); + } + std::swap(codes, new_codes); +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexFlatCodes.h b/thirdparty/faiss/faiss/IndexFlatCodes.h index 174ba46a3..35123ce2b 100644 --- a/thirdparty/faiss/faiss/IndexFlatCodes.h +++ b/thirdparty/faiss/faiss/IndexFlatCodes.h @@ -10,10 +10,13 @@ #pragma once #include +#include #include namespace faiss { +struct CodePacker; + /** Index that encodes all vectors as fixed-size codes (size code_size). Storage * is in the codes vector */ struct IndexFlatCodes : Index { @@ -40,10 +43,27 @@ struct IndexFlatCodes : Index { size_t sa_code_size() const override; - /** remove some ids. NB that Because of the structure of the - * indexing structure, the semantics of this operation are + /** remove some ids. 
NB that because of the structure of the + * index, the semantics of this operation are * different from the usual ones: the new ids are shifted */ size_t remove_ids(const IDSelector& sel) override; + + /** a FlatCodesDistanceComputer offers a distance_to_code method */ + virtual FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const; + + DistanceComputer* get_distance_computer() const override { + return get_FlatCodesDistanceComputer(); + } + + // returns a new instance of a CodePacker + CodePacker* get_CodePacker() const; + + void check_compatible_for_merge(const Index& otherIndex) const override; + + virtual void merge_from(Index& otherIndex, idx_t add_id = 0) override; + + // permute_entries. perm of size ntotal maps new to old positions + void permute_entries(const idx_t* perm); }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexFlatElkan.cpp b/thirdparty/faiss/faiss/IndexFlatElkan.cpp new file mode 100644 index 000000000..b9c48e271 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexFlatElkan.cpp @@ -0,0 +1,79 @@ +// Copyright (C) 2019-2023 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#include + +#include + +#include +#include + +namespace faiss { + +IndexFlatElkan::IndexFlatElkan(idx_t d, MetricType metric, bool is_cosine, bool use_elkan) + : IndexFlat(d, metric, is_cosine) { + this->use_elkan = use_elkan; +} + +void IndexFlatElkan::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + // usually used in IVF k-means algorithm + + FAISS_THROW_IF_NOT_MSG( + k == 1, + "this index requires k == 1 in a search() call." + ); + FAISS_THROW_IF_NOT_MSG( + params == nullptr, + "search params not supported for this index" + ); + + float* dis_inner = distances; + std::unique_ptr dis_inner_deleter = nullptr; + if (distances == nullptr) { + dis_inner_deleter = std::make_unique(n); + dis_inner = dis_inner_deleter.get(); + } + + switch (metric_type) { + case METRIC_INNER_PRODUCT: + case METRIC_L2: { + // ignore the metric_type, both use L2 + if (use_elkan) { + // use elkan + elkan_L2_sse(x, get_xb(), d, n, ntotal, labels, dis_inner); + } + else { + // use L2 search. The same code as in IndexFlat::search() for L2. + IDSelector* sel = params ? params->sel : nullptr; + + float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; + knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, sel); + } + + break; + } + default: { + // binary metrics + // There may be something wrong, but maintain the original logic + // now. + IndexFlat::search(n, x, k, dis_inner, labels, params); + break; + } + } +} + +} diff --git a/thirdparty/faiss/faiss/IndexFlatElkan.h b/thirdparty/faiss/faiss/IndexFlatElkan.h new file mode 100644 index 000000000..555f2d46d --- /dev/null +++ b/thirdparty/faiss/faiss/IndexFlatElkan.h @@ -0,0 +1,45 @@ +// Copyright (C) 2019-2023 Zilliz. All rights reserved. 
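A short usage sketch of the merge_from() / check_compatible_for_merge() pair declared above for flat-code indexes (assumed example, not taken from this patch):

    #include <faiss/IndexFlat.h>

    void merge_shards(faiss::IndexFlat& dst, faiss::IndexFlat& src) {
        // throws unless dimension, code size and concrete type all match
        dst.check_compatible_for_merge(src);
        // add_id must stay 0 for flat-code indexes; codes are appended
        // and src is left empty afterwards
        dst.merge_from(src, /*add_id=*/0);
    }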
+// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations under the License. + +#pragma once + +#include + +namespace faiss { + +// This is a special modification of IndexFlat that does two things. +// 1. It allows to use elkan algorithm for the search. It is slower, +// sometimes a magnitude slower than the regular IndexFlat::search() +// implementation, but sometimes the trained index produces a better +// recall rate. +// 2. It always uses L2 distance for the IP / L2 metrics in order to +// support an early stop strategy from Clustering.cpp. Early stop +// strategy is a Knowhere-specific feature. +// +// This index is intended to be used in Knowhere's ivf.cc file ONLY!!! +// +// Elkan algo was introduced into Knowhere in #2178, #2180 and #2258. +struct IndexFlatElkan : IndexFlat { + bool use_elkan = true; + + explicit IndexFlatElkan(idx_t d, MetricType metric = METRIC_L2, + bool is_cosine = false, bool use_elkan = true); + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + +} \ No newline at end of file diff --git a/thirdparty/faiss/faiss/IndexHNSW.cpp b/thirdparty/faiss/faiss/IndexHNSW.cpp index 46d3c9de8..7e9458a41 100644 --- a/thirdparty/faiss/faiss/IndexHNSW.cpp +++ b/thirdparty/faiss/faiss/IndexHNSW.cpp @@ -20,12 +20,9 @@ #include #include -#include #include #include - -#ifdef __SSE__ -#endif +#include #include #include @@ -36,6 +33,7 @@ #include #include #include +#include extern "C" { @@ -59,7 +57,6 @@ int sgemm_( namespace faiss { -using idx_t = Index::idx_t; using MinimaxHeap = HNSW::MinimaxHeap; using storage_idx_t = HNSW::storage_idx_t; using NodeDistFarther = HNSW::NodeDistFarther; @@ -102,7 +99,7 @@ struct NegativeDistanceComputer : DistanceComputer { }; DistanceComputer* storage_distance_computer(const Index* storage) { - if (storage->metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); } else { return storage->get_distance_computer(); @@ -203,7 +200,10 @@ void hnsw_add_vertices( verbose && omp_get_thread_num() == 0 ? 0 : -1; size_t counter = 0; -#pragma omp for schedule(dynamic) + // here we should do schedule(dynamic) but this segfaults for + // some versions of LLVM. 
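As a rough illustration of the intended call pattern described in the comment above (an assumed, Knowhere-internal style of usage; the header is presumed to be installed as faiss/IndexFlatElkan.h):

    #include <faiss/Clustering.h>
    #include <faiss/IndexFlatElkan.h>

    void train_coarse_quantizer(int d, int nlist, size_t n, const float* x) {
        faiss::ClusteringParameters cp;   // default clustering parameters
        faiss::Clustering clus(d, nlist, cp);
        // use_elkan = true routes assignment through elkan_L2_sse(); slower,
        // but it may yield a better-trained coarse quantizer
        faiss::IndexFlatElkan assign_index(d, faiss::METRIC_L2,
                                           /*is_cosine=*/false, /*use_elkan=*/true);
        clus.train(n, x, assign_index);
        // clus.centroids now holds nlist * d floats
    }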
The performance impact should not be + // too large when (i1 - i0) / num_threads >> 1 +#pragma omp for schedule(static) for (int i = i0; i < i1; i++) { storage_idx_t pt_id = order[i]; dis->set_query(x + (pt_id - n0) * d); @@ -220,7 +220,6 @@ void hnsw_add_vertices( printf(" %d / %d\r", i - i0, i1 - i0); fflush(stdout); } - if (counter % check_period == 0) { if (InterruptCallback::is_interrupted()) { interrupt = true; @@ -252,18 +251,10 @@ void hnsw_add_vertices( **************************************************************/ IndexHNSW::IndexHNSW(int d, int M, MetricType metric) - : Index(d, metric), - hnsw(M), - own_fields(false), - storage(nullptr), - reconstruct_from_neighbors(nullptr) {} + : Index(d, metric), hnsw(M) {} IndexHNSW::IndexHNSW(Index* storage, int M) - : Index(storage->d, storage->metric_type), - hnsw(M), - own_fields(false), - storage(storage), - reconstruct_from_neighbors(nullptr) {} + : Index(storage->d, storage->metric_type), hnsw(M), storage(storage) {} IndexHNSW::~IndexHNSW() { if (own_fields) { @@ -286,16 +277,23 @@ void IndexHNSW::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT_MSG( storage, "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + const SearchParametersHNSW* params = nullptr; + + int efSearch = hnsw.efSearch; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); + efSearch = params->efSearch; + } size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; - idx_t check_period = InterruptCallback::get_period_hint( - hnsw.max_level * d * hnsw.efSearch); + idx_t check_period = + InterruptCallback::get_period_hint(hnsw.max_level * d * efSearch); for (idx_t i0 = 0; i0 < n; i0 += check_period) { idx_t i1 = std::min(i0 + check_period, n); @@ -307,14 +305,14 @@ void IndexHNSW::search( DistanceComputer* dis = storage_distance_computer(storage); ScopeDeleter1 del(dis); -#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) +#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) schedule(guided) for (idx_t i = i0; i < i1; i++) { idx_t* idxi = labels + i * k; float* simi = distances + i * k; dis->set_query(x + i * d); maxheap_heapify(k, simi, idxi); - HNSWStats stats = hnsw.search(*dis, k, idxi, simi, vt); + HNSWStats stats = hnsw.search(*dis, k, idxi, simi, vt, params); n1 += stats.n1; n2 += stats.n2; n3 += stats.n3; @@ -341,7 +339,7 @@ void IndexHNSW::search( InterruptCallback::check(); } - if (metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(metric_type)) { // we need to revert the negated distances for (size_t i = 0; i < k * n; i++) { distances[i] = -distances[i]; @@ -423,16 +421,15 @@ void IndexHNSW::search_level_0( FAISS_THROW_IF_NOT(nprobe > 0); storage_idx_t ntotal = hnsw.levels.size(); - size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; #pragma omp parallel { - DistanceComputer* qdis = storage_distance_computer(storage); - ScopeDeleter1 del(qdis); - + std::unique_ptr qdis( + storage_distance_computer(storage)); + HNSWStats search_stats; VisitedTable vt(ntotal); -#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) +#pragma omp for for (idx_t i = 0; i < n; i++) { idx_t* idxi = labels + i * k; float* simi = distances + i * k; @@ -440,69 +437,24 @@ void IndexHNSW::search_level_0( qdis->set_query(x + i * d); maxheap_heapify(k, simi, idxi); - if (search_type == 1) { - int nres = 0; + hnsw.search_level_0( + *qdis.get(), + k, + 
idxi, + simi, + nprobe, + nearest + i * nprobe, + nearest_d + i * nprobe, + search_type, + search_stats, + vt); - for (int j = 0; j < nprobe; j++) { - storage_idx_t cj = nearest[i * nprobe + j]; - - if (cj < 0) - break; - - if (vt.get(cj)) - continue; - - int candidates_size = std::max(hnsw.efSearch, int(k)); - MinimaxHeap candidates(candidates_size); - - candidates.push(cj, nearest_d[i * nprobe + j]); - - HNSWStats search_stats; - nres = hnsw.search_from_candidates( - *qdis, - k, - idxi, - simi, - candidates, - vt, - search_stats, - 0, - nres); - n1 += search_stats.n1; - n2 += search_stats.n2; - n3 += search_stats.n3; - ndis += search_stats.ndis; - nreorder += search_stats.nreorder; - } - } else if (search_type == 2) { - int candidates_size = std::max(hnsw.efSearch, int(k)); - candidates_size = std::max(candidates_size, nprobe); - - MinimaxHeap candidates(candidates_size); - for (int j = 0; j < nprobe; j++) { - storage_idx_t cj = nearest[i * nprobe + j]; - - if (cj < 0) - break; - candidates.push(cj, nearest_d[i * nprobe + j]); - } - - HNSWStats search_stats; - hnsw.search_from_candidates( - *qdis, k, idxi, simi, candidates, vt, search_stats, 0); - n1 += search_stats.n1; - n2 += search_stats.n2; - n3 += search_stats.n3; - ndis += search_stats.ndis; - nreorder += search_stats.nreorder; - } vt.advance(); - maxheap_reorder(k, simi, idxi); } +#pragma omp critical + { hnsw_stats.combine(search_stats); } } - - hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); } void IndexHNSW::init_level_0_from_knngraph( @@ -655,6 +607,15 @@ void IndexHNSW::link_singletons() { } } +void IndexHNSW::permute_entries(const idx_t* perm) { + // todo aguzhva: permute norms? + auto flat_storage = dynamic_cast(storage); + FAISS_THROW_IF_NOT_MSG( + flat_storage, "don't know how to permute this index"); + flat_storage->permute_entries(perm); + hnsw.permute_entries(perm); +} + /************************************************************** * ReconstructFromNeighbors implementation **************************************************************/ @@ -905,7 +866,10 @@ IndexHNSWFlat::IndexHNSWFlat() { } IndexHNSWFlat::IndexHNSWFlat(int d, int M, MetricType metric) - : IndexHNSW(new IndexFlat(d, metric), M) { + : IndexHNSW( + (metric == METRIC_L2) ? 
new IndexFlatL2(d) + : new IndexFlat(d, metric), + M) { own_fields = true; is_trained = true; } @@ -914,10 +878,10 @@ IndexHNSWFlat::IndexHNSWFlat(int d, int M, MetricType metric) * IndexHNSWPQ implementation **************************************************************/ -IndexHNSWPQ::IndexHNSWPQ() {} +IndexHNSWPQ::IndexHNSWPQ() = default; -IndexHNSWPQ::IndexHNSWPQ(int d, int pq_m, int M) - : IndexHNSW(new IndexPQ(d, pq_m, 8), M) { +IndexHNSWPQ::IndexHNSWPQ(int d, int pq_m, int M, int pq_nbits) + : IndexHNSW(new IndexPQ(d, pq_m, pq_nbits), M) { own_fields = true; is_trained = false; } @@ -933,15 +897,15 @@ void IndexHNSWPQ::train(idx_t n, const float* x) { IndexHNSWSQ::IndexHNSWSQ( int d, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, int M, MetricType metric) : IndexHNSW(new IndexScalarQuantizer(d, qtype, metric), M) { - is_trained = false; + is_trained = this->storage->is_trained; own_fields = true; } -IndexHNSWSQ::IndexHNSWSQ() {} +IndexHNSWSQ::IndexHNSWSQ() = default; /************************************************************** * IndexHNSW2Level implementation @@ -957,7 +921,7 @@ IndexHNSW2Level::IndexHNSW2Level( is_trained = false; } -IndexHNSW2Level::IndexHNSW2Level() {} +IndexHNSW2Level::IndexHNSW2Level() = default; namespace { @@ -1036,8 +1000,10 @@ void IndexHNSW2Level::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); if (dynamic_cast(storage)) { IndexHNSW::search(n, x, k, distances, labels); @@ -1096,74 +1062,37 @@ void IndexHNSW2Level::search( } candidates.clear(); - // copy the upper_beam elements to candidates list - - int search_policy = 2; - - if (search_policy == 1) { - for (int j = 0; j < hnsw.upper_beam && j < k; j++) { - if (idxi[j] < 0) - break; - candidates.push(idxi[j], simi[j]); - // search_from_candidates adds them back - idxi[j] = -1; - simi[j] = HUGE_VAL; - } - - // reorder from sorted to heap - maxheap_heapify(k, simi, idxi, simi, idxi, k); - - HNSWStats search_stats; - hnsw.search_from_candidates( - *dis, - k, - idxi, - simi, - candidates, - vt, - search_stats, - 0, - k); - n1 += search_stats.n1; - n2 += search_stats.n2; - n3 += search_stats.n3; - ndis += search_stats.ndis; - nreorder += search_stats.nreorder; - - vt.advance(); - - } else if (search_policy == 2) { - for (int j = 0; j < hnsw.upper_beam && j < k; j++) { - if (idxi[j] < 0) - break; - candidates.push(idxi[j], simi[j]); - } - // reorder from sorted to heap - maxheap_heapify(k, simi, idxi, simi, idxi, k); - - HNSWStats search_stats; - search_from_candidates_2( - hnsw, - *dis, - k, - idxi, - simi, - candidates, - vt, - search_stats, - 0, - k); - n1 += search_stats.n1; - n2 += search_stats.n2; - n3 += search_stats.n3; - ndis += search_stats.ndis; - nreorder += search_stats.nreorder; - - vt.advance(); - vt.advance(); + for (int j = 0; j < hnsw.upper_beam && j < k; j++) { + if (idxi[j] < 0) + break; + candidates.push(idxi[j], simi[j]); } + // reorder from sorted to heap + maxheap_heapify(k, simi, idxi, simi, idxi, k); + + HNSWStats search_stats; + search_from_candidates_2( + hnsw, + *dis, + k, + idxi, + simi, + candidates, + vt, + search_stats, + 0, + k); + n1 += search_stats.n1; + n2 += search_stats.n2; + n3 += search_stats.n3; + ndis += search_stats.ndis; + nreorder += search_stats.nreorder; + + vt.advance(); + vt.advance(); + maxheap_reorder(k, simi, idxi); } } diff --git 
a/thirdparty/faiss/faiss/IndexHNSW.h b/thirdparty/faiss/faiss/IndexHNSW.h index fce0d8cf5..13855d303 100644 --- a/thirdparty/faiss/faiss/IndexHNSW.h +++ b/thirdparty/faiss/faiss/IndexHNSW.h @@ -22,7 +22,6 @@ namespace faiss { struct IndexHNSW; struct ReconstructFromNeighbors { - typedef Index::idx_t idx_t; typedef HNSW::storage_idx_t storage_idx_t; const IndexHNSW& index; @@ -75,10 +74,10 @@ struct IndexHNSW : Index { HNSW hnsw; // the sequential storage - bool own_fields; - Index* storage; + bool own_fields = false; + Index* storage = nullptr; - ReconstructFromNeighbors* reconstruct_from_neighbors; + ReconstructFromNeighbors* reconstruct_from_neighbors = nullptr; explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2); explicit IndexHNSW(Index* storage, int M = 32); @@ -97,7 +96,7 @@ struct IndexHNSW : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, float* recons) const override; @@ -135,6 +134,8 @@ struct IndexHNSW : Index { void reorder_links(); void link_singletons(); + + void permute_entries(const idx_t* perm); }; /** Flat index topped with with a HNSW structure to access elements @@ -151,7 +152,7 @@ struct IndexHNSWFlat : IndexHNSW { */ struct IndexHNSWPQ : IndexHNSW { IndexHNSWPQ(); - IndexHNSWPQ(int d, int pq_m, int M); + IndexHNSWPQ(int d, int pq_m, int M, int pq_nbits = 8); void train(idx_t n, const float* x) override; }; @@ -162,7 +163,7 @@ struct IndexHNSWSQ : IndexHNSW { IndexHNSWSQ(); IndexHNSWSQ( int d, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, int M, MetricType metric = METRIC_L2); }; @@ -182,7 +183,7 @@ struct IndexHNSW2Level : IndexHNSW { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIDMap.cpp b/thirdparty/faiss/faiss/IndexIDMap.cpp new file mode 100644 index 000000000..9107ad550 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIDMap.cpp @@ -0,0 +1,281 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace faiss { + +namespace { + +// IndexBinary needs to update the code_size when d is set... 
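+// The two overloads below rely on ordinary overload resolution:
+// IndexIDMapTemplate<IndexT> calls sync_d(this), so for a float Index the
+// call is a no-op, while for an IndexBinary it keeps code_size consistent
+// with the dimension (one byte per 8 binary dimensions).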
+ +void sync_d(Index* index) {} + +void sync_d(IndexBinary* index) { + FAISS_THROW_IF_NOT(index->d % 8 == 0); + index->code_size = index->d / 8; +} + +} // anonymous namespace + +/***************************************************** + * IndexIDMap implementation + *******************************************************/ + +template +IndexIDMapTemplate::IndexIDMapTemplate(IndexT* index) : index(index) { + FAISS_THROW_IF_NOT_MSG(index->ntotal == 0, "index must be empty on input"); + this->is_trained = index->is_trained; + this->metric_type = index->metric_type; + this->verbose = index->verbose; + this->d = index->d; + sync_d(this); +} + +template +void IndexIDMapTemplate::add( + idx_t, + const typename IndexT::component_t*) { + FAISS_THROW_MSG( + "add does not make sense with IndexIDMap, " + "use add_with_ids"); +} + +template +void IndexIDMapTemplate::train( + idx_t n, + const typename IndexT::component_t* x) { + index->train(n, x); + this->is_trained = index->is_trained; +} + +template +void IndexIDMapTemplate::reset() { + index->reset(); + id_map.clear(); + this->ntotal = 0; +} + +template +void IndexIDMapTemplate::add_with_ids( + idx_t n, + const typename IndexT::component_t* x, + const idx_t* xids) { + index->add(n, x); + for (idx_t i = 0; i < n; i++) + id_map.push_back(xids[i]); + this->ntotal = index->ntotal; +} + +namespace { + +/// RAII object to reset the IDSelector in the params object +struct ScopedSelChange { + SearchParameters* params = nullptr; + IDSelector* old_sel = nullptr; + + void set(SearchParameters* params, IDSelector* new_sel) { + this->params = params; + old_sel = params->sel; + params->sel = new_sel; + } + ~ScopedSelChange() { + if (params) { + params->sel = old_sel; + } + } +}; + +} // namespace + +template +void IndexIDMapTemplate::search( + idx_t n, + const typename IndexT::component_t* x, + idx_t k, + typename IndexT::distance_t* distances, + idx_t* labels, + const SearchParameters* params) const { + IDSelectorTranslated this_idtrans(this->id_map, nullptr); + ScopedSelChange sel_change; + + if (params && params->sel) { + auto idtrans = dynamic_cast(params->sel); + + if (!idtrans) { + /* + FAISS_THROW_IF_NOT_MSG( + idtrans, + "IndexIDMap requires an IDSelectorTranslated on input"); + */ + // then make an idtrans and force it into the SearchParameters + // (hence the const_cast) + auto params_non_const = const_cast(params); + this_idtrans.sel = params->sel; + sel_change.set(params_non_const, &this_idtrans); + } + } + index->search(n, x, k, distances, labels, params); + idx_t* li = labels; +#pragma omp parallel for + for (idx_t i = 0; i < n * k; i++) { + li[i] = li[i] < 0 ? li[i] : id_map[li[i]]; + } +} + +template +void IndexIDMapTemplate::range_search( + idx_t n, + const typename IndexT::component_t* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + index->range_search(n, x, radius, result); +#pragma omp parallel for + for (idx_t i = 0; i < result->lims[result->nq]; i++) { + result->labels[i] = result->labels[i] < 0 ? 
result->labels[i] + : id_map[result->labels[i]]; + } +} + +template +size_t IndexIDMapTemplate::remove_ids(const IDSelector& sel) { + // remove in sub-index first + IDSelectorTranslated sel2(id_map, &sel); + size_t nremove = index->remove_ids(sel2); + + int64_t j = 0; + for (idx_t i = 0; i < this->ntotal; i++) { + if (sel.is_member(id_map[i])) { + // remove + } else { + id_map[j] = id_map[i]; + j++; + } + } + FAISS_ASSERT(j == index->ntotal); + this->ntotal = j; + id_map.resize(this->ntotal); + return nremove; +} + +template +void IndexIDMapTemplate::check_compatible_for_merge( + const IndexT& otherIndex) const { + auto other = dynamic_cast*>(&otherIndex); + FAISS_THROW_IF_NOT(other); + index->check_compatible_for_merge(*other->index); +} + +template +void IndexIDMapTemplate::merge_from(IndexT& otherIndex, idx_t add_id) { + check_compatible_for_merge(otherIndex); + auto other = static_cast*>(&otherIndex); + index->merge_from(*other->index); + for (size_t i = 0; i < other->id_map.size(); i++) { + id_map.push_back(other->id_map[i] + add_id); + } + other->id_map.resize(0); + this->ntotal = index->ntotal; + other->ntotal = 0; +} + +template +IndexIDMapTemplate::~IndexIDMapTemplate() { + if (own_fields) + delete index; +} + +/***************************************************** + * IndexIDMap2 implementation + *******************************************************/ + +template +IndexIDMap2Template::IndexIDMap2Template(IndexT* index) + : IndexIDMapTemplate(index) {} + +template +void IndexIDMap2Template::add_with_ids( + idx_t n, + const typename IndexT::component_t* x, + const idx_t* xids) { + size_t prev_ntotal = this->ntotal; + IndexIDMapTemplate::add_with_ids(n, x, xids); + for (size_t i = prev_ntotal; i < this->ntotal; i++) { + rev_map[this->id_map[i]] = i; + } +} + +template +void IndexIDMap2Template::check_consistency() const { + FAISS_THROW_IF_NOT(rev_map.size() == this->id_map.size()); + FAISS_THROW_IF_NOT(this->id_map.size() == this->ntotal); + for (size_t i = 0; i < this->ntotal; i++) { + idx_t ii = rev_map.at(this->id_map[i]); + FAISS_THROW_IF_NOT(ii == i); + } +} + +template +void IndexIDMap2Template::merge_from(IndexT& otherIndex, idx_t add_id) { + size_t prev_ntotal = this->ntotal; + IndexIDMapTemplate::merge_from(otherIndex, add_id); + for (size_t i = prev_ntotal; i < this->ntotal; i++) { + rev_map[this->id_map[i]] = i; + } + static_cast&>(otherIndex).rev_map.clear(); +} + +template +void IndexIDMap2Template::construct_rev_map() { + rev_map.clear(); + for (size_t i = 0; i < this->ntotal; i++) { + rev_map[this->id_map[i]] = i; + } +} + +template +size_t IndexIDMap2Template::remove_ids(const IDSelector& sel) { + // This is quite inefficient + size_t nremove = IndexIDMapTemplate::remove_ids(sel); + construct_rev_map(); + return nremove; +} + +template +void IndexIDMap2Template::reconstruct( + idx_t key, + typename IndexT::component_t* recons) const { + try { + this->index->reconstruct(rev_map.at(key), recons); + } catch (const std::out_of_range& e) { + FAISS_THROW_FMT("key %" PRId64 " not found", key); + } +} + +// explicit template instantiations + +template struct IndexIDMapTemplate; +template struct IndexIDMapTemplate; +template struct IndexIDMap2Template; +template struct IndexIDMap2Template; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIDMap.h b/thirdparty/faiss/faiss/IndexIDMap.h new file mode 100644 index 000000000..73c8cdec6 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIDMap.h @@ -0,0 +1,129 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace faiss { + +/** Index that translates search results to ids */ +template +struct IndexIDMapTemplate : IndexT { + using component_t = typename IndexT::component_t; + using distance_t = typename IndexT::distance_t; + + IndexT* index = nullptr; ///! the sub-index + bool own_fields = false; ///! whether pointers are deleted in destructo + std::vector id_map; + + explicit IndexIDMapTemplate(IndexT* index); + + /// @param xids if non-null, ids to store for the vectors (size n) + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) + override; + + /// this will fail. Use add_with_ids + void add(idx_t n, const component_t* x) override; + + void search( + idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void train(idx_t n, const component_t* x) override; + + void reset() override; + + /// remove ids adapted to IndexFlat + size_t remove_ids(const IDSelector& sel) override; + + // Knowhere-specific: radius became float because of Jaccard distance + // for IndexBinary + void range_search( + idx_t n, + const component_t* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params = nullptr) const override; + + void merge_from(IndexT& otherIndex, idx_t add_id = 0) override; + void check_compatible_for_merge(const IndexT& otherIndex) const override; + + ~IndexIDMapTemplate() override; + IndexIDMapTemplate() { + own_fields = false; + index = nullptr; + } +}; + +using IndexIDMap = IndexIDMapTemplate; +using IndexBinaryIDMap = IndexIDMapTemplate; + +/** same as IndexIDMap but also provides an efficient reconstruction + * implementation via a 2-way index */ +template +struct IndexIDMap2Template : IndexIDMapTemplate { + using component_t = typename IndexT::component_t; + using distance_t = typename IndexT::distance_t; + + std::unordered_map rev_map; + + explicit IndexIDMap2Template(IndexT* index); + + /// make the rev_map from scratch + void construct_rev_map(); + + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) + override; + + size_t remove_ids(const IDSelector& sel) override; + + void reconstruct(idx_t key, component_t* recons) const override; + + /// check that the rev_map and the id_map are in sync + void check_consistency() const; + + void merge_from(IndexT& otherIndex, idx_t add_id = 0) override; + + ~IndexIDMap2Template() override {} + IndexIDMap2Template() {} +}; + +using IndexIDMap2 = IndexIDMap2Template; +using IndexBinaryIDMap2 = IndexIDMap2Template; + +// IDSelector that translates the ids using an IDMap +struct IDSelectorTranslated : IDSelector { + const std::vector& id_map; + const IDSelector* sel; + + IDSelectorTranslated( + const std::vector& id_map, + const IDSelector* sel) + : id_map(id_map), sel(sel) {} + + IDSelectorTranslated(IndexBinaryIDMap& index_idmap, const IDSelector* sel) + : id_map(index_idmap.id_map), sel(sel) {} + + IDSelectorTranslated(IndexIDMap& index_idmap, const IDSelector* sel) + : id_map(index_idmap.id_map), sel(sel) {} + + bool is_member(idx_t id) const override { + return sel->is_member(id_map[id]); + } +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVF.cpp b/thirdparty/faiss/faiss/IndexIVF.cpp index 3619e9816..8f976f37c 100644 --- a/thirdparty/faiss/faiss/IndexIVF.cpp +++ 
b/thirdparty/faiss/faiss/IndexIVF.cpp @@ -10,23 +10,23 @@ #include #include +#include #include #include #include #include -#include +#include #include - -#include - #include #include #include #include +#include #include +#include namespace faiss { @@ -38,27 +38,19 @@ using ScopedCodes = InvertedLists::ScopedCodes; ******************************************/ Level1Quantizer::Level1Quantizer(Index* quantizer, size_t nlist) - : quantizer(quantizer), - nlist(nlist), - quantizer_trains_alone(0), - own_fields(false), - clustering_index(nullptr) { + : quantizer(quantizer), nlist(nlist) { // here we set a low # iterations because this is typically used // for large clusterings (nb this is not used for the MultiIndex, // for which quantizer_trains_alone = true) cp.niter = 10; } -Level1Quantizer::Level1Quantizer() - : quantizer(nullptr), - nlist(0), - quantizer_trains_alone(0), - own_fields(false), - clustering_index(nullptr) {} +Level1Quantizer::Level1Quantizer() = default; Level1Quantizer::~Level1Quantizer() { - if (own_fields) + if (own_fields) { delete quantizer; + } } void Level1Quantizer::train_q1( @@ -134,7 +126,7 @@ size_t Level1Quantizer::coarse_code_size() const { return nbyte; } -void Level1Quantizer::encode_listno(Index::idx_t list_no, uint8_t* code) const { +void Level1Quantizer::encode_listno(idx_t list_no, uint8_t* code) const { // little endian size_t nl = nlist - 1; while (nl > 0) { @@ -144,7 +136,7 @@ void Level1Quantizer::encode_listno(Index::idx_t list_no, uint8_t* code) const { } } -Index::idx_t Level1Quantizer::decode_listno(const uint8_t* code) const { +idx_t Level1Quantizer::decode_listno(const uint8_t* code) const { size_t nl = nlist - 1; int64_t list_no = 0; int nbit = 0; @@ -168,13 +160,10 @@ IndexIVF::IndexIVF( size_t code_size, MetricType metric) : Index(d, metric), - Level1Quantizer(quantizer, nlist), + IndexIVFInterface(quantizer, nlist), invlists(new ArrayInvertedLists(nlist, code_size)), own_invlists(true), - code_size(code_size), - nprobe(1), - max_codes(0), - parallel_mode(0) { + code_size(code_size) { FAISS_THROW_IF_NOT(d == quantizer->d); is_trained = quantizer->is_trained && (quantizer->ntotal == nlist); // Spherical by default if the metric is inner_product @@ -183,13 +172,7 @@ IndexIVF::IndexIVF( } } -IndexIVF::IndexIVF() - : invlists(nullptr), - own_invlists(false), - code_size(0), - nprobe(1), - max_codes(0), - parallel_mode(0) {} +IndexIVF::IndexIVF() = default; void IndexIVF::add(idx_t n, const float* x) { add_with_ids(n, x, nullptr); @@ -209,8 +192,7 @@ void IndexIVF::add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids) { const uint8_t* code = codes + (code_size + coarse_size) * i; idx_t list_no = decode_listno(code); idx_t id = xids ? xids[i] : ntotal + i; - size_t ofs = - invlists->add_entry(list_no, id, code + coarse_size); + size_t ofs = invlists->add_entry(list_no, id, code + coarse_size); dm_adder.add(i, list_no, ofs); } ntotal += n; @@ -268,7 +250,7 @@ void IndexIVF::add_core( if (list_no >= 0 && list_no % nt == rank) { idx_t id = xids ? xids[i] : ntotal + i; size_t ofs = invlists->add_entry( - list_no, id, flat_codes.get() + i * code_size, x_norms == nullptr ? nullptr : x_norms + i); + list_no, id, flat_codes.get() + i * code_size, (x_norms == nullptr) ? 
nullptr : x_norms + i); dm_adder.add(i, list_no, ofs); @@ -325,14 +307,19 @@ void IndexIVF::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(k > 0); - - const size_t nprobe = std::min(nlist, this->nprobe); + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); + } + const size_t nprobe = + std::min(nlist, params ? params->nprobe : this->nprobe); FAISS_THROW_IF_NOT(nprobe > 0); // search function for a subset of queries - auto sub_search_func = [this, k, nprobe, bitset]( + auto sub_search_func = [this, k, nprobe, params]( idx_t n, const float* x, float* distances, @@ -342,7 +329,13 @@ void IndexIVF::search( std::unique_ptr coarse_dis(new float[n * nprobe]); double t0 = getmillisecs(); - quantizer->search(n, x, nprobe, coarse_dis.get(), idx.get()); + quantizer->search( + n, + x, + nprobe, + coarse_dis.get(), + idx.get(), + params ? params->quantizer_params : nullptr); double t1 = getmillisecs(); invlists->prefetch_lists(idx.get(), n * nprobe); @@ -356,9 +349,8 @@ void IndexIVF::search( distances, labels, false, - nullptr, - ivf_stats, - bitset); + params, + ivf_stats); double t2 = getmillisecs(); ivf_stats->quantization_time += t1 - t0; ivf_stats->search_time += t2 - t0; @@ -399,7 +391,7 @@ void IndexIVF::search( indexIVF_stats.add(stats[slice]); } } else { - // handle paralellization at level below (or don't run in parallel at + // handle parallelization at level below (or don't run in parallel at // all) sub_search_func(n, x, distances, labels, &indexIVF_stats); } @@ -415,15 +407,32 @@ void IndexIVF::search_preassigned( idx_t* labels, bool store_pairs, const IVFSearchParameters* params, - IndexIVFStats* ivf_stats, - const BitsetView bitset) const { + IndexIVFStats* ivf_stats) const { FAISS_THROW_IF_NOT(k > 0); idx_t nprobe = params ? params->nprobe : this->nprobe; nprobe = std::min((idx_t)nlist, nprobe); FAISS_THROW_IF_NOT(nprobe > 0); + const idx_t unlimited_list_size = std::numeric_limits::max(); idx_t max_codes = params ? params->max_codes : this->max_codes; + IDSelector* sel = params ? 
params->sel : nullptr; + const IDSelectorRange* selr = dynamic_cast(sel); + if (selr) { + if (selr->assume_sorted) { + sel = nullptr; // use special IDSelectorRange processing + } else { + selr = nullptr; // use generic processing + } + } + + FAISS_THROW_IF_NOT_MSG( + !(sel && store_pairs), + "selector and store_pairs cannot be combined"); + + FAISS_THROW_IF_NOT_MSG( + !invlists->use_iterator || (max_codes == 0 && store_pairs == false), + "iterable inverted lists don't support max_codes and store_pairs"); size_t nlistv = 0, ndis = 0, nheap = 0; @@ -434,15 +443,16 @@ void IndexIVF::search_preassigned( std::mutex exception_mutex; std::string exception_string; - int preassigned_parallel_mode = 0; - if (params && params->parallel_mode != -1) { - preassigned_parallel_mode = params->parallel_mode; - } else { - preassigned_parallel_mode = this->parallel_mode; + int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; + bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); + + FAISS_THROW_IF_NOT_MSG( + max_codes == 0 || pmode == 0 || pmode == 3, + "max_codes supported only for parallel_mode = 0 or 3"); + + if (max_codes == 0) { + max_codes = unlimited_list_size; } - int pmode = preassigned_parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; - bool do_heap_init = - !(preassigned_parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 0 ? false @@ -452,7 +462,8 @@ void IndexIVF::search_preassigned( #pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap) { - InvertedListScanner* scanner = get_InvertedListScanner(store_pairs); + InvertedListScanner* scanner = + get_InvertedListScanner(store_pairs, sel); ScopeDeleter1 del(scanner); /***************************************************** @@ -461,7 +472,7 @@ void IndexIVF::search_preassigned( * that are in common between the two ******************************************************/ - // intialize + reorder a result heap + // initialize + reorder a result heap auto init_result = [&](float* simi, idx_t* idxi) { if (!do_heap_init) @@ -500,7 +511,7 @@ void IndexIVF::search_preassigned( float coarse_dis_i, float* simi, idx_t* idxi, - const BitsetView bitset) { + idx_t list_size_max) { if (key < 0) { // not enough centroids for multiprobe return (size_t)0; @@ -511,10 +522,8 @@ void IndexIVF::search_preassigned( key, nlist); - size_t list_size = invlists->list_size(key); - // don't waste time on empty lists - if (list_size == 0) { + if (invlists->is_empty(key)) { return (size_t)0; } @@ -522,33 +531,50 @@ void IndexIVF::search_preassigned( nlistv++; - size_t scan_cnt = 0; try { - size_t segment_num = invlists->get_segment_num(key); - for (size_t segment_idx = 0; segment_idx < segment_num; segment_idx++) { - size_t segment_size = invlists->get_segment_size(key, segment_idx); - size_t segment_offset = invlists->get_segment_offset(key, segment_idx); - InvertedLists::ScopedCodes scodes(invlists, key, segment_offset); - std::unique_ptr sids; - const Index::idx_t* ids = nullptr; - - auto scode_norms = std::make_unique(invlists, key, segment_offset); - const float* code_norms = scode_norms->get(); - - if (!store_pairs) { - sids.reset(new InvertedLists::ScopedIds(invlists, key, segment_offset)); - ids = sids->get(); - } - nheap += scanner->scan_codes( - segment_size, - scodes.get(), - code_norms, - ids, - simi, - idxi, - k, - bitset); - scan_cnt += segment_size; + // todo aguzhva: validate segments code here + // also, iterators don't seem to know how to use segments + if 
(invlists->use_iterator) { + size_t list_size = 0; + + std::unique_ptr it( + invlists->get_iterator(key)); + + nheap += scanner->iterate_codes( + it.get(), simi, idxi, k, list_size); + + return list_size; + } else { + size_t scan_cnt = 0; + + size_t segment_num = invlists->get_segment_num(key); + for (size_t segment_idx = 0; segment_idx < segment_num; segment_idx++) { + size_t segment_size = invlists->get_segment_size(key, segment_idx); + size_t segment_offset = invlists->get_segment_offset(key, segment_idx); + InvertedLists::ScopedCodes scodes(invlists, key, segment_offset); + std::unique_ptr sids; + const idx_t* ids = nullptr; + + auto scode_norms = std::make_unique(invlists, key, segment_offset); + const float* code_norms = scode_norms->get(); + + if (!store_pairs) { + sids = std::make_unique( + invlists, key, segment_offset); + ids = sids->get(); + } + nheap += scanner->scan_codes( + segment_size, + scodes.get(), + code_norms, + ids, + simi, + idxi, + k); + scan_cnt += segment_size; + } + + return scan_cnt; } } catch (const std::exception& e) { std::lock_guard lock(exception_mutex); @@ -557,8 +583,6 @@ void IndexIVF::search_preassigned( interrupt = true; return size_t(0); } - - return scan_cnt; }; /**************************************************** @@ -588,9 +612,8 @@ void IndexIVF::search_preassigned( coarse_dis[i * nprobe + ik], simi, idxi, - bitset); - - if (max_codes && nscan >= max_codes) { + max_codes - nscan); + if (nscan >= max_codes) { break; } } @@ -618,7 +641,7 @@ void IndexIVF::search_preassigned( coarse_dis[i * nprobe + ik], local_dis.data(), local_idx.data(), - bitset); + unlimited_list_size); // can't do the test on max_codes } @@ -660,7 +683,7 @@ void IndexIVF::search_preassigned( coarse_dis[ij], local_dis.data(), local_idx.data(), - bitset); + unlimited_list_size); #pragma omp critical { add_local_results( @@ -701,13 +724,22 @@ void IndexIVF::range_search( const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { - const size_t nprobe = std::min(nlist, this->nprobe); + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + const SearchParameters* quantizer_params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); + quantizer_params = params->quantizer_params; + } + const size_t nprobe = + std::min(nlist, params ? params->nprobe : this->nprobe); std::unique_ptr keys(new idx_t[nx * nprobe]); std::unique_ptr coarse_dis(new float[nx * nprobe]); double t0 = getmillisecs(); - quantizer->search(nx, x, nprobe, coarse_dis.get(), keys.get()); + quantizer->search( + nx, x, nprobe, coarse_dis.get(), keys.get(), quantizer_params); indexIVF_stats.quantization_time += getmillisecs() - t0; t0 = getmillisecs(); @@ -721,9 +753,8 @@ void IndexIVF::range_search( coarse_dis.get(), result, false, - nullptr, - &indexIVF_stats, - bitset); + params, + &indexIVF_stats); indexIVF_stats.search_time += getmillisecs() - t0; } @@ -737,11 +768,21 @@ void IndexIVF::range_search_preassigned( RangeSearchResult* result, bool store_pairs, const IVFSearchParameters* params, - IndexIVFStats* stats, - const BitsetView bitset) const { + IndexIVFStats* stats) const { + + // Knowhere-specific code: + // only "parallel_mode == 0" branch is supported. + idx_t nprobe = params ? params->nprobe : this->nprobe; nprobe = std::min((idx_t)nlist, nprobe); + FAISS_THROW_IF_NOT(nprobe > 0); + idx_t max_codes = params ? 
params->max_codes : this->max_codes; + IDSelector* sel = params ? params->sel : nullptr; + + FAISS_THROW_IF_NOT_MSG( + !invlists->use_iterator || (max_codes == 0 && store_pairs == false), + "iterable inverted lists don't support max_codes and store_pairs"); size_t nlistv = 0, ndis = 0; @@ -751,13 +792,7 @@ void IndexIVF::range_search_preassigned( std::vector all_pres(omp_get_max_threads()); - int preassigned_parallel_mode = 0; - if (params && params->parallel_mode != -1) { - preassigned_parallel_mode = params->parallel_mode; - } else { - preassigned_parallel_mode = this->parallel_mode; - } - int pmode = preassigned_parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; + int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; // don't start parallel section if single query bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 3 ? false @@ -769,16 +804,13 @@ void IndexIVF::range_search_preassigned( { RangeSearchPartialResult pres(result); std::unique_ptr scanner( - get_InvertedListScanner(store_pairs)); + get_InvertedListScanner(store_pairs, sel)); FAISS_THROW_IF_NOT(scanner.get()); all_pres[omp_get_thread_num()] = &pres; // prepare the list scanning function - auto scan_list_func = [&](size_t i, - size_t ik, - RangeQueryResult& qres, - const BitsetView bitset) { + auto scan_list_func = [&](size_t i, size_t ik, RangeQueryResult& qres) { idx_t key = keys[i * nprobe + ik]; /* select the list */ if (key < 0) return; @@ -788,32 +820,46 @@ void IndexIVF::range_search_preassigned( key, ik, nlist); - const size_t list_size = invlists->list_size(key); - if (list_size == 0) + if (invlists->is_empty(key)) { return; + } try { - size_t segment_num = invlists->get_segment_num(key); - for (size_t segment_idx = 0; segment_idx < segment_num; segment_idx++) { - size_t segment_size = invlists->get_segment_size(key, segment_idx); - size_t segment_offset = invlists->get_segment_offset(key, segment_idx); + // todo aguzhva: validate segments here + // also, iterators don't know how to use segments + size_t list_size = 0; + scanner->set_list(key, coarse_dis[i * nprobe + ik]); + if (invlists->use_iterator) { + std::unique_ptr it( + invlists->get_iterator(key)); - InvertedLists::ScopedCodes scodes(invlists, key, segment_offset); - InvertedLists::ScopedIds ids(invlists, key, segment_offset); - InvertedLists::ScopedCodeNorms scode_norms(invlists, key, segment_offset); + scanner->iterate_codes_range( + it.get(), radius, qres, list_size); - scanner->set_list(key, coarse_dis[i * nprobe + ik]); nlistv++; - ndis += segment_size; - scanner->scan_codes_range( - segment_size, - scodes.get(), - scode_norms.get(), - ids.get(), - radius, - qres, - bitset); + ndis += list_size; + } else { + size_t segment_num = invlists->get_segment_num(key); + for (size_t segment_idx = 0; segment_idx < segment_num; segment_idx++) { + size_t segment_size = invlists->get_segment_size(key, segment_idx); + size_t segment_offset = invlists->get_segment_offset(key, segment_idx); + + InvertedLists::ScopedCodes scodes(invlists, key, segment_offset); + InvertedLists::ScopedIds ids(invlists, key, segment_offset); + InvertedLists::ScopedCodeNorms scode_norms(invlists, key, segment_offset); + + scanner->set_list(key, coarse_dis[i * nprobe + ik]); + nlistv++; + ndis += segment_size; + scanner->scan_codes_range( + segment_size, + scodes.get(), + scode_norms.get(), + ids.get(), + radius, + qres); + } } } catch (const std::exception& e) { std::lock_guard lock(exception_mutex); @@ -823,19 +869,36 @@ void IndexIVF::range_search_preassigned( } }; + if 
(parallel_mode == 0) { #pragma omp for - for (idx_t i = 0; i < nx; i++) { - scanner->set_query(x + i * d); + for (idx_t i = 0; i < nx; i++) { + scanner->set_query(x + i * d); + + RangeQueryResult& qres = pres.new_result(i); - RangeQueryResult& qres = pres.new_result(i); - size_t prev_nres = qres.nres; + // ==================================================== + // The following piece of the code is Knowhere-specific. + // + // cbe86cf716dc1969fc716c29ccf8ea63e82a2b4c: + // Adopt new strategy for faiss IVF range search - for (size_t ik = 0; ik < nprobe; ik++) { - scan_list_func(i, ik, qres, bitset); - if (qres.nres == prev_nres) break; - prev_nres = qres.nres; + size_t prev_nres = qres.nres; + + for (size_t ik = 0; ik < nprobe; ik++) { + scan_list_func(i, ik, qres); + if (qres.nres == prev_nres) break; + prev_nres = qres.nres; + } + + // The end of Knowhere-specific code. + // ==================================================== } + } else { + // Other parallel modes from 1.7.4 were disabled for Milvus. + FAISS_THROW_FMT("parallel_mode %d not supported\n", parallel_mode); } + + // Other parallel modes from 1.7.4 were disabled for Milvus. pres.finalize(); } @@ -856,7 +919,8 @@ void IndexIVF::range_search_preassigned( } InvertedListScanner* IndexIVF::get_InvertedListScanner( - bool /*store_pairs*/) const { + bool /*store_pairs*/, + const IDSelector* /* sel */) const { return nullptr; } @@ -884,6 +948,21 @@ void IndexIVF::reconstruct_n(idx_t i0, idx_t ni, float* recons) const { } } +bool IndexIVF::check_ids_sorted() const { + size_t nflip = 0; + + for (size_t i = 0; i < nlist; i++) { + size_t list_size = invlists->list_size(i); + InvertedLists::ScopedIds ids(invlists, i); + for (size_t j = 0; j + 1 < list_size; j++) { + if (ids[j + 1] < ids[j]) { + nflip++; + } + } + } + return nflip == 0; +} + /* standalone codec interface */ size_t IndexIVF::sa_code_size() const { size_t coarse_size = coarse_code_size(); @@ -903,12 +982,19 @@ void IndexIVF::search_and_reconstruct( idx_t k, float* distances, idx_t* labels, - float* recons) const { - FAISS_THROW_IF_NOT(k > 0); - - const size_t nprobe = std::min(nlist, this->nprobe); + float* recons, + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); + } + const size_t nprobe = + std::min(nlist, params ? 
params->nprobe : this->nprobe); FAISS_THROW_IF_NOT(nprobe > 0); + // todo aguzhva: deprecate ScopeDeleter and ScopeDeleter1 + // in favor of std::unique_ptr idx_t* idx = new idx_t[n * nprobe]; ScopeDeleter del(idx); float* coarse_dis = new float[n * nprobe]; @@ -928,7 +1014,8 @@ void IndexIVF::search_and_reconstruct( coarse_dis, distances, labels, - true /* store_pairs */); + true /* store_pairs */, + params); for (idx_t i = 0; i < n; ++i) { for (idx_t j = 0; j < k; ++j) { idx_t ij = i * k + j; @@ -996,44 +1083,93 @@ void IndexIVF::update_vectors(int n, const idx_t* new_ids, const float* x) { } void IndexIVF::train(idx_t n, const float* x) { - if (verbose) + if (verbose) { printf("Training level-1 quantizer\n"); + } train_q1(n, x, verbose, metric_type); - if (verbose) + if (verbose) { printf("Training IVF residual\n"); + } + + // optional subsampling + idx_t max_nt = train_encoder_num_vectors(); + if (max_nt <= 0) { + max_nt = (size_t)1 << 35; + } + + TransformedVectors tv( + x, fvecs_maybe_subsample(d, (size_t*)&n, max_nt, x, verbose)); + + if (by_residual) { + std::vector assign(n); + quantizer->assign(n, tv.x, assign.data()); + + std::vector residuals(n * d); + quantizer->compute_residual_n(n, tv.x, residuals.data(), assign.data()); + + train_encoder(n, residuals.data(), assign.data()); + } else { + train_encoder(n, tv.x, nullptr); + } - train_residual(n, x); is_trained = true; } -void IndexIVF::train_residual(idx_t /*n*/, const float* /*x*/) { - if (verbose) - printf("IndexIVF: no residual training\n"); +idx_t IndexIVF::train_encoder_num_vectors() const { + return 0; +} + +void IndexIVF::train_encoder( + idx_t /*n*/, + const float* /*x*/, + const idx_t* assign) { // does nothing by default + if (verbose) { + printf("IndexIVF: no residual training\n"); + } } -void IndexIVF::check_compatible_for_merge(const IndexIVF& other) const { +bool check_compatible_for_merge_expensive_check = true; + +void IndexIVF::check_compatible_for_merge(const Index& otherIndex) const { // minimal sanity checks - FAISS_THROW_IF_NOT(other.d == d); - FAISS_THROW_IF_NOT(other.nlist == nlist); - FAISS_THROW_IF_NOT(other.code_size == code_size); + const IndexIVF* other = dynamic_cast(&otherIndex); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->d == d); + FAISS_THROW_IF_NOT(other->nlist == nlist); + FAISS_THROW_IF_NOT(quantizer->ntotal == other->quantizer->ntotal); + FAISS_THROW_IF_NOT(other->code_size == code_size); FAISS_THROW_IF_NOT_MSG( - typeid(*this) == typeid(other), + typeid(*this) == typeid(*other), "can only merge indexes of the same type"); FAISS_THROW_IF_NOT_MSG( - this->direct_map.no() && other.direct_map.no(), + this->direct_map.no() && other->direct_map.no(), "merge direct_map not implemented"); + + if (check_compatible_for_merge_expensive_check) { + std::vector v(d), v2(d); + for (size_t i = 0; i < nlist; i++) { + quantizer->reconstruct(i, v.data()); + other->quantizer->reconstruct(i, v2.data()); + FAISS_THROW_IF_NOT_MSG( + v == v2, "coarse quantizers should be the same"); + } + } } -void IndexIVF::merge_from(IndexIVF& other, idx_t add_id) { - check_compatible_for_merge(other); +void IndexIVF::merge_from(Index& otherIndex, idx_t add_id) { + check_compatible_for_merge(otherIndex); + IndexIVF* other = static_cast(&otherIndex); + invlists->merge_from(other->invlists, add_id); - invlists->merge_from(other.invlists, add_id); + ntotal += other->ntotal; + other->ntotal = 0; +} - ntotal += other.ntotal; - other.ntotal = 0; +CodePacker* IndexIVF::get_CodePacker() const { + return new 
CodePackerFlat(code_size); } void IndexIVF::replace_invlists(InvertedLists* il, bool own) { @@ -1054,94 +1190,11 @@ void IndexIVF::replace_invlists(InvertedLists* il, bool own) { void IndexIVF::copy_subset_to( IndexIVF& other, - int subset_type, + InvertedLists::subset_type_t subset_type, idx_t a1, idx_t a2) const { - FAISS_THROW_IF_NOT(nlist == other.nlist); - FAISS_THROW_IF_NOT(code_size == other.code_size); - FAISS_THROW_IF_NOT(other.direct_map.no()); - FAISS_THROW_IF_NOT_FMT( - subset_type == 0 || subset_type == 1 || subset_type == 2, - "subset type %d not implemented", - subset_type); - - size_t accu_n = 0; - size_t accu_a1 = 0; - size_t accu_a2 = 0; - - InvertedLists* oivf = other.invlists; - - for (idx_t list_no = 0; list_no < nlist; list_no++) { - size_t n = invlists->list_size(list_no); - ScopedIds ids_in(invlists, list_no); - - if (subset_type == 0) { - for (idx_t i = 0; i < n; i++) { - idx_t id = ids_in[i]; - if (a1 <= id && id < a2) { - oivf->add_entry( - list_no, - invlists->get_single_id(list_no, i), - ScopedCodes(invlists, list_no, i).get()); - other.ntotal++; - } - } - } else if (subset_type == 1) { - for (idx_t i = 0; i < n; i++) { - idx_t id = ids_in[i]; - if (id % a1 == a2) { - oivf->add_entry( - list_no, - invlists->get_single_id(list_no, i), - ScopedCodes(invlists, list_no, i).get()); - other.ntotal++; - } - } - } else if (subset_type == 2) { - // see what is allocated to a1 and to a2 - size_t next_accu_n = accu_n + n; - size_t next_accu_a1 = next_accu_n * a1 / ntotal; - size_t i1 = next_accu_a1 - accu_a1; - size_t next_accu_a2 = next_accu_n * a2 / ntotal; - size_t i2 = next_accu_a2 - accu_a2; - - for (idx_t i = i1; i < i2; i++) { - oivf->add_entry( - list_no, - invlists->get_single_id(list_no, i), - ScopedCodes(invlists, list_no, i).get()); - } - - other.ntotal += i2 - i1; - accu_a1 = next_accu_a1; - accu_a2 = next_accu_a2; - } - accu_n += n; - } - FAISS_ASSERT(accu_n == ntotal); -} - -void IndexIVF::dump() { - for (auto i = 0; i < invlists->nlist; ++i) { - auto numVecs = invlists->list_size(i); - auto ids = invlists->get_ids(i); - auto codes = invlists->get_codes(i); - int code_size = invlists->code_size; - - std::cout << "Bucket ID: " << i << ", with code size: " << code_size - << ", vectors number: " << numVecs << std::endl; - if (code_size == 8) { - // int8 types - for (auto j = 0; j < numVecs; ++j) { - std::cout << *(ids + j) << ": " << std::endl; - for (int k = 0; k < this->d; ++k) { - printf("%u ", (uint8_t)(codes[j * d + k])); - } - std::cout << std::endl; - } - } - std::cout << "Bucket End." << std::endl; - } + other.ntotal += + invlists->copy_subset_to(*other.invlists, subset_type, a1, a2); } IndexIVF::~IndexIVF() { @@ -1180,13 +1233,13 @@ size_t InvertedListScanner::scan_codes( const idx_t* ids, float* simi, idx_t* idxi, - size_t k, - const BitsetView bitset) const { + size_t k) const { size_t nup = 0; if (!keep_max) { for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(j)) { + // // todo aguzhva: use int64_t id instead of j ? + if (!sel || sel->is_member(j)) { float dis = distance_to_code(codes); if (code_norms) { dis /= code_norms[j]; @@ -1201,7 +1254,8 @@ size_t InvertedListScanner::scan_codes( } } else { for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(j)) { + // // todo aguzhva: use int64_t id instead of j ? 
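+            // same selector check as in the !keep_max branch above: entries
+            // rejected by the optional IDSelector are skipped; as the todo
+            // notes, the within-list offset j (not ids[j]) is what is passed
+            // to is_member() here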
+ if (!sel || sel->is_member(j)) { float dis = distance_to_code(codes); if (code_norms) { dis /= code_norms[j]; @@ -1218,17 +1272,51 @@ size_t InvertedListScanner::scan_codes( return nup; } +size_t InvertedListScanner::iterate_codes( + InvertedListsIterator* it, + float* simi, + idx_t* idxi, + size_t k, + size_t& list_size) const { + size_t nup = 0; + list_size = 0; + + if (!keep_max) { + for (; it->is_available(); it->next()) { + auto id_and_codes = it->get_id_and_codes(); + float dis = distance_to_code(id_and_codes.second); + if (dis < simi[0]) { + maxheap_replace_top(k, simi, idxi, dis, id_and_codes.first); + nup++; + } + list_size++; + } + } else { + for (; it->is_available(); it->next()) { + auto id_and_codes = it->get_id_and_codes(); + float dis = distance_to_code(id_and_codes.second); + if (dis > simi[0]) { + minheap_replace_top(k, simi, idxi, dis, id_and_codes.first); + nup++; + } + list_size++; + } + } + return nup; +} + void InvertedListScanner::scan_codes_range( size_t list_size, const uint8_t* codes, const float* code_norms, const idx_t* ids, float radius, - RangeQueryResult& res, - const BitsetView bitset) const { + RangeQueryResult& res) const { for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(j)) { + // // todo aguzhva: use int64_t id instead of j ? + if (!sel || sel->is_member(j)) { float dis = distance_to_code(codes); + // // todo aguzhva: use int64_t id instead of j ? if (code_norms) { dis /= code_norms[j]; } @@ -1244,4 +1332,23 @@ void InvertedListScanner::scan_codes_range( } } +void InvertedListScanner::iterate_codes_range( + InvertedListsIterator* it, + float radius, + RangeQueryResult& res, + size_t& list_size) const { + list_size = 0; + for (; it->is_available(); it->next()) { + auto id_and_codes = it->get_id_and_codes(); + float dis = distance_to_code(id_and_codes.second); + bool keep = !keep_max + ? 
dis < radius + : dis > radius; // TODO templatize to remove this test + if (keep) { + res.add(dis, id_and_codes.first); + } + list_size++; + } +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVF.h b/thirdparty/faiss/faiss/IndexIVF.h index 90ce3b23d..f8bbe6e3c 100644 --- a/thirdparty/faiss/faiss/IndexIVF.h +++ b/thirdparty/faiss/faiss/IndexIVF.h @@ -11,18 +11,18 @@ #define FAISS_INDEX_IVF_H #include +#include #include #include #include #include +#include #include #include #include #include -#include -using knowhere::BitsetView; namespace faiss { /** Encapsulates a quantizer object for the IndexIVF @@ -31,19 +31,23 @@ namespace faiss { * of the lists (especially training) */ struct Level1Quantizer { - Index* quantizer; ///< quantizer that maps vectors to inverted lists - size_t nlist; ///< number of possible key values + /// quantizer that maps vectors to inverted lists + Index* quantizer = nullptr; + + /// number of inverted lists + size_t nlist = 0; /** * = 0: use the quantizer as index in a kmeans training * = 1: just pass on the training set to the train() of the quantizer * = 2: kmeans training on a flat index + add the centroids to the quantizer */ - char quantizer_trains_alone; - bool own_fields; ///< whether object owns the quantizer (false by default) + char quantizer_trains_alone = 0; + bool own_fields = false; ///< whether object owns the quantizer ClusteringParameters cp; ///< to override default clustering params - Index* clustering_index; ///< to override index used during clustering + /// to override index used during clustering + Index* clustering_index = nullptr; /// Trains the quantizer and calls train_residual to train sub-quantizers void train_q1( @@ -54,8 +58,8 @@ struct Level1Quantizer { /// compute the number of bytes required to store list ids size_t coarse_code_size() const; - void encode_listno(Index::idx_t list_no, uint8_t* code) const; - Index::idx_t decode_listno(const uint8_t* code) const; + void encode_listno(idx_t list_no, uint8_t* code) const; + idx_t decode_listno(const uint8_t* code) const; Level1Quantizer(Index* quantizer, size_t nlist); @@ -64,17 +68,88 @@ struct Level1Quantizer { ~Level1Quantizer(); }; -struct IVFSearchParameters { - size_t nprobe; ///< number of probes at query time - size_t max_codes; ///< max nb of codes to visit to do a query - int parallel_mode; // default value if -1, and we will use - // this->parallel_mode in this case - IVFSearchParameters() : nprobe(1), max_codes(0), parallel_mode(-1) {} - virtual ~IVFSearchParameters() {} +struct SearchParametersIVF : SearchParameters { + size_t nprobe = 1; ///< number of probes at query time + size_t max_codes = 0; ///< max nb of codes to visit to do a query + SearchParameters* quantizer_params = nullptr; + + virtual ~SearchParametersIVF() {} }; +// the new convention puts the index type after SearchParameters +using IVFSearchParameters = SearchParametersIVF; + struct InvertedListScanner; struct IndexIVFStats; +struct CodePacker; + +struct IndexIVFInterface : Level1Quantizer { + size_t nprobe = 1; ///< number of probes at query time + size_t max_codes = 0; ///< max nb of codes to visit to do a query + + explicit IndexIVFInterface(Index* quantizer = nullptr, size_t nlist = 0) + : Level1Quantizer(quantizer, nlist) {} + + /** search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the corresponding heaps with the query + * results. The default implementation uses InvertedListScanners + * to do the search. 
+ * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param distance + * output distances, size n * k + * @param labels output labels, size n * k + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). + * @param params used to override the object's search parameters + * @param stats search stats to be updated (can be null) + */ + virtual void search_preassigned( + idx_t n, + const float* x, + idx_t k, + const idx_t* assign, + const float* centroid_dis, + float* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const = 0; + + /** Range search a set of vectors, that are pre-quantized by the IVF + * quantizer. Fill in the RangeSearchResults results. The default + * implementation uses InvertedListScanners to do the search. + * + * @param n nb of vectors to query + * @param x query vectors, size nx * d + * @param assign coarse quantization indices, size nx * nprobe + * @param centroid_dis + * distances to coarse centroids, size nx * nprobe + * @param result Output results + * @param store_pairs store inv list index + inv list offset + * instead in upper/lower 32 bit of result, + * instead of ids (used for reranking). + * @param params used to override the object's search parameters + * @param stats search stats to be updated (can be null) + */ + virtual void range_search_preassigned( + idx_t nx, + const float* x, + float radius, + const idx_t* keys, + const float* coarse_dis, + RangeSearchResult* result, + bool store_pairs = false, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const = 0; + + virtual ~IndexIVFInterface() {} +}; /** Index based on a inverted file (IVF) * @@ -96,15 +171,12 @@ struct IndexIVFStats; * Sub-classes implement a post-filtering of the index that refines * the distance estimation from the query to databse vectors. */ -struct IndexIVF : Index, Level1Quantizer { +struct IndexIVF : Index, IndexIVFInterface { /// Access to the actual data - InvertedLists* invlists; - bool own_invlists; - - size_t code_size; ///< code size per vector in bytes + InvertedLists* invlists = nullptr; + bool own_invlists = false; - size_t nprobe; ///< number of probes at query time - size_t max_codes; ///< max nb of codes to visit to do a query + size_t code_size = 0; ///< code size per vector in bytes /** Parallel mode determines how queries are parallelized with OpenMP * @@ -116,13 +188,17 @@ struct IndexIVF : Index, Level1Quantizer { * PARALLEL_MODE_NO_HEAP_INIT: binary or with the previous to * prevent the heap to be initialized and finalized */ - int parallel_mode; + int parallel_mode = 0; const int PARALLEL_MODE_NO_HEAP_INIT = 1024; /** optional map that maps back ids to invlist entries. This * enables reconstruct() */ DirectMap direct_map; + /// do the codes in the invlists encode the vectors relative to the + /// centroids? + bool by_residual = true; + /** The Inverted file takes a quantizer (an Index) on input, * which implements the function mapping a vector to a list * identifier. 
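The IVF search path now routes all per-call options through the SearchParametersIVF declared above: nprobe and max_codes overrides, an optional quantizer_params block for the coarse quantizer, and an IDSelector (via the sel field inherited from SearchParameters) where the Knowhere fork previously took a BitsetView. A minimal sketch of the intended call pattern, assuming a trained and populated IndexIVFFlat; the helper name and the chosen constant are illustrative and not part of the patch:

#include <faiss/IndexIVFFlat.h>

// Illustrative helper: run one search with a per-call nprobe
// instead of mutating index.nprobe on the shared index object.
void search_with_nprobe(const faiss::IndexIVFFlat& index,
                        faiss::idx_t nq, const float* queries, faiss::idx_t k,
                        float* distances, faiss::idx_t* labels) {
    faiss::SearchParametersIVF params;
    params.nprobe = 32;  // overrides index.nprobe for this call only
    index.search(nq, queries, k, distances, labels, &params);
}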
@@ -136,7 +212,7 @@ struct IndexIVF : Index, Level1Quantizer { void reset() override; - /// Trains the quantizer and calls train_residual to train sub-quantizers + /// Trains the quantizer and calls train_encoder to train sub-quantizers void train(idx_t n, const float* x) override; /// Calls add_with_ids with NULL ids @@ -182,30 +258,17 @@ struct IndexIVF : Index, Level1Quantizer { */ void add_sa_codes(idx_t n, const uint8_t* codes, const idx_t* xids); - /// Sub-classes that encode the residuals can train their encoders here - /// does nothing by default - virtual void train_residual(idx_t n, const float* x); - - /** search a set of vectors, that are pre-quantized by the IVF - * quantizer. Fill in the corresponding heaps with the query - * results. The default implementation uses InvertedListScanners - * to do the search. + /** Train the encoder for the vectors. * - * @param n nb of vectors to query - * @param x query vectors, size nx * d - * @param assign coarse quantization indices, size nx * nprobe - * @param centroid_dis - * distances to coarse centroids, size nx * nprobe - * @param distance - * output distances, size n * k - * @param labels output labels, size n * k - * @param store_pairs store inv list index + inv list offset - * instead in upper/lower 32 bit of result, - * instead of ids (used for reranking). - * @param params used to override the object's search parameters - * @param stats search stats to be updated (can be null) - */ - virtual void search_preassigned( + * If by_residual then it is called with residuals and corresponding assign + * array, otherwise x is the raw training vectors and assign=nullptr */ + virtual void train_encoder(idx_t n, const float* x, const idx_t* assign); + + /// can be redefined by subclasses to indicate how many training vectors + /// they need + virtual idx_t train_encoder_num_vectors() const; + + void search_preassigned( idx_t n, const float* x, idx_t k, @@ -215,62 +278,42 @@ struct IndexIVF : Index, Level1Quantizer { idx_t* labels, bool store_pairs, const IVFSearchParameters* params = nullptr, - IndexIVFStats* stats = nullptr, - const BitsetView bitset = nullptr) const; + IndexIVFStats* stats = nullptr) const override; - /** assign the vectors, then call search_preassign */ - void search( - idx_t n, + void range_search_preassigned( + idx_t nx, const float* x, - idx_t k, - float* distances, - idx_t* labels, - const BitsetView bitset = nullptr) const override; + float radius, + const idx_t* keys, + const float* coarse_dis, + RangeSearchResult* result, + bool store_pairs = false, + const IVFSearchParameters* params = nullptr, + IndexIVFStats* stats = nullptr) const override; - void search_thread_safe( + /** assign the vectors, then call search_preassign */ + void search( idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - const size_t nprobe, - const size_t max_codes, - const BitsetView bitset = nullptr) const; + const SearchParameters* params = nullptr) const override; void range_search( idx_t n, const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; - - void range_search_thread_safe( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - const size_t nprobe, - const size_t max_codes, - const BitsetView bitset = nullptr) const; - - void range_search_preassigned( - idx_t nx, - const float* x, - float radius, - const idx_t* keys, - const float* coarse_dis, - RangeSearchResult* result, - bool store_pairs = false, - const IVFSearchParameters* params = 
nullptr, - IndexIVFStats* stats = nullptr, - const BitsetView bitset = nullptr) const; + const SearchParameters* params = nullptr) const override; /** Get a scanner for this index (store_pairs means ignore labels) * * The default search implementation uses this to compute the distances */ virtual InvertedListScanner* get_InvertedListScanner( - bool store_pairs = false) const; + bool store_pairs = false, + const IDSelector* sel = nullptr) const; /** reconstruct a vector. Works only if maintain_direct_map is set to 1 or 2 */ @@ -312,7 +355,8 @@ struct IndexIVF : Index, Level1Quantizer { idx_t k, float* distances, idx_t* labels, - float* recons) const override; + float* recons, + const SearchParameters* params = nullptr) const override; /** Reconstruct a vector given the location in terms of (inv list index + * inv list offset) instead of the id. @@ -330,26 +374,19 @@ struct IndexIVF : Index, Level1Quantizer { size_t remove_ids(const IDSelector& sel) override; - /** check that the two indexes are compatible (ie, they are - * trained in the same way and have the same - * parameters). Otherwise throw. */ - void check_compatible_for_merge(const IndexIVF& other) const; + void check_compatible_for_merge(const Index& otherIndex) const override; - /** moves the entries from another dataset to self. On output, - * other is empty. add_id is added to all moved ids (for - * sequential ids, this would be this->ntotal */ - virtual void merge_from(IndexIVF& other, idx_t add_id); + virtual void merge_from(Index& otherIndex, idx_t add_id) override; + + // returns a new instance of a CodePacker + virtual CodePacker* get_CodePacker() const; /** copy a subset of the entries index to the other index - * - * if subset_type == 0: copies ids in [a1, a2) - * if subset_type == 1: copies ids if id % a1 == a2 - * if subset_type == 2: copies inverted lists such that a1 - * elements are left before and a2 elements are after + * see Invlists::copy_subset_to for the meaning of subset_type */ virtual void copy_subset_to( IndexIVF& other, - int subset_type, + InvertedLists::subset_type_t subset_type, idx_t a1, idx_t a2) const; @@ -363,7 +400,10 @@ struct IndexIVF : Index, Level1Quantizer { return invlists->list_size(list_no); } - /** intialize a direct map + /// are the ids sorted? 
+ bool check_ids_sorted() const; + + /** initialize a direct map * * @param new_maintain_direct_map if true, create a direct map, * else clear it @@ -377,7 +417,6 @@ struct IndexIVF : Index, Level1Quantizer { /* The standalone codec interface (except sa_decode that is specific) */ size_t sa_code_size() const override; - void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void dump(); @@ -392,12 +431,18 @@ struct RangeQueryResult; * distance_to_code and scan_codes can be called in multiple * threads */ struct InvertedListScanner { - using idx_t = Index::idx_t; - idx_t list_no = -1; ///< remember current list bool keep_max = false; ///< keep maximum instead of minimum /// store positions in invlists rather than labels - bool store_pairs = false; + bool store_pairs; + + /// search in this subset of ids + const IDSelector* sel; + + InvertedListScanner( + bool store_pairs = false, + const IDSelector* sel = nullptr) + : store_pairs(store_pairs), sel(sel) {} /// used in default implementation of scan_codes size_t code_size = 0; @@ -430,8 +475,15 @@ struct InvertedListScanner { const idx_t* ids, float* distances, idx_t* labels, + size_t k) const; + + // same as scan_codes, using an iterator + virtual size_t iterate_codes( + InvertedListsIterator* iterator, + float* distances, + idx_t* labels, size_t k, - const BitsetView bitset = nullptr) const; + size_t& list_size) const; /** scan a set of codes, compute distances to current query and * update results if distances are below radius @@ -443,12 +495,21 @@ struct InvertedListScanner { const float* code_norms, const idx_t* ids, float radius, + RangeQueryResult& result) const; + + // same as scan_codes_range, using an iterator + virtual void iterate_codes_range( + InvertedListsIterator* iterator, + float radius, RangeQueryResult& result, - const BitsetView bitset = nullptr) const; + size_t& list_size) const; virtual ~InvertedListScanner() {} }; +// whether to check that coarse quantizers are the same +FAISS_API extern bool check_compatible_for_merge_expensive_check; + struct IndexIVFStats { size_t nq; // nb of queries run size_t nlist; // nb of inverted lists scanned diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp index 266df5a16..9395dd6ee 100644 --- a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.cpp @@ -5,9 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// quiet the noise -// XXclang-format off - #include #include @@ -41,26 +38,20 @@ IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer( IndexIVFAdditiveQuantizer::IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq) : IndexIVF(), aq(aq) {} -void IndexIVFAdditiveQuantizer::train_residual(idx_t n, const float* x) { - const float* x_in = x; +void IndexIVFAdditiveQuantizer::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { + aq->train(n, x); +} +idx_t IndexIVFAdditiveQuantizer::train_encoder_num_vectors() const { size_t max_train_points = 1024 * ((size_t)1 << aq->nbits[0]); - - x = fvecs_maybe_subsample( - d, (size_t*)&n, max_train_points, x, verbose, 1234); - ScopeDeleter1 del_x(x_in == x ? 
nullptr : x); - - if (by_residual) { - std::vector idx(n); - quantizer->assign(n, x, idx.data()); - - std::vector residuals(n * d); - quantizer->compute_residual_n(n, x, residuals.data(), idx.data()); - - aq->train(n, residuals.data()); - } else { - aq->train(n, x); + // we need more data to train LSQ + if (dynamic_cast(aq)) { + max_train_points = 1024 * aq->M * ((size_t)1 << aq->nbits[0]); } + return max_train_points; } void IndexIVFAdditiveQuantizer::encode_vectors( @@ -77,7 +68,7 @@ void IndexIVFAdditiveQuantizer::encode_vectors( // subtract centroids std::vector residuals(n * d); -#pragma omp parallel if (n > 10000) +#pragma omp parallel for if (n > 10000) for (idx_t i = 0; i < n; i++) { quantizer->compute_residual( x + i * d, @@ -100,7 +91,33 @@ void IndexIVFAdditiveQuantizer::encode_vectors( } } -IndexIVFAdditiveQuantizer::~IndexIVFAdditiveQuantizer() {} +void IndexIVFAdditiveQuantizer::sa_decode( + idx_t n, + const uint8_t* codes, + float* x) const { + const size_t coarse_size = coarse_code_size(); + +#pragma omp parallel if (n > 1000) + { + std::vector residual(d); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + const uint8_t* code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno(code); + float* xi = x + i * d; + aq->decode(code + coarse_size, xi, 1); + if (by_residual) { + quantizer->reconstruct(list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + } +} + +IndexIVFAdditiveQuantizer::~IndexIVFAdditiveQuantizer() = default; /********************************************* * AQInvertedListScanner @@ -119,7 +136,7 @@ struct AQInvertedListScanner : InvertedListScanner { : ia(ia), aq(*ia.aq) { this->store_pairs = store_pairs; this->code_size = ia.code_size; - keep_max = ia.metric_type == METRIC_INNER_PRODUCT; + keep_max = is_similarity_metric(ia.metric_type); tmp.resize(ia.d); } @@ -141,7 +158,7 @@ struct AQInvertedListScanner : InvertedListScanner { } } - ~AQInvertedListScanner() {} + ~AQInvertedListScanner() = default; }; template @@ -172,7 +189,7 @@ struct AQInvertedListScannerDecompress : AQInvertedListScanner { : fvec_L2sqr(q, b.data(), aq.d); } - ~AQInvertedListScannerDecompress() override {} + ~AQInvertedListScannerDecompress() override = default; }; template @@ -215,13 +232,15 @@ struct AQInvertedListScannerLUT : AQInvertedListScanner { aq.compute_1_distance_LUT(code, LUT.data()); } - ~AQInvertedListScannerLUT() override {} + ~AQInvertedListScannerLUT() override = default; }; } // anonymous namespace InvertedListScanner* IndexIVFAdditiveQuantizer::get_InvertedListScanner( - bool store_pairs) const { + bool store_pairs, + const IDSelector* sel) const { + FAISS_THROW_IF_NOT(!sel); if (metric_type == METRIC_INNER_PRODUCT) { if (aq->search_type == AdditiveQuantizer::ST_decompress) { return new AQInvertedListScannerDecompress( @@ -245,8 +264,10 @@ InvertedListScanner* IndexIVFAdditiveQuantizer::get_InvertedListScanner( A(ST_norm_float) A(ST_norm_qint8) A(ST_norm_qint4) - A(ST_norm_cqint8) A(ST_norm_cqint4) + case AdditiveQuantizer::ST_norm_lsq2x4: + case AdditiveQuantizer::ST_norm_rq2x4: + A(ST_norm_cqint8) #undef A default: FAISS_THROW_FMT( @@ -290,7 +311,7 @@ IndexIVFResidualQuantizer::IndexIVFResidualQuantizer( metric, search_type) {} -IndexIVFResidualQuantizer::~IndexIVFResidualQuantizer() {} +IndexIVFResidualQuantizer::~IndexIVFResidualQuantizer() = default; /************************************************************************************** * IndexIVFLocalSearchQuantizer @@ -312,6 +333,53 @@ 
IndexIVFLocalSearchQuantizer::IndexIVFLocalSearchQuantizer( IndexIVFLocalSearchQuantizer::IndexIVFLocalSearchQuantizer() : IndexIVFAdditiveQuantizer(&lsq) {} -IndexIVFLocalSearchQuantizer::~IndexIVFLocalSearchQuantizer() {} +IndexIVFLocalSearchQuantizer::~IndexIVFLocalSearchQuantizer() = default; + +/************************************************************************************** + * IndexIVFProductResidualQuantizer + **************************************************************************************/ + +IndexIVFProductResidualQuantizer::IndexIVFProductResidualQuantizer( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric, + Search_type_t search_type) + : IndexIVFAdditiveQuantizer(&prq, quantizer, d, nlist, metric), + prq(d, nsplits, Msub, nbits, search_type) { + code_size = invlists->code_size = prq.code_size; +} + +IndexIVFProductResidualQuantizer::IndexIVFProductResidualQuantizer() + : IndexIVFAdditiveQuantizer(&prq) {} + +IndexIVFProductResidualQuantizer::~IndexIVFProductResidualQuantizer() = default; + +/************************************************************************************** + * IndexIVFProductLocalSearchQuantizer + **************************************************************************************/ + +IndexIVFProductLocalSearchQuantizer::IndexIVFProductLocalSearchQuantizer( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric, + Search_type_t search_type) + : IndexIVFAdditiveQuantizer(&plsq, quantizer, d, nlist, metric), + plsq(d, nsplits, Msub, nbits, search_type) { + code_size = invlists->code_size = plsq.code_size; +} + +IndexIVFProductLocalSearchQuantizer::IndexIVFProductLocalSearchQuantizer() + : IndexIVFAdditiveQuantizer(&plsq) {} + +IndexIVFProductLocalSearchQuantizer::~IndexIVFProductLocalSearchQuantizer() = + default; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.h b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.h index f5deb6bcb..d065947d0 100644 --- a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.h +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizer.h @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -25,7 +26,6 @@ namespace faiss { struct IndexIVFAdditiveQuantizer : IndexIVF { // the quantizer AdditiveQuantizer* aq; - bool by_residual = true; int use_precomputed_table = 0; // for future use using Search_type_t = AdditiveQuantizer::Search_type_t; @@ -39,7 +39,9 @@ struct IndexIVFAdditiveQuantizer : IndexIVF { explicit IndexIVFAdditiveQuantizer(AdditiveQuantizer* aq); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void encode_vectors( idx_t n, @@ -49,7 +51,10 @@ struct IndexIVFAdditiveQuantizer : IndexIVF { bool include_listnos = false) const override; InvertedListScanner* get_InvertedListScanner( - bool store_pairs) const override; + bool store_pairs, + const IDSelector* sel) const override; + + void sa_decode(idx_t n, const uint8_t* codes, float* x) const override; ~IndexIVFAdditiveQuantizer() override; }; @@ -116,6 +121,64 @@ struct IndexIVFLocalSearchQuantizer : IndexIVFAdditiveQuantizer { virtual ~IndexIVFLocalSearchQuantizer(); }; +/** IndexIVF based on a product residual quantizer. Stored vectors are + * approximated by product residual quantization codes. 
+ */ +struct IndexIVFProductResidualQuantizer : IndexIVFAdditiveQuantizer { + /// The product residual quantizer used to encode the vectors + ProductResidualQuantizer prq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param nsplits number of residual quantizers + * @param Msub number of subquantizers per RQ + * @param nbits number of bit per subvector index + */ + IndexIVFProductResidualQuantizer( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_decompress); + + IndexIVFProductResidualQuantizer(); + + virtual ~IndexIVFProductResidualQuantizer(); +}; + +/** IndexIVF based on a product local search quantizer. Stored vectors are + * approximated by product local search quantization codes. + */ +struct IndexIVFProductLocalSearchQuantizer : IndexIVFAdditiveQuantizer { + /// The product local search quantizer used to encode the vectors + ProductLocalSearchQuantizer plsq; + + /** Constructor. + * + * @param d dimensionality of the input vectors + * @param nsplits number of local search quantizers + * @param Msub number of subquantizers per LSQ + * @param nbits number of bit per subvector index + */ + IndexIVFProductLocalSearchQuantizer( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_decompress); + + IndexIVFProductLocalSearchQuantizer(); + + virtual ~IndexIVFProductLocalSearchQuantizer(); +}; + } // namespace faiss #endif diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp new file mode 100644 index 000000000..e8ea80984 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.cpp @@ -0,0 +1,574 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "simd/hook.h" + +namespace faiss { + +inline size_t roundup(size_t a, size_t b) { + return (a + b - 1) / b * b; +} + +IndexIVFAdditiveQuantizerFastScan::IndexIVFAdditiveQuantizerFastScan( + Index* quantizer, + AdditiveQuantizer* aq, + size_t d, + size_t nlist, + MetricType metric, + int bbs) + : IndexIVFFastScan(quantizer, d, nlist, 0, metric) { + if (aq != nullptr) { + init(aq, nlist, metric, bbs); + } +} + +void IndexIVFAdditiveQuantizerFastScan::init( + AdditiveQuantizer* aq, + size_t nlist, + MetricType metric, + int bbs) { + FAISS_THROW_IF_NOT(aq != nullptr); + FAISS_THROW_IF_NOT(!aq->nbits.empty()); + FAISS_THROW_IF_NOT(aq->nbits[0] == 4); + if (metric == METRIC_INNER_PRODUCT) { + FAISS_THROW_IF_NOT_MSG( + aq->search_type == AdditiveQuantizer::ST_LUT_nonorm, + "Search type must be ST_LUT_nonorm for IP metric"); + } else { + FAISS_THROW_IF_NOT_MSG( + aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || + aq->search_type == AdditiveQuantizer::ST_norm_rq2x4, + "Search type must be lsq2x4 or rq2x4 for L2 metric"); + } + + this->aq = aq; + if (metric_type == METRIC_L2) { + M = aq->M + 2; // 2x4 bits AQ + } else { + M = aq->M; + } + init_fastscan(M, 4, nlist, metric, bbs); + + max_train_points = 1024 * ksub * M; + by_residual = true; +} + +IndexIVFAdditiveQuantizerFastScan::IndexIVFAdditiveQuantizerFastScan( + const IndexIVFAdditiveQuantizer& orig, + int bbs) + : IndexIVFFastScan( + orig.quantizer, + orig.d, + orig.nlist, + 0, + orig.metric_type), + aq(orig.aq) { + FAISS_THROW_IF_NOT( + metric_type == METRIC_INNER_PRODUCT || !orig.by_residual); + + init(aq, nlist, metric_type, bbs); + + is_trained = orig.is_trained; + ntotal = orig.ntotal; + nprobe = orig.nprobe; + + for (size_t i = 0; i < nlist; i++) { + size_t nb = orig.invlists->list_size(i); + size_t nb2 = roundup(nb, bbs); + AlignedTable tmp(nb2 * M2 / 2); + pq4_pack_codes( + InvertedLists::ScopedCodes(orig.invlists, i).get(), + nb, + M, + nb2, + bbs, + M2, + tmp.get()); + invlists->add_entries( + i, + nb, + InvertedLists::ScopedIds(orig.invlists, i).get(), + tmp.get()); + } + + orig_invlists = orig.invlists; +} + +IndexIVFAdditiveQuantizerFastScan::IndexIVFAdditiveQuantizerFastScan() { + bbs = 0; + M2 = 0; + aq = nullptr; + + is_trained = false; +} + +IndexIVFAdditiveQuantizerFastScan::~IndexIVFAdditiveQuantizerFastScan() = + default; + +/********************************************************* + * Training + *********************************************************/ + +idx_t IndexIVFAdditiveQuantizerFastScan::train_encoder_num_vectors() const { + return max_train_points; +} + +void IndexIVFAdditiveQuantizerFastScan::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { + if (aq->is_trained) { + return; + } + + if (verbose) { + printf("training additive quantizer on %d vectors\n", int(n)); + } + + if (verbose) { + printf("training %zdx%zd additive quantizer on " + "%" PRId64 " vectors in %dD\n", + aq->M, + ksub, + n, + d); + } + aq->verbose = verbose; + aq->train(n, x); + + // train norm quantizer + if (by_residual && metric_type == METRIC_L2) { + std::vector decoded_x(n * d); + std::vector x_codes(n * aq->code_size); + aq->compute_codes(x, x_codes.data(), n); + aq->decode(x_codes.data(), decoded_x.data(), n); + + // add coarse centroids + std::vector centroid(d); + for (idx_t i = 0; i < n; i++) { + auto xi = decoded_x.data() + i * 
d; + quantizer->reconstruct(assign[i], centroid.data()); + fvec_add(d, centroid.data(), xi, xi); + } + + std::vector norms(n, 0); + fvec_norms_L2sqr(norms.data(), decoded_x.data(), d, n); + + // re-train norm tables + aq->train_norm(n, norms.data()); + } + + if (metric_type == METRIC_L2) { + estimate_norm_scale(n, x); + } +} + +void IndexIVFAdditiveQuantizerFastScan::estimate_norm_scale( + idx_t n, + const float* x_in) { + FAISS_THROW_IF_NOT(metric_type == METRIC_L2); + + constexpr int seed = 0x980903; + constexpr size_t max_points_estimated = 65536; + size_t ns = n; + const float* x = fvecs_maybe_subsample( + d, &ns, max_points_estimated, x_in, verbose, seed); + n = ns; + std::unique_ptr del_x; + if (x != x_in) { + del_x.reset((float*)x); + } + + std::vector coarse_ids(n); + std::vector coarse_dis(n); + quantizer->search(n, x, 1, coarse_dis.data(), coarse_ids.data()); + + AlignedTable dis_tables; + AlignedTable biases; + + size_t index_nprobe = nprobe; + nprobe = 1; + compute_LUT(n, x, coarse_ids.data(), coarse_dis.data(), dis_tables, biases); + nprobe = index_nprobe; + + float scale = 0; + +#pragma omp parallel for reduction(+ : scale) + for (idx_t i = 0; i < n; i++) { + const float* lut = dis_tables.get() + i * M * ksub; + scale += quantize_lut::aq_estimate_norm_scale(M, ksub, 2, lut); + } + scale /= n; + norm_scale = (int)std::roundf(std::max(scale, 1.0f)); + + if (verbose) { + printf("estimated norm scale: %lf\n", scale); + printf("rounded norm scale: %d\n", norm_scale); + } +} + +/********************************************************* + * Code management functions + *********************************************************/ + +void IndexIVFAdditiveQuantizerFastScan::encode_vectors( + idx_t n, + const float* x, + const idx_t* list_nos, + uint8_t* codes, + bool include_listnos) const { + idx_t bs = 65536; + if (n > bs) { + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min(n, i0 + bs); + encode_vectors( + i1 - i0, + x + i0 * d, + list_nos + i0, + codes + i0 * code_size, + include_listnos); + } + return; + } + + if (by_residual) { + std::vector residuals(n * d); + std::vector centroids(n * d); + +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + if (list_nos[i] < 0) { + memset(residuals.data() + i * d, 0, sizeof(residuals[0]) * d); + } else { + quantizer->compute_residual( + x + i * d, residuals.data() + i * d, list_nos[i]); + } + } + +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + auto c = centroids.data() + i * d; + quantizer->reconstruct(list_nos[i], c); + } + + aq->compute_codes_add_centroids( + residuals.data(), codes, n, centroids.data()); + + } else { + aq->compute_codes(x, codes, n); + } + + if (include_listnos) { + size_t coarse_size = coarse_code_size(); + for (idx_t i = n - 1; i >= 0; i--) { + uint8_t* code = codes + i * (coarse_size + code_size); + memmove(code + coarse_size, codes + i * code_size, code_size); + encode_listno(list_nos[i], code); + } + } +} + +/********************************************************* + * Search functions + *********************************************************/ + +void IndexIVFAdditiveQuantizerFastScan::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, + "IndexIVFAdditiveQuantizerFastScan params have incorrect type"); + } + + FAISS_THROW_IF_NOT(k > 0); + bool 
rescale = (rescale_norm && norm_scale > 1 && metric_type == METRIC_L2); + if (!rescale) { + IndexIVFFastScan::search(n, x, k, distances, labels, params); + return; + } + + NormTableScaler scaler(norm_scale); + if (metric_type == METRIC_L2) { + search_dispatch_implem( + n, x, k, distances, labels, scaler, params); + } else { + search_dispatch_implem( + n, x, k, distances, labels, scaler, params); + } +} + +/********************************************************* + * Look-Up Table functions + *********************************************************/ + +/******************************************************** + +Let q denote the query vector, + x denote the quantized database vector, + c denote the corresponding IVF centroid, + r denote the residual (x - c). + +The L2 distance between q and x is: + + d(q, x) = (q - x)^2 + = (q - c - r)^2 + = q^2 - 2 - 2 + x^2 + +where q^2 is a constant for all x, is only relevant to c, +and x^2 is the quantized database vector norm. + +Different from IVFAdditiveQuantizer, we encode the quantized vector norm x^2 +instead of r^2. So that we only need to compute one LUT for each query vector: + + LUT[m][k] = -2 * + +`-2` could be precomputed in `compute_LUT` and store in `biases`. +if `by_residual=False`, `` is simply 0. + + + +About norm look-up tables: + +To take advantage of the fast SIMD table lookups, we encode the norm by a 2x4 +bits 1D additive quantizer (simply treat the scalar norm as a 1D vector). + +Let `cm` denote the codebooks of the trained 2x4 bits 1D additive quantizer, +size (2, 16); `bm` denote the encoding code of the norm, a 8-bit integer; `cb` +denote the codebooks of the additive quantizer to encode the database vector, +size (M, 16). + +The decoded norm is: + + decoded_norm = cm[0][bm & 15] + cm[1][bm >> 4] + +The decoding is actually doing a table look-up. 
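+
+For example, for a code value bm = 0x52, bm & 15 = 2 and bm >> 4 = 5, so
+
+ decoded_norm = cm[0][2] + cm[1][5]
+
+i.e. two table loads and one addition per stored vector.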
+ +We combine the norm LUTs and the IP LUTs together: + + LUT is a 2D table, size (M + 2, 16) + if m < M : + LUT[m][k] = -2 * + else: + LUT[m][k] = cm[m - M][k] + +********************************************************/ + +bool IndexIVFAdditiveQuantizerFastScan::lookup_table_is_3d() const { + return false; +} + +void IndexIVFAdditiveQuantizerFastScan::compute_LUT( + size_t n, + const float* x, + const idx_t* coarse_ids, + const float*, + AlignedTable& dis_tables, + AlignedTable& biases) const { + const size_t dim12 = ksub * M; + const size_t ip_dim12 = aq->M * ksub; + + dis_tables.resize(n * dim12); + + float coef = 1.0f; + if (metric_type == METRIC_L2) { + coef = -2.0f; + } + + if (by_residual) { + // bias = coef * + // NOTE: q^2 is not added to `biases` + biases.resize(n * nprobe); +#pragma omp parallel + { + std::vector centroid(d); + float* c = centroid.data(); + +#pragma omp for + for (idx_t ij = 0; ij < n * nprobe; ij++) { + int i = ij / nprobe; + quantizer->reconstruct(coarse_ids[ij], c); + biases[ij] = coef * fvec_inner_product(c, x + i * d, d); + } + } + } + + if (metric_type == METRIC_L2) { + const size_t norm_dim12 = 2 * ksub; + + // inner product look-up tables + aq->compute_LUT(n, x, dis_tables.data(), -2.0f, dim12); + + // copy and rescale norm look-up tables + auto norm_tabs = aq->norm_tabs; + if (rescale_norm && norm_scale > 1 && metric_type == METRIC_L2) { + for (size_t i = 0; i < norm_tabs.size(); i++) { + norm_tabs[i] /= norm_scale; + } + } + const float* norm_lut = norm_tabs.data(); + FAISS_THROW_IF_NOT(norm_tabs.size() == norm_dim12); + + // combine them +#pragma omp parallel for if (n > 100) + for (idx_t i = 0; i < n; i++) { + float* tab = dis_tables.data() + i * dim12 + ip_dim12; + memcpy(tab, norm_lut, norm_dim12 * sizeof(*tab)); + } + + } else if (metric_type == METRIC_INNER_PRODUCT) { + aq->compute_LUT(n, x, dis_tables.get()); + } else { + FAISS_THROW_FMT("metric %d not supported", metric_type); + } +} + +void IndexIVFAdditiveQuantizerFastScan::sa_decode( + idx_t n, + const uint8_t* bytes, + float* x) const { + aq->decode(bytes, x, n); +} + +/********** IndexIVFLocalSearchQuantizerFastScan ************/ +IndexIVFLocalSearchQuantizerFastScan::IndexIVFLocalSearchQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits, + MetricType metric, + Search_type_t search_type, + int bbs) + : IndexIVFAdditiveQuantizerFastScan( + quantizer, + nullptr, + d, + nlist, + metric, + bbs), + lsq(d, M, nbits, search_type) { + FAISS_THROW_IF_NOT(nbits == 4); + init(&lsq, nlist, metric, bbs); +} + +IndexIVFLocalSearchQuantizerFastScan::IndexIVFLocalSearchQuantizerFastScan() { + aq = &lsq; +} + +/********** IndexIVFResidualQuantizerFastScan ************/ +IndexIVFResidualQuantizerFastScan::IndexIVFResidualQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits, + MetricType metric, + Search_type_t search_type, + int bbs) + : IndexIVFAdditiveQuantizerFastScan( + quantizer, + nullptr, + d, + nlist, + metric, + bbs), + rq(d, M, nbits, search_type) { + FAISS_THROW_IF_NOT(nbits == 4); + init(&rq, nlist, metric, bbs); +} + +IndexIVFResidualQuantizerFastScan::IndexIVFResidualQuantizerFastScan() { + aq = &rq; +} + +/********** IndexIVFProductLocalSearchQuantizerFastScan ************/ +IndexIVFProductLocalSearchQuantizerFastScan:: + IndexIVFProductLocalSearchQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric, + Search_type_t search_type, 
+ int bbs) + : IndexIVFAdditiveQuantizerFastScan( + quantizer, + nullptr, + d, + nlist, + metric, + bbs), + plsq(d, nsplits, Msub, nbits, search_type) { + FAISS_THROW_IF_NOT(nbits == 4); + init(&plsq, nlist, metric, bbs); +} + +IndexIVFProductLocalSearchQuantizerFastScan:: + IndexIVFProductLocalSearchQuantizerFastScan() { + aq = &plsq; +} + +/********** IndexIVFProductResidualQuantizerFastScan ************/ +IndexIVFProductResidualQuantizerFastScan:: + IndexIVFProductResidualQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric, + Search_type_t search_type, + int bbs) + : IndexIVFAdditiveQuantizerFastScan( + quantizer, + nullptr, + d, + nlist, + metric, + bbs), + prq(d, nsplits, Msub, nbits, search_type) { + FAISS_THROW_IF_NOT(nbits == 4); + init(&prq, nlist, metric, bbs); +} + +IndexIVFProductResidualQuantizerFastScan:: + IndexIVFProductResidualQuantizerFastScan() { + aq = &prq; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h new file mode 100644 index 000000000..24ce7287e --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIVFAdditiveQuantizerFastScan.h @@ -0,0 +1,173 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +/** Fast scan version of IVFAQ. Works for 4-bit AQ for now. + * + * The codes in the inverted lists are not stored sequentially but + * grouped in blocks of size bbs. This makes it possible to very quickly + * compute distances with SIMD instructions. + * + * Implementations (implem): + * 0: auto-select implementation (default) + * 1: orig's search, re-implemented + * 2: orig's search, re-ordered by invlist + * 10: optimizer int16 search, collect results in heap, no qbs + * 11: idem, collect results in reservoir + * 12: optimizer int16 search, collect results in heap, uses qbs + * 13: idem, collect results in reservoir + */ + +struct IndexIVFAdditiveQuantizerFastScan : IndexIVFFastScan { + using Search_type_t = AdditiveQuantizer::Search_type_t; + + AdditiveQuantizer* aq; + + bool rescale_norm = false; + int norm_scale = 1; + + // max number of training vectors + size_t max_train_points; + + IndexIVFAdditiveQuantizerFastScan( + Index* quantizer, + AdditiveQuantizer* aq, + size_t d, + size_t nlist, + MetricType metric = METRIC_L2, + int bbs = 32); + + void init(AdditiveQuantizer* aq, size_t nlist, MetricType metric, int bbs); + + IndexIVFAdditiveQuantizerFastScan(); + + ~IndexIVFAdditiveQuantizerFastScan() override; + + // built from an IndexIVFAQ + explicit IndexIVFAdditiveQuantizerFastScan( + const IndexIVFAdditiveQuantizer& orig, + int bbs = 32); + + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; + + void estimate_norm_scale(idx_t n, const float* x); + + /// same as the regular IVFAQ encoder. 
The codes are not reorganized by + /// blocks a that point + void encode_vectors( + idx_t n, + const float* x, + const idx_t* list_nos, + uint8_t* codes, + bool include_listno = false) const override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + // prepare look-up tables + + bool lookup_table_is_3d() const override; + + void compute_LUT( + size_t n, + const float* x, + const idx_t* coarse_ids, + const float* coarse_dis, + AlignedTable& dis_tables, + AlignedTable& biases) const override; + + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; +}; + +struct IndexIVFLocalSearchQuantizerFastScan + : IndexIVFAdditiveQuantizerFastScan { + LocalSearchQuantizer lsq; + + IndexIVFLocalSearchQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits, + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_lsq2x4, + int bbs = 32); + + IndexIVFLocalSearchQuantizerFastScan(); +}; + +struct IndexIVFResidualQuantizerFastScan : IndexIVFAdditiveQuantizerFastScan { + ResidualQuantizer rq; + + IndexIVFResidualQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits, + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_lsq2x4, + int bbs = 32); + + IndexIVFResidualQuantizerFastScan(); +}; + +struct IndexIVFProductLocalSearchQuantizerFastScan + : IndexIVFAdditiveQuantizerFastScan { + ProductLocalSearchQuantizer plsq; + + IndexIVFProductLocalSearchQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_lsq2x4, + int bbs = 32); + + IndexIVFProductLocalSearchQuantizerFastScan(); +}; + +struct IndexIVFProductResidualQuantizerFastScan + : IndexIVFAdditiveQuantizerFastScan { + ProductResidualQuantizer prq; + + IndexIVFProductResidualQuantizerFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t nsplits, + size_t Msub, + size_t nbits, + MetricType metric = METRIC_L2, + Search_type_t search_type = AdditiveQuantizer::ST_norm_lsq2x4, + int bbs = 32); + + IndexIVFProductResidualQuantizerFastScan(); +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFFastScan.cpp new file mode 100644 index 000000000..0a90ebc03 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIVFFastScan.cpp @@ -0,0 +1,1618 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { + +using namespace simd_result_handlers; + +inline size_t roundup(size_t a, size_t b) { + return (a + b - 1) / b * b; +} + +IndexIVFFastScan::IndexIVFFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t code_size, + MetricType metric, + bool is_cosine) + : IndexIVF(quantizer, d, nlist, code_size, metric) { + // unlike other indexes, we prefer no residuals for performance reasons. 
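+ // (subclasses that do need residual encoding turn it back on themselves,
+ // e.g. IndexIVFAdditiveQuantizerFastScan::init() sets by_residual = true)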
+ by_residual = false; + FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); + + this->is_cosine = is_cosine; +} + +IndexIVFFastScan::IndexIVFFastScan() { + bbs = 0; + M2 = 0; + is_trained = false; + by_residual = false; +} + +void IndexIVFFastScan::init_fastscan( + size_t M, + size_t nbits, + size_t nlist, + MetricType /* metric */, + int bbs) { + FAISS_THROW_IF_NOT(bbs % 32 == 0); + FAISS_THROW_IF_NOT(nbits == 4); + + this->M = M; + this->nbits = nbits; + this->bbs = bbs; + ksub = (1 << nbits); + M2 = roundup(M, 2); + code_size = M2 / 2; + + is_trained = false; + replace_invlists(new BlockInvertedLists(nlist, get_CodePacker()), true); +} + +void IndexIVFFastScan::init_code_packer() { + auto bil = dynamic_cast(invlists); + FAISS_THROW_IF_NOT(bil); + delete bil->packer; // in case there was one before + bil->packer = get_CodePacker(); +} + +IndexIVFFastScan::~IndexIVFFastScan() = default; + +/********************************************************* + * Code management functions + *********************************************************/ + +void IndexIVFFastScan::train(idx_t n, const float* x) { + if (is_cosine) { + auto norm_data = std::make_unique(n * d); + std::memcpy(norm_data.get(), x, n * d * sizeof(float)); + knowhere::NormalizeVecs(norm_data.get(), n, d); + IndexIVF::train(n, norm_data.get()); + } else { + IndexIVF::train(n, x); + } +} + +void IndexIVFFastScan::add_with_ids( + idx_t n, + const float* x, + const idx_t* xids) { + if (is_cosine) { + auto norm_data = std::make_unique(n * d); + std::memcpy(norm_data.get(), x, n * d * sizeof(float)); + norms = std::move(knowhere::NormalizeVecs(norm_data.get(), n, d)); + add_with_ids_impl(n, norm_data.get(), xids); + } else { + add_with_ids_impl(n, x, xids); + } +} + +// knowhere-specific function +void IndexIVFFastScan::add_with_ids_impl( + idx_t n, + const float* x, + const idx_t* xids) { + FAISS_THROW_IF_NOT(is_trained); + + // do some blocking to avoid excessive allocs + constexpr idx_t bs = 65536; + if (n > bs) { + double t0 = getmillisecs(); + for (idx_t i0 = 0; i0 < n; i0 += bs) { + idx_t i1 = std::min(n, i0 + bs); + if (verbose) { + double t1 = getmillisecs(); + double elapsed_time = (t1 - t0) / 1000; + double total_time = 0; + if (i0 != 0) { + total_time = elapsed_time / i0 * n; + } + size_t mem = get_mem_usage_kb() / (1 << 10); + + printf("IndexIVFFastScan::add_with_ids %zd/%zd, time %.2f/%.2f, RSS %zdMB\n", + size_t(i1), + size_t(n), + elapsed_time, + total_time, + mem); + } + add_with_ids_impl(i1 - i0, x + i0 * d, xids ? 
xids + i0 : nullptr); + } + return; + } + InterruptCallback::check(); + + direct_map.check_can_add(xids); + std::unique_ptr idx(new idx_t[n]); + quantizer->assign(n, x, idx.get()); + + AlignedTable flat_codes(n * code_size); + encode_vectors(n, x, idx.get(), flat_codes.get()); + + DirectMapAdd dm_adder(direct_map, n, xids); + BlockInvertedLists* bil = dynamic_cast(invlists); + FAISS_THROW_IF_NOT_MSG(bil, "only block inverted lists supported"); + + // prepare batches + std::vector order(n); + for (idx_t i = 0; i < n; i++) { + order[i] = i; + } + + // TODO should not need stable + std::stable_sort(order.begin(), order.end(), [&idx](idx_t a, idx_t b) { + return idx[a] < idx[b]; + }); + + // TODO parallelize + idx_t i0 = 0; + while (i0 < n) { + idx_t list_no = idx[order[i0]]; + idx_t i1 = i0 + 1; + while (i1 < n && idx[order[i1]] == list_no) { + i1++; + } + + if (list_no == -1) { + i0 = i1; + continue; + } + + // make linear array + AlignedTable list_codes((i1 - i0) * code_size); + size_t list_size = bil->list_size(list_no); + + bil->resize(list_no, list_size + i1 - i0); + + for (idx_t i = i0; i < i1; i++) { + size_t ofs = list_size + i - i0; + idx_t id = xids ? xids[order[i]] : ntotal + order[i]; + dm_adder.add(order[i], list_no, ofs); + bil->ids[list_no][ofs] = id; + memcpy(list_codes.data() + (i - i0) * code_size, + flat_codes.data() + order[i] * code_size, + code_size); + } + pq4_pack_codes_range( + list_codes.data(), + M, + list_size, + list_size + i1 - i0, + bbs, + M2, + bil->codes[list_no].data()); + + i0 = i1; + } + + ntotal += n; +} + +CodePacker* IndexIVFFastScan::get_CodePacker() const { + return new CodePackerPQ4(M, bbs); +} + +/********************************************************* + * search + *********************************************************/ + +namespace { + +template +void estimators_from_tables_generic( + const IndexIVFFastScan& index, + const uint8_t* codes, + size_t ncodes, + const dis_t* dis_table, + const int64_t* ids, + float bias, + size_t k, + typename C::T* heap_dis, + int64_t* heap_ids, + const Scaler& scaler) { + using accu_t = typename C::T; + for (size_t j = 0; j < ncodes; ++j) { + BitstringReader bsr(codes + j * index.code_size, index.code_size); + accu_t dis = bias; + const dis_t* __restrict dt = dis_table; + for (size_t m = 0; m < index.M - scaler.nscale; m++) { + uint64_t c = bsr.read(index.nbits); + dis += dt[c]; + dt += index.ksub; + } + + for (size_t m = 0; m < scaler.nscale; m++) { + uint64_t c = bsr.read(index.nbits); + dis += scaler.scale_one(dt[c]); + dt += index.ksub; + } + + if (C::cmp(heap_dis[0], dis)) { + // todo aguzhva: replace with heap_replace_top + heap_pop(k, heap_dis, heap_ids); + heap_push(k, heap_dis, heap_ids, dis, ids[j]); + } + } +} + +using namespace quantize_lut; + +} // anonymous namespace + +/********************************************************* + * Look-Up Table functions + *********************************************************/ + +void IndexIVFFastScan::compute_LUT_uint8( + size_t n, + const float* x, + const idx_t* coarse_ids, + const float* coarse_dis, + AlignedTable& dis_tables, + AlignedTable& biases, + float* normalizers) const { + AlignedTable dis_tables_float; + AlignedTable biases_float; + + uint64_t t0 = get_cy(); + compute_LUT(n, x, coarse_ids, coarse_dis, dis_tables_float, biases_float); + // IVFFastScan_stats.t_compute_distance_tables += get_cy() - t0; + + bool lut_is_3d = lookup_table_is_3d(); + size_t dim123 = ksub * M; + size_t dim123_2 = ksub * M2; + if (lut_is_3d) { + dim123 *= nprobe; + dim123_2 
*= nprobe; + } + dis_tables.resize(n * dim123_2); + if (biases_float.get()) { + biases.resize(n * nprobe); + } + uint64_t t1 = get_cy(); + +#pragma omp parallel for if (n > 100) + for (int64_t i = 0; i < n; i++) { + const float* t_in = dis_tables_float.get() + i * dim123; + const float* b_in = nullptr; + uint8_t* t_out = dis_tables.get() + i * dim123_2; + uint16_t* b_out = nullptr; + if (biases_float.get()) { + b_in = biases_float.get() + i * nprobe; + b_out = biases.get() + i * nprobe; + } + + quantize_LUT_and_bias( + nprobe, + M, + ksub, + lut_is_3d, + t_in, + b_in, + t_out, + M2, + b_out, + normalizers + 2 * i, + normalizers + 2 * i + 1); + } + // IVFFastScan_stats.t_round += get_cy() - t1; +} + +/********************************************************* + * Search functions + *********************************************************/ + +void IndexIVFFastScan::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, "IndexIVFFastScan params have incorrect type"); + } + + FAISS_THROW_IF_NOT(k > 0); + + DummyScaler scaler; + if (metric_type == METRIC_L2) { + search_dispatch_implem( + n, x, k, distances, labels, scaler, params); + } else { + search_dispatch_implem( + n, x, k, distances, labels, scaler, params); + } +} + +void IndexIVFFastScan::range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, "IndexIVFFastScan params have incorrect type"); + } + + DummyScaler scaler; + if (metric_type == METRIC_L2) { + range_search_dispatch_implem( + n, x, radius, result, scaler, params); + } else { + range_search_dispatch_implem( + n, x, radius, result, scaler, params); + } +} + +template +void IndexIVFFastScan::search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler, + const IVFSearchParameters* params) const { + const idx_t nprobe = params ? 
params->nprobe : this->nprobe; + + using Cfloat = typename std::conditional< + is_max, + CMax, + CMin>::type; + + using C = typename std::conditional< + is_max, + CMax, + CMin>::type; + + if (n == 0) { + return; + } + + // actual implementation used + int impl = implem; + + if (impl == 0) { + if (bbs == 32) { + impl = 12; + } else { + impl = 10; + } + if (k > 20) { + impl++; + } + } + + if (impl == 1) { + search_implem_1(n, x, k, distances, labels, scaler, params); + } else if (impl == 2) { + search_implem_2(n, x, k, distances, labels, scaler, params); + + } else if (impl >= 10 && impl <= 15) { + size_t ndis = 0, nlist_visited = 0; + + if (n < 2) { + if (impl == 12 || impl == 13) { + search_implem_12( + n, + x, + k, + distances, + labels, + impl, + &ndis, + &nlist_visited, + scaler, + params); + } else if (impl == 14 || impl == 15) { + search_implem_14( + n, x, k, distances, labels, impl, scaler, params); + } else { + search_implem_10( + n, + x, + k, + distances, + labels, + impl, + &ndis, + &nlist_visited, + scaler, + params); + } + } else { + // explicitly slice over threads + int nslice; + if (n <= omp_get_max_threads()) { + nslice = n; + } else if (lookup_table_is_3d()) { + // make sure we don't make too big LUT tables + size_t lut_size_per_query = + M * ksub * nprobe * (sizeof(float) + sizeof(uint8_t)); + + size_t max_lut_size = precomputed_table_max_bytes; + // how many queries we can handle within mem budget + size_t nq_ok = + std::max(max_lut_size / lut_size_per_query, size_t(1)); + nslice = + roundup(std::max(size_t(n / nq_ok), size_t(1)), + omp_get_max_threads()); + } else { + // LUTs unlikely to be a limiting factor + nslice = omp_get_max_threads(); + } + if (impl == 14 || + impl == 15) { // this might require slicing if there are too + // many queries (for now we keep this simple) + search_implem_14( + n, x, k, distances, labels, impl, scaler, params); + } else { +#pragma omp parallel for reduction(+ : ndis, nlist_visited) + for (int slice = 0; slice < nslice; slice++) { + idx_t i0 = n * slice / nslice; + idx_t i1 = n * (slice + 1) / nslice; + float* dis_i = distances + i0 * k; + idx_t* lab_i = labels + i0 * k; + if (impl == 12 || impl == 13) { + search_implem_12( + i1 - i0, + x + i0 * d, + k, + dis_i, + lab_i, + impl, + &ndis, + &nlist_visited, + scaler, + params); + } else { + search_implem_10( + i1 - i0, + x + i0 * d, + k, + dis_i, + lab_i, + impl, + &ndis, + &nlist_visited, + scaler, + params); + } + } + } + } + indexIVF_stats.nq += n; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nlist += nlist_visited; + } else { + FAISS_THROW_FMT("implem %d does not exist", implem); + } +} + +template +void IndexIVFFastScan::search_implem_1( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler, + const IVFSearchParameters* params) const { + FAISS_THROW_IF_NOT(orig_invlists); + + const size_t nprobe = params ? params->nprobe : this->nprobe; + const size_t max_codes = params ? params->max_codes : this->max_codes; + const IDSelector* sel = params ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? 
params->quantizer_params : nullptr; + + std::unique_ptr coarse_ids(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_ids.get(), quantizer_params); + + size_t dim12 = ksub * M; + AlignedTable dis_tables; + AlignedTable biases; + + compute_LUT(n, x, coarse_ids.get(), coarse_dis.get(), dis_tables, biases); + + bool single_LUT = !lookup_table_is_3d(); + + size_t ndis = 0, nlist_visited = 0; + +#pragma omp parallel for reduction(+ : ndis, nlist_visited) + for (idx_t i = 0; i < n; i++) { + int64_t* heap_ids = labels + i * k; + float* heap_dis = distances + i * k; + heap_heapify(k, heap_dis, heap_ids); + float* LUT = nullptr; + + if (single_LUT) { + LUT = dis_tables.get() + i * dim12; + } + for (idx_t j = 0; j < nprobe; j++) { + if (!single_LUT) { + LUT = dis_tables.get() + (i * nprobe + j) * dim12; + } + idx_t list_no = coarse_ids[i * nprobe + j]; + if (list_no < 0) + continue; + size_t ls = orig_invlists->list_size(list_no); + if (ls == 0) + continue; + InvertedLists::ScopedCodes codes(orig_invlists, list_no); + InvertedLists::ScopedIds ids(orig_invlists, list_no); + + float bias = biases.get() ? biases[i * nprobe + j] : 0; + + estimators_from_tables_generic( + *this, + codes.get(), + ls, + LUT, + ids.get(), + bias, + k, + heap_dis, + heap_ids, + scaler); + nlist_visited++; + ndis++; + } + heap_reorder(k, heap_dis, heap_ids); + } + indexIVF_stats.nq += n; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nlist += nlist_visited; +} + +template +void IndexIVFFastScan::search_implem_2( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler, + const IVFSearchParameters* params) const { + FAISS_THROW_IF_NOT(orig_invlists); + + const size_t nprobe = params ? params->nprobe : this->nprobe; + const size_t max_codes = params ? params->max_codes : this->max_codes; + const IDSelector* sel = params ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + + std::unique_ptr coarse_ids(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_ids.get(), quantizer_params); + + size_t dim12 = ksub * M2; + AlignedTable dis_tables; + AlignedTable biases; + std::unique_ptr normalizers(new float[2 * n]); + + compute_LUT_uint8( + n, + x, + coarse_ids.get(), + coarse_dis.get(), + dis_tables, + biases, + normalizers.get()); + + bool single_LUT = !lookup_table_is_3d(); + + size_t ndis = 0, nlist_visited = 0; + +#pragma omp parallel for reduction(+ : ndis, nlist_visited) + for (idx_t i = 0; i < n; i++) { + std::vector tmp_dis(k); + int64_t* heap_ids = labels + i * k; + uint16_t* heap_dis = tmp_dis.data(); + heap_heapify(k, heap_dis, heap_ids); + const uint8_t* LUT = nullptr; + + if (single_LUT) { + LUT = dis_tables.get() + i * dim12; + } + for (idx_t j = 0; j < nprobe; j++) { + if (!single_LUT) { + LUT = dis_tables.get() + (i * nprobe + j) * dim12; + } + idx_t list_no = coarse_ids[i * nprobe + j]; + if (list_no < 0) + continue; + size_t ls = orig_invlists->list_size(list_no); + if (ls == 0) + continue; + InvertedLists::ScopedCodes codes(orig_invlists, list_no); + InvertedLists::ScopedIds ids(orig_invlists, list_no); + + uint16_t bias = biases.get() ? 
biases[i * nprobe + j] : 0; + + estimators_from_tables_generic( + *this, + codes.get(), + ls, + LUT, + ids.get(), + bias, + k, + heap_dis, + heap_ids, + scaler); + + nlist_visited++; + ndis += ls; + } + heap_reorder(k, heap_dis, heap_ids); + // convert distances to float + { + float one_a = 1 / normalizers[2 * i], b = normalizers[2 * i + 1]; + if (skip & 16) { + one_a = 1; + b = 0; + } + float* heap_dis_float = distances + i * k; + for (int j = 0; j < k; j++) { + heap_dis_float[j] = b + heap_dis[j] * one_a; + } + } + } + indexIVF_stats.nq += n; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nlist += nlist_visited; +} + +template +void IndexIVFFastScan::search_implem_10( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + size_t* ndis_out, + size_t* nlist_out, + const Scaler& scaler, + const IVFSearchParameters* params) const { + const size_t nprobe = params ? params->nprobe : this->nprobe; + const size_t max_codes = params ? params->max_codes : this->max_codes; + const IDSelector* sel = params ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + + memset(distances, -1, sizeof(float) * k * n); + memset(labels, -1, sizeof(idx_t) * k * n); + + using HeapHC = HeapHandler; + using ReservoirHC = ReservoirHandler; + using SingleResultHC = SingleResultHandler; + + std::unique_ptr coarse_ids(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + uint64_t times[10]; + memset(times, 0, sizeof(times)); + int ti = 0; +#define TIC times[ti++] = get_cy() + TIC; + + quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_ids.get(), quantizer_params); + + TIC; + + size_t dim12 = ksub * M2; + AlignedTable dis_tables; + AlignedTable biases; + std::unique_ptr normalizers(new float[2 * n]); + + compute_LUT_uint8( + n, + x, + coarse_ids.get(), + coarse_dis.get(), + dis_tables, + biases, + normalizers.get()); + + TIC; + + bool single_LUT = !lookup_table_is_3d(); + + TIC; + size_t ndis = 0, nlist_visited = 0; + + { + AlignedTable tmp_distances(k); + for (idx_t i = 0; i < n; i++) { + const uint8_t* LUT = nullptr; + int qmap1[1] = {0}; + std::unique_ptr> handler; + + if (k == 1) { + handler.reset(new SingleResultHC(1, 0, sel)); + } else if (impl == 10) { + handler.reset(new HeapHC( + 1, tmp_distances.get(), labels + i * k, k, 0, sel)); + } else if (impl == 11) { + handler.reset(new ReservoirHC(1, 0, k, 2 * k, sel)); + } else { + FAISS_THROW_MSG("invalid"); + } + + handler->q_map = qmap1; + + if (single_LUT) { + LUT = dis_tables.get() + i * dim12; + } + for (idx_t j = 0; j < nprobe; j++) { + size_t ij = i * nprobe + j; + if (!single_LUT) { + LUT = dis_tables.get() + ij * dim12; + } + if (biases.get()) { + handler->dbias = biases.get() + ij; + } + + idx_t list_no = coarse_ids[ij]; + if (list_no < 0) + continue; + size_t ls = invlists->list_size(list_no); + if (ls == 0) + continue; + + InvertedLists::ScopedCodes codes(invlists, list_no); + InvertedLists::ScopedIds ids(invlists, list_no); + + handler->ntotal = ls; + handler->id_map = ids.get(); + +#define DISPATCH(classHC) \ + if (dynamic_cast(handler.get())) { \ + auto* res = static_cast(handler.get()); \ + pq4_accumulate_loop( \ + 1, roundup(ls, bbs), bbs, M2, codes.get(), LUT, *res, scaler); \ + } + DISPATCH(HeapHC) + else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC) +#undef DISPATCH + + nlist_visited++; + ndis++; + } + + handler->to_flat_arrays( + distances + i * k, + labels + i * k, + skip & 16 ? 
nullptr : normalizers.get() + i * 2); + } + } + *ndis_out = ndis; + *nlist_out = nlist; +} + +template +void IndexIVFFastScan::search_implem_12( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + size_t* ndis_out, + size_t* nlist_out, + const Scaler& scaler, + const IVFSearchParameters* params) const { + if (n == 0) { // does not work well with reservoir + return; + } + FAISS_THROW_IF_NOT(bbs == 32); + + const size_t nprobe = params ? params->nprobe : this->nprobe; + const size_t max_codes = params ? params->max_codes : this->max_codes; + const IDSelector* sel = params ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + + std::unique_ptr coarse_ids(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + uint64_t times[10]; + memset(times, 0, sizeof(times)); + int ti = 0; +#define TIC times[ti++] = get_cy() + TIC; + + quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_ids.get(), quantizer_params); + + TIC; + + size_t dim12 = ksub * M2; + AlignedTable dis_tables; + AlignedTable biases; + std::unique_ptr normalizers(new float[2 * n]); + + compute_LUT_uint8( + n, + x, + coarse_ids.get(), + coarse_dis.get(), + dis_tables, + biases, + normalizers.get()); + + TIC; + + struct QC { + int qno; // sequence number of the query + int list_no; // list to visit + int rank; // this is the rank'th result of the coarse quantizer + }; + bool single_LUT = !lookup_table_is_3d(); + + std::vector qcs; + { + int ij = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < nprobe; j++) { + if (coarse_ids[ij] >= 0) { + qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)}); + } + ij++; + } + } + std::sort(qcs.begin(), qcs.end(), [](const QC& a, const QC& b) { + return a.list_no < b.list_no; + }); + } + TIC; + + // prepare the result handlers + + std::unique_ptr> handler; + AlignedTable tmp_distances; + + using HeapHC = HeapHandler; + using ReservoirHC = ReservoirHandler; + using SingleResultHC = SingleResultHandler; + + if (k == 1) { + handler.reset(new SingleResultHC(n, 0, sel)); + } else if (impl == 12) { + tmp_distances.resize(n * k); + handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0, sel)); + } else if (impl == 13) { + handler.reset(new ReservoirHC(n, 0, k, 2 * k, sel)); + } + + int qbs2 = this->qbs2 ? this->qbs2 : 11; + + std::vector tmp_bias; + if (biases.get()) { + tmp_bias.resize(qbs2); + handler->dbias = tmp_bias.data(); + } + TIC; + + size_t ndis = 0; + + size_t i0 = 0; + uint64_t t_copy_pack = 0, t_scan = 0; + while (i0 < qcs.size()) { + uint64_t tt0 = get_cy(); + + // find all queries that access this inverted list + int list_no = qcs[i0].list_no; + size_t i1 = i0 + 1; + + while (i1 < qcs.size() && i1 < i0 + qbs2) { + if (qcs[i1].list_no != list_no) { + break; + } + i1++; + } + + size_t list_size = invlists->list_size(list_no); + + if (list_size == 0) { + i0 = i1; + continue; + } + + // re-organize LUTs and biases into the right order + int nc = i1 - i0; + + std::vector q_map(nc), lut_entries(nc); + AlignedTable LUT(nc * dim12); + memset(LUT.get(), -1, nc * dim12); + int qbs = pq4_preferred_qbs(nc); + + for (size_t i = i0; i < i1; i++) { + const QC& qc = qcs[i]; + q_map[i - i0] = qc.qno; + int ij = qc.qno * nprobe + qc.rank; + lut_entries[i - i0] = single_LUT ? 
qc.qno : ij; + if (biases.get()) { + tmp_bias[i - i0] = biases[ij]; + } + } + pq4_pack_LUT_qbs_q_map( + qbs, M2, dis_tables.get(), lut_entries.data(), LUT.get()); + + // access the inverted list + + ndis += (i1 - i0) * list_size; + + InvertedLists::ScopedCodes codes(invlists, list_no); + InvertedLists::ScopedIds ids(invlists, list_no); + + // prepare the handler + + handler->ntotal = list_size; + handler->q_map = q_map.data(); + handler->id_map = ids.get(); + uint64_t tt1 = get_cy(); + +#define DISPATCH(classHC) \ + if (dynamic_cast(handler.get())) { \ + auto* res = static_cast(handler.get()); \ + pq4_accumulate_loop_qbs( \ + qbs, list_size, M2, codes.get(), LUT.get(), *res, scaler); \ + } + DISPATCH(HeapHC) + else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC) + + // prepare for next loop + i0 = i1; + + uint64_t tt2 = get_cy(); + t_copy_pack += tt1 - tt0; + t_scan += tt2 - tt1; + } + TIC; + + // labels is in-place for HeapHC + handler->to_flat_arrays( + distances, labels, skip & 16 ? nullptr : normalizers.get()); + + TIC; + + // these stats are not thread-safe + + // for (int i = 1; i < ti; i++) { + // IVFFastScan_stats.times[i] += times[i] - times[i - 1]; + // } + // IVFFastScan_stats.t_copy_pack += t_copy_pack; + // IVFFastScan_stats.t_scan += t_scan; + // + // if (auto* rh = dynamic_cast(handler.get())) { + // for (int i = 0; i < 4; i++) { + // IVFFastScan_stats.reservoir_times[i] += rh->times[i]; + // } + // } + + *ndis_out = ndis; + *nlist_out = nlist; +} + +template +void IndexIVFFastScan::search_implem_14( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + const Scaler& scaler, + const IVFSearchParameters* params) const { + if (n == 0) { // does not work well with reservoir + return; + } + FAISS_THROW_IF_NOT(bbs == 32); + + const size_t nprobe = params ? params->nprobe : this->nprobe; + const size_t max_codes = params ? params->max_codes : this->max_codes; + const IDSelector* sel = params ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? 
params->quantizer_params : nullptr; + + std::unique_ptr coarse_ids(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + uint64_t ttg0 = get_cy(); + + quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_ids.get(), quantizer_params); + + uint64_t ttg1 = get_cy(); + uint64_t coarse_search_tt = ttg1 - ttg0; + + size_t dim12 = ksub * M2; + AlignedTable dis_tables; + AlignedTable biases; + std::unique_ptr normalizers(new float[2 * n]); + + compute_LUT_uint8( + n, + x, + coarse_ids.get(), + coarse_dis.get(), + dis_tables, + biases, + normalizers.get()); + + uint64_t ttg2 = get_cy(); + uint64_t lut_compute_tt = ttg2 - ttg1; + + struct QC { + int qno; // sequence number of the query + int list_no; // list to visit + int rank; // this is the rank'th result of the coarse quantizer + }; + bool single_LUT = !lookup_table_is_3d(); + + std::vector qcs; + { + int ij = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < nprobe; j++) { + if (coarse_ids[ij] >= 0) { + qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)}); + } + ij++; + } + } + std::sort(qcs.begin(), qcs.end(), [](const QC& a, const QC& b) { + return a.list_no < b.list_no; + }); + } + + struct SE { + size_t start; // start in the QC vector + size_t end; // end in the QC vector + size_t list_size; + }; + std::vector ses; + size_t i0_l = 0; + while (i0_l < qcs.size()) { + // find all queries that access this inverted list + int list_no = qcs[i0_l].list_no; + size_t i1 = i0_l + 1; + + while (i1 < qcs.size() && i1 < i0_l + qbs2) { + if (qcs[i1].list_no != list_no) { + break; + } + i1++; + } + + size_t list_size = invlists->list_size(list_no); + + if (list_size == 0) { + i0_l = i1; + continue; + } + ses.push_back(SE{i0_l, i1, list_size}); + i0_l = i1; + } + uint64_t ttg3 = get_cy(); + uint64_t compute_clusters_tt = ttg3 - ttg2; + + // function to handle the global heap + using HeapForIP = CMin; + using HeapForL2 = CMax; + auto init_result = [&](float* simi, idx_t* idxi) { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_heapify(k, simi, idxi); + } else { + heap_heapify(k, simi, idxi); + } + }; + + auto add_local_results = [&](const float* local_dis, + const idx_t* local_idx, + float* simi, + idx_t* idxi) { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_addn(k, simi, idxi, local_dis, local_idx, k); + } else { + heap_addn(k, simi, idxi, local_dis, local_idx, k); + } + }; + + auto reorder_result = [&](float* simi, idx_t* idxi) { + if (metric_type == METRIC_INNER_PRODUCT) { + heap_reorder(k, simi, idxi); + } else { + heap_reorder(k, simi, idxi); + } + }; + uint64_t ttg4 = get_cy(); + uint64_t fn_tt = ttg4 - ttg3; + + size_t ndis = 0; + size_t nlist_visited = 0; + +#pragma omp parallel reduction(+ : ndis, nlist_visited) + { + // storage for each thread + std::vector local_idx(k * n); + std::vector local_dis(k * n); + + // prepare the result handlers + std::unique_ptr> handler; + AlignedTable tmp_distances; + + using HeapHC = HeapHandler; + using ReservoirHC = ReservoirHandler; + using SingleResultHC = SingleResultHandler; + + if (k == 1) { + handler.reset(new SingleResultHC(n, 0, sel)); + } else if (impl == 14) { + tmp_distances.resize(n * k); + handler.reset(new HeapHC( + n, tmp_distances.get(), local_idx.data(), k, 0, sel)); + } else if (impl == 15) { + handler.reset(new ReservoirHC(n, 0, k, 2 * k, sel)); + } + + int qbs2 = this->qbs2 ? 
this->qbs2 : 11; + + std::vector tmp_bias; + if (biases.get()) { + tmp_bias.resize(qbs2); + handler->dbias = tmp_bias.data(); + } + + uint64_t ttg5 = get_cy(); + uint64_t handler_tt = ttg5 - ttg4; + + std::set q_set; + uint64_t t_copy_pack = 0, t_scan = 0; +#pragma omp for schedule(dynamic) + for (idx_t cluster = 0; cluster < ses.size(); cluster++) { + uint64_t tt0 = get_cy(); + size_t i0 = ses[cluster].start; + size_t i1 = ses[cluster].end; + size_t list_size = ses[cluster].list_size; + nlist_visited++; + int list_no = qcs[i0].list_no; + + // re-organize LUTs and biases into the right order + int nc = i1 - i0; + + std::vector q_map(nc), lut_entries(nc); + AlignedTable LUT(nc * dim12); + memset(LUT.get(), -1, nc * dim12); + int qbs = pq4_preferred_qbs(nc); + + for (size_t i = i0; i < i1; i++) { + const QC& qc = qcs[i]; + q_map[i - i0] = qc.qno; + q_set.insert(qc.qno); + int ij = qc.qno * nprobe + qc.rank; + lut_entries[i - i0] = single_LUT ? qc.qno : ij; + if (biases.get()) { + tmp_bias[i - i0] = biases[ij]; + } + } + pq4_pack_LUT_qbs_q_map( + qbs, M2, dis_tables.get(), lut_entries.data(), LUT.get()); + + // access the inverted list + + ndis += (i1 - i0) * list_size; + + InvertedLists::ScopedCodes codes(invlists, list_no); + InvertedLists::ScopedIds ids(invlists, list_no); + + // prepare the handler + + handler->ntotal = list_size; + handler->q_map = q_map.data(); + handler->id_map = ids.get(); + uint64_t tt1 = get_cy(); + +#define DISPATCH(classHC) \ + if (dynamic_cast(handler.get())) { \ + auto* res = static_cast(handler.get()); \ + pq4_accumulate_loop_qbs( \ + qbs, list_size, M2, codes.get(), LUT.get(), *res, scaler); \ + } + DISPATCH(HeapHC) + else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC) + + uint64_t tt2 = get_cy(); + t_copy_pack += tt1 - tt0; + t_scan += tt2 - tt1; + } + + // labels is in-place for HeapHC + handler->to_flat_arrays( + local_dis.data(), + local_idx.data(), + skip & 16 ? 
nullptr : normalizers.get()); + +#pragma omp single + { + // we init the results as a heap + for (idx_t i = 0; i < n; i++) { + init_result(distances + i * k, labels + i * k); + } + } +#pragma omp barrier +#pragma omp critical + { + // write to global heap #go over only the queries + for (std::set::iterator it = q_set.begin(); it != q_set.end(); + ++it) { + add_local_results( + local_dis.data() + *it * k, + local_idx.data() + *it * k, + distances + *it * k, + labels + *it * k); + } + + // IVFFastScan_stats.t_copy_pack += t_copy_pack; + // IVFFastScan_stats.t_scan += t_scan; + // + // if (auto* rh = dynamic_cast(handler.get())) { + // for (int i = 0; i < 4; i++) { + // IVFFastScan_stats.reservoir_times[i] += rh->times[i]; + // } + // } + } +#pragma omp barrier +#pragma omp single + { + for (idx_t i = 0; i < n; i++) { + reorder_result(distances + i * k, labels + i * k); + } + } + } + + indexIVF_stats.nq += n; + indexIVF_stats.ndis += ndis; + indexIVF_stats.nlist += nlist_visited; +} + +template +void IndexIVFFastScan::range_search_dispatch_implem( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const Scaler& scaler, + const IVFSearchParameters* params) const { + using Cfloat = typename std::conditional< + is_max, + CMax, + CMin>::type; + + if (n == 0) { + return; + } + + size_t ndis = 0, nlist_visited = 0; + + // currently, only impl 12 is implemented + constexpr int impl = 12; + range_search_implem_12( + n, x, radius, result, impl, &ndis, &nlist_visited, scaler, params); +} + +template +void IndexIVFFastScan::range_search_implem_12( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + int impl, + size_t* ndis_out, + size_t* nlist_out, + const Scaler& scaler, + const IVFSearchParameters* params) const { + if (n == 0) { // does not work well with reservoir + return; + } + FAISS_THROW_IF_NOT(n == 1); // in knowhere, all request will make nq=1 + FAISS_THROW_IF_NOT( + impl == 12); // the only implementation is supported so far + FAISS_THROW_IF_NOT(bbs == 32); + + const size_t nprobe = params ? params->nprobe : this->nprobe; + const size_t max_codes = params ? params->max_codes : this->max_codes; + const IDSelector* sel = params ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? 
params->quantizer_params : nullptr; + + std::unique_ptr coarse_ids(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + uint64_t times[10]; + memset(times, 0, sizeof(times)); + int ti = 0; +#define TIC times[ti++] = get_cy() + TIC; + + quantizer->search( + n, x, nprobe, coarse_dis.get(), coarse_ids.get(), quantizer_params); + + TIC; + + size_t dim12 = ksub * M2; + AlignedTable dis_tables; + AlignedTable biases; + std::unique_ptr normalizers(new float[2 * n]); + + compute_LUT_uint8( + n, + x, + coarse_ids.get(), + coarse_dis.get(), + dis_tables, + biases, + normalizers.get()); + + TIC; + + struct QC { + int qno; // sequence number of the query + int list_no; // list to visit + int rank; // this is the rank'th result of the coarse quantizer + }; + bool single_LUT = !lookup_table_is_3d(); + + std::vector qcs; + { + int ij = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < nprobe; j++) { + if (coarse_ids[ij] >= 0) { + qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)}); + } + ij++; + } + } + std::sort(qcs.begin(), qcs.end(), [](const QC& a, const QC& b) { + return a.list_no < b.list_no; + }); + } + TIC; + + // prepare the result handlers + std::unique_ptr> handler( + new RangeSearchResultHandler(result, radius, 0, sel)); + handler->normalizers = normalizers.get(); + int qbs2 = this->qbs2 ? this->qbs2 : 11; + + std::vector tmp_bias; + if (biases.get()) { + tmp_bias.resize(qbs2); + handler->dbias = tmp_bias.data(); + } + TIC; + + size_t ndis = 0; + + size_t i0 = 0; + uint64_t t_copy_pack = 0, t_scan = 0; + while (i0 < qcs.size()) { + uint64_t tt0 = get_cy(); + + // find all queries that access this inverted list + int list_no = qcs[i0].list_no; + size_t i1 = i0 + 1; + + while (i1 < qcs.size() && i1 < i0 + qbs2) { + if (qcs[i1].list_no != list_no) { + break; + } + i1++; + } + + size_t list_size = invlists->list_size(list_no); + + if (list_size == 0) { + i0 = i1; + continue; + } + + // re-organize LUTs and biases into the right order + int nc = i1 - i0; + + std::vector q_map(nc), lut_entries(nc); + AlignedTable LUT(nc * dim12); + memset(LUT.get(), -1, nc * dim12); + int qbs = pq4_preferred_qbs(nc); + + for (size_t i = i0; i < i1; i++) { + const QC& qc = qcs[i]; + q_map[i - i0] = qc.qno; + int ij = qc.qno * nprobe + qc.rank; + lut_entries[i - i0] = single_LUT ? 
qc.qno : ij; + if (biases.get()) { + tmp_bias[i - i0] = biases[ij]; + } + } + pq4_pack_LUT_qbs_q_map( + qbs, M2, dis_tables.get(), lut_entries.data(), LUT.get()); + + // access the inverted list + + ndis += (i1 - i0) * list_size; + + InvertedLists::ScopedCodes codes(invlists, list_no); + InvertedLists::ScopedIds ids(invlists, list_no); + + // prepare the handler + + handler->ntotal = list_size; + handler->q_map = q_map.data(); + handler->id_map = ids.get(); + handler->in_range_num = 0; + uint64_t tt1 = get_cy(); + + pq4_accumulate_loop_qbs( + qbs, + list_size, + M2, + codes.get(), + LUT.get(), + *(handler.get()), + scaler); + if (handler->in_range_num <= 0) { + break; + } + + // prepare for next loop + i0 = i1; + + uint64_t tt2 = get_cy(); + t_copy_pack += tt1 - tt0; + t_scan += tt2 - tt1; + } + TIC; + + handler->to_result(); + + TIC; + + // these stats are not thread-safe + + // for (int i = 1; i < ti; i++) { + // IVFFastScan_stats.times[i] += times[i] - times[i - 1]; + // } + // IVFFastScan_stats.t_copy_pack += t_copy_pack; + // IVFFastScan_stats.t_scan += t_scan; + // + // if (auto* rh = dynamic_cast(handler.get())) { + // for (int i = 0; i < 4; i++) { + // IVFFastScan_stats.reservoir_times[i] += rh->times[i]; + // } + // } + + *ndis_out = ndis; + *nlist_out = nlist; +} + +void IndexIVFFastScan::reconstruct_from_offset( + int64_t list_no, + int64_t offset, + float* recons) const { + // unpack codes + InvertedLists::ScopedCodes list_codes(invlists, list_no); + std::vector code(code_size, 0); + BitstringWriter bsw(code.data(), code_size); + for (size_t m = 0; m < M; m++) { + uint8_t c = + pq4_get_packed_element(list_codes.get(), bbs, M2, offset, m); + bsw.write(c, nbits); + } + sa_decode(1, code.data(), recons); + + // add centroid to it + if (by_residual) { + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + for (int i = 0; i < d; ++i) { + recons[i] += centroid[i]; + } + } +} + +void IndexIVFFastScan::reconstruct_orig_invlists() { + FAISS_THROW_IF_NOT(orig_invlists != nullptr); + FAISS_THROW_IF_NOT(orig_invlists->list_size(0) == 0); + + for (size_t list_no = 0; list_no < nlist; list_no++) { + InvertedLists::ScopedCodes codes(invlists, list_no); + InvertedLists::ScopedIds ids(invlists, list_no); + size_t list_size = orig_invlists->list_size(list_no); + std::vector code(code_size, 0); + + for (size_t offset = 0; offset < list_size; offset++) { + // unpack codes + BitstringWriter bsw(code.data(), code_size); + for (size_t m = 0; m < M; m++) { + uint8_t c = + pq4_get_packed_element(codes.get(), bbs, M2, offset, m); + bsw.write(c, nbits); + } + + // get id + idx_t id = ids.get()[offset]; + + orig_invlists->add_entry(list_no, id, code.data()); + } + } +} + +// IVFFastScanStats IVFFastScan_stats; + +template void IndexIVFFastScan::search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const NormTableScaler& scaler, + const IVFSearchParameters* params = nullptr) const; + +template void IndexIVFFastScan::search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const NormTableScaler& scaler, + const IVFSearchParameters* params = nullptr) const; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFFastScan.h b/thirdparty/faiss/faiss/IndexIVFFastScan.h new file mode 100644 index 000000000..23d762523 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIVFFastScan.h @@ -0,0 +1,263 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include + +namespace faiss { + +/** Fast scan version of IVFPQ and IVFAQ. Works for 4-bit PQ/AQ for now. + * + * The codes in the inverted lists are not stored sequentially but + * grouped in blocks of size bbs. This makes it possible to very quickly + * compute distances with SIMD instructions. + * + * Implementations (implem): + * 0: auto-select implementation (default) + * 1: orig's search, re-implemented + * 2: orig's search, re-ordered by invlist + * 10: optimizer int16 search, collect results in heap, no qbs + * 11: idem, collect results in reservoir + * 12: optimizer int16 search, collect results in heap, uses qbs + * 13: idem, collect results in reservoir + */ + +struct IndexIVFFastScan : IndexIVF { + // size of the kernel + int bbs; // set at build time + + size_t M; + size_t nbits; + size_t ksub; + + // M rounded up to a multiple of 2 + size_t M2; + + // search-time implementation + int implem = 0; + // skip some parts of the computation (for timing) + int skip = 0; + + // batching factors at search time (0 = default) + int qbs = 0; + size_t qbs2 = 0; + + // // todo aguzhva: get rid of this + std::vector norms; + + IndexIVFFastScan( + Index* quantizer, + size_t d, + size_t nlist, + size_t code_size, + MetricType metric = METRIC_L2, + bool is_cosine = false); + + IndexIVFFastScan(); + + void init_fastscan( + size_t M, + size_t nbits, + size_t nlist, + MetricType metric, + int bbs); + + // initialize the CodePacker in the InvertedLists + void init_code_packer(); + + ~IndexIVFFastScan() override; + + /// orig's inverted lists (for debugging) + InvertedLists* orig_invlists = nullptr; + + // Knowhere-specific function, needed for norms, introduced in PR #1 + // final is needed because 'x' can be renormalized inside it, + // so a derived class is not allowed to override this function. + void add_with_ids(idx_t n, const float* x, const idx_t* xids) + override final; + + // This matches Faiss baseline. + void add_with_ids_impl(idx_t n, const float* x, const idx_t* xids); + + // Knowhere-specific override. + // final is needed because 'x' can be renormalized inside it, + // so a derived class is not allowed to override this function. 
+ void train(idx_t n, const float* x) override final; + + // prepare look-up tables + + virtual bool lookup_table_is_3d() const = 0; + + virtual void compute_LUT( + size_t n, + const float* x, + const idx_t* coarse_ids, + const float* coarse_dis, + AlignedTable& dis_tables, + AlignedTable& biases) const = 0; + + void compute_LUT_uint8( + size_t n, + const float* x, + const idx_t* coarse_ids, + const float* coarse_dis, + AlignedTable& dis_tables, + AlignedTable& biases, + float* normalizers) const; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + // range_search implementation was introduced in Knowhere, + // diff 73f03354568b4bf5a370df6f37e8d56dfc3a9c85 + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params = nullptr) const override; + + // internal search funcs + + template + void search_dispatch_implem( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + template + void range_search_dispatch_implem( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + template + void search_implem_1( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + template + void search_implem_2( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + // implem 10 and 12 are not multithreaded internally, so + // export search stats + template + void search_implem_10( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + size_t* ndis_out, + size_t* nlist_out, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + template + void search_implem_12( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + size_t* ndis_out, + size_t* nlist_out, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + template + void range_search_implem_12( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + int impl, + size_t* ndis_out, + size_t* nlist_out, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + // implem 14 is multithreaded internally across nprobes and queries + template + void search_implem_14( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + int impl, + const Scaler& scaler, + const IVFSearchParameters* params = nullptr) const; + + // reconstruct vectors from packed invlists + void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) + const override; + + CodePacker* get_CodePacker() const override; + + // reconstruct orig invlists (for debugging) + void reconstruct_orig_invlists(); +}; + +// // todo aguzhva: removed in https://github.com/zilliztech/knowhere/pull/180, +// // but commented out here +// struct IVFFastScanStats { +// uint64_t times[10]; +// uint64_t t_compute_distance_tables, t_round; +// uint64_t t_copy_pack, t_scan, t_to_flat; +// uint64_t reservoir_times[4]; +// double t_aq_encode; +// double t_aq_norm_encode; +// +// double Mcy_at(int i) { +// return times[i] / (1000 * 1000.0); +// } +// +// 
double Mcy_reservoir_at(int i) { +// return reservoir_times[i] / (1000 * 1000.0); +// } +// IVFFastScanStats() { +// reset(); +// } +// void reset() { +// memset(this, 0, sizeof(*this)); +// } +// }; +// +// FAISS_API extern IVFFastScanStats IVFFastScan_stats; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFFlat.cpp b/thirdparty/faiss/faiss/IndexIVFFlat.cpp index 1133cad05..875637185 100644 --- a/thirdparty/faiss/faiss/IndexIVFFlat.cpp +++ b/thirdparty/faiss/faiss/IndexIVFFlat.cpp @@ -16,13 +16,17 @@ #include #include "knowhere/utils.h" +#include "knowhere/bitsetview_idselector.h" #include #include #include +#include + #include #include +#include #include namespace faiss { @@ -40,6 +44,8 @@ IndexIVFFlat::IndexIVFFlat( : IndexIVF(quantizer, d, nlist, sizeof(float) * d, metric) { this->is_cosine = is_cosine; code_size = sizeof(float) * d; + by_residual = false; + replace_invlists(new ArrayInvertedLists(nlist, code_size, is_cosine), true); } @@ -76,6 +82,11 @@ void IndexIVFFlat::add_with_ids(idx_t n, const float* x, const idx_t* xids) { } } + +IndexIVFFlat::IndexIVFFlat() { + by_residual = false; +} + void IndexIVFFlat::add_core( idx_t n, const float* x, @@ -84,6 +95,7 @@ void IndexIVFFlat::add_core( const idx_t* coarse_idx) { FAISS_THROW_IF_NOT(is_trained); FAISS_THROW_IF_NOT(coarse_idx); + FAISS_THROW_IF_NOT(!by_residual); assert(invlists); direct_map.check_can_add(xids); @@ -129,6 +141,7 @@ void IndexIVFFlat::encode_vectors( const idx_t* list_nos, uint8_t* codes, bool include_listnos) const { + FAISS_THROW_IF_NOT(!by_residual); if (!include_listnos) { memcpy(codes, x, code_size * n); } else { @@ -158,20 +171,21 @@ void IndexIVFFlat::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { namespace { -template +/* +// Baseline implementation that is kept for the reference. +template struct IVFFlatScanner : InvertedListScanner { size_t d; - IVFFlatScanner(size_t d, bool store_pairs) : d(d) { - this->store_pairs = store_pairs; - } + IVFFlatScanner(size_t d, bool store_pairs, const IDSelector* sel) + : InvertedListScanner(store_pairs, sel), d(d) {} const float* xi; void set_query(const float* query) override { this->xi = query; } - void set_list(idx_t list_no, float /* coarse_dis */) override { + void set_list(idx_t list_no, float coarse_dis) override { this->list_no = list_no; } @@ -190,26 +204,116 @@ struct IVFFlatScanner : InvertedListScanner { const idx_t* ids, float* simi, idx_t* idxi, - size_t k, - const BitsetView bitset) const override { + size_t k) const override { const float* list_vecs = (const float*)codes; size_t nup = 0; for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - const float* yj = list_vecs + d * j; - float dis = metric == METRIC_INNER_PRODUCT - ? fvec_inner_product(xi, yj, d) - : fvec_L2sqr(xi, yj, d); - if (code_norms) { - dis /= code_norms[j]; - } + const float* yj = list_vecs + d * j; + if (use_sel && !sel->is_member(ids[j])) { + continue; + } + float dis = metric == METRIC_INNER_PRODUCT + ? fvec_inner_product(xi, yj, d) + : fvec_L2sqr(xi, yj, d); + if (code_norms) { + dis /= code_norms[j]; + } + if (C::cmp(simi[0], dis)) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + nup++; + } + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* code_norms, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + const float* list_vecs = (const float*)codes; + for (size_t j = 0; j < list_size; j++) { + const float* yj = list_vecs + d * j; + if (use_sel && !sel->is_member(ids[j])) { + continue; + } + float dis = metric == METRIC_INNER_PRODUCT + ? fvec_inner_product(xi, yj, d) + : fvec_L2sqr(xi, yj, d); + if (code_norms) { + dis /= code_norms[j]; + } + if (C::cmp(radius, dis)) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + } + } +}; +*/ + +template +struct IVFFlatScanner : InvertedListScanner { + size_t d; + + IVFFlatScanner(size_t d, bool store_pairs, const IDSelector* sel) + : InvertedListScanner(store_pairs, sel), d(d) {} + + const float* xi; + void set_query(const float* query) override { + this->xi = query; + } + + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + } + + float distance_to_code(const uint8_t* code) const override { + const float* yj = (float*)code; + float dis = metric == METRIC_INNER_PRODUCT + ? fvec_inner_product(xi, yj, d) + : fvec_L2sqr(xi, yj, d); + return dis; + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* code_norms, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + const float* list_vecs = (const float*)codes; + size_t nup = 0; + + // the lambda that filters acceptable elements. + auto filter = + [&](const size_t j) { return (!use_sel || sel->is_member(ids[j])); }; + + // the lambda that applies a filtered element. + auto apply = + [&](const float dis_in, const size_t j) { + const float dis = (code_norms == nullptr) ? dis_in : (dis_in / code_norms[j]); if (C::cmp(simi[0], dis)) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + const int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; heap_replace_top(k, simi, idxi, dis, id); nup++; } - } + }; + + if constexpr (metric == METRIC_INNER_PRODUCT) { + fvec_inner_products_ny_if( + xi, list_vecs, d, list_size, filter, apply); + } + else { + fvec_L2sqr_ny_if( + xi, list_vecs, d, list_size, filter, apply); } + return nup; } @@ -219,41 +323,184 @@ struct IVFFlatScanner : InvertedListScanner { const float* code_norms, const idx_t* ids, float radius, - RangeQueryResult& res, - const BitsetView bitset) const override { + RangeQueryResult& res) const override { const float* list_vecs = (const float*)codes; - for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - const float* yj = list_vecs + d * j; - float dis = metric == METRIC_INNER_PRODUCT - ? fvec_inner_product(xi, yj, d) - : fvec_L2sqr(xi, yj, d); - if (code_norms) { - dis /= code_norms[j]; + + // the lambda that filters acceptable elements. + auto filter = + [&](const size_t j) { return (!use_sel || sel->is_member(ids[j])); }; + + // the lambda that applies a filtered element. + auto apply = + [&](const float dis_in, const size_t j) { + const float dis = (code_norms == nullptr) ? dis_in : (dis_in / code_norms[j]); + if (C::cmp(radius, dis)) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + }; + + if constexpr (metric == METRIC_INNER_PRODUCT) { + fvec_inner_products_ny_if( + xi, list_vecs, d, list_size, filter, apply); + } + else { + fvec_L2sqr_ny_if( + xi, list_vecs, d, list_size, filter, apply); + } + } +}; + +// a custom version for Knowhere +template +struct IVFFlatBitsetViewScanner : InvertedListScanner { + size_t d; + knowhere::BitsetView bitset; + + IVFFlatBitsetViewScanner(size_t d, bool store_pairs, const IDSelector* sel) + : InvertedListScanner(store_pairs, sel), d(d) { + const auto* bitsetview_sel = dynamic_cast(sel); + FAISS_ASSERT_MSG((bitsetview_sel != nullptr), "Unsupported scanner for IVFFlatBitsetViewScanner"); + + bitset = bitsetview_sel->bitset_view; + } + + const float* xi; + void set_query(const float* query) override { + this->xi = query; + } + + void set_list(idx_t list_no, float /* coarse_dis */) override { + this->list_no = list_no; + } + + float distance_to_code(const uint8_t* code) const override { + const float* yj = (float*)code; + float dis = metric == METRIC_INNER_PRODUCT + ? fvec_inner_product(xi, yj, d) + : fvec_L2sqr(xi, yj, d); + return dis; + } + + size_t scan_codes( + size_t list_size, + const uint8_t* __restrict codes, + const float* __restrict code_norms, + const idx_t* __restrict ids, + float* __restrict simi, + idx_t* __restrict idxi, + size_t k) const override { + const float* list_vecs = (const float*)codes; + size_t nup = 0; + + // the lambda that filters acceptable elements. + auto filter = + [&](const size_t j) { return (!use_sel || !bitset.test(ids[j])); }; + + // the lambda that applies a filtered element. + auto apply = + [&](const float dis_in, const size_t j) { + const float dis = (code_norms == nullptr) ? dis_in : (dis_in / code_norms[j]); + if (C::cmp(simi[0], dis)) { + const int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + nup++; } + }; + + if constexpr (metric == METRIC_INNER_PRODUCT) { + fvec_inner_products_ny_if( + xi, list_vecs, d, list_size, filter, apply); + } + else { + fvec_L2sqr_ny_if( + xi, list_vecs, d, list_size, filter, apply); + } + + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* __restrict codes, + const float* __restrict code_norms, + const idx_t* __restrict ids, + float radius, + RangeQueryResult& res) const override { + const float* list_vecs = (const float*)codes; + + // the lambda that filters acceptable elements. + auto filter = + [&](const size_t j) { return (!use_sel || !bitset.test(ids[j])); }; + + // the lambda that applies a filtered element. + auto apply = + [&](const float dis_in, const size_t j) { + const float dis = (code_norms == nullptr) ? dis_in : (dis_in / code_norms[j]); if (C::cmp(radius, dis)) { int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; res.add(dis, id); } - } + }; + + if constexpr (metric == METRIC_INNER_PRODUCT) { + fvec_inner_products_ny_if( + xi, list_vecs, d, list_size, filter, apply); + } + else { + fvec_L2sqr_ny_if( + xi, list_vecs, d, list_size, filter, apply); } } }; +template +InvertedListScanner* get_InvertedListScanner1( + const IndexIVFFlat* ivf, + bool store_pairs, + const IDSelector* sel) { + // A specialized version for Knowhere. + // It is needed to get rid of virtual function calls, because sel + // can filter out 99% of samples, so the cost of virtual function calls + // becomes noticeable compared to distance computations. 
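+ // The scanners above funnel every candidate through a pair of lambdas:
+ // `filter` decides from the id whether the element is admissible, and
+ // `apply` consumes the distance only for elements that passed. A minimal
+ // reference sketch of the contract the fvec_*_ny_if helpers are assumed
+ // to follow (the real kernels sit behind the hooks in src/simd/ and may
+ // batch the distance computations; the name below is illustrative only):
+ //
+ //   template <typename Filter, typename Apply>
+ //   void fvec_L2sqr_ny_if_sketch(
+ //           const float* x,    // query vector, size d
+ //           const float* y,    // list vectors, size ny * d
+ //           size_t d,
+ //           size_t ny,
+ //           Filter filter,     // bool(size_t j): keep element j?
+ //           Apply apply) {     // void(float dis, size_t j)
+ //       for (size_t j = 0; j < ny; j++) {
+ //           if (!filter(j)) {
+ //               continue;      // filtered out: no distance computed here
+ //           }
+ //           apply(fvec_L2sqr(x, y + j * d, d), j);
+ //       }
+ //   }
+ //
+ // Note that IVFFlatBitsetViewScanner keeps ids that are NOT set in the
+ // bitset (`!bitset.test(id)`), while the generic IVFFlatScanner keeps
+ // members of the IDSelector (`sel->is_member(id)`).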
+ if (const auto* bitsetview_sel = dynamic_cast(sel)) { + if (ivf->metric_type == METRIC_INNER_PRODUCT) { + return new IVFFlatBitsetViewScanner< + METRIC_INNER_PRODUCT, + CMin, + use_sel>(ivf->d, store_pairs, sel); + } else if (ivf->metric_type == METRIC_L2) { + return new IVFFlatBitsetViewScanner, use_sel>( + ivf->d, store_pairs, sel); + } else { + FAISS_THROW_MSG("metric type not supported"); + } + } + + // default faiss version + if (ivf->metric_type == METRIC_INNER_PRODUCT) { + return new IVFFlatScanner< + METRIC_INNER_PRODUCT, + CMin, + use_sel>(ivf->d, store_pairs, sel); + } else if (ivf->metric_type == METRIC_L2) { + return new IVFFlatScanner, use_sel>( + ivf->d, store_pairs, sel); + } else { + FAISS_THROW_MSG("metric type not supported"); + } +} + } // anonymous namespace InvertedListScanner* IndexIVFFlat::get_InvertedListScanner( - bool store_pairs) const { - if (metric_type == METRIC_INNER_PRODUCT) { - return new IVFFlatScanner>( - d, store_pairs); - } else if (metric_type == METRIC_L2) { - return new IVFFlatScanner>( - d, store_pairs); + bool store_pairs, + const IDSelector* sel) const { + if (sel) { + return get_InvertedListScanner1(this, store_pairs, sel); } else { - FAISS_THROW_MSG("metric type not supported"); + return get_InvertedListScanner1(this, store_pairs, sel); } - return nullptr; } void IndexIVFFlat::reconstruct_from_offset( @@ -274,6 +521,8 @@ IndexIVFFlatCC::IndexIVFFlatCC( replace_invlists(new ConcurrentArrayInvertedLists(nlist, code_size, ssize, is_cosine), true); } +IndexIVFFlatCC::IndexIVFFlatCC() {} + /***************************************** * IndexIVFFlatDedup implementation ******************************************/ @@ -389,8 +638,7 @@ void IndexIVFFlatDedup::search_preassigned( idx_t* labels, bool store_pairs, const IVFSearchParameters* params, - IndexIVFStats* stats, - const BitsetView bitset) const { + IndexIVFStats* stats) const { FAISS_THROW_IF_NOT_MSG( !store_pairs, "store_pairs not supported in IVFDedup"); @@ -514,7 +762,7 @@ void IndexIVFFlatDedup::range_search( const float*, float, RangeSearchResult*, - const BitsetView) const { + const SearchParameters*) const { FAISS_THROW_MSG("not implemented"); } diff --git a/thirdparty/faiss/faiss/IndexIVFFlat.h b/thirdparty/faiss/faiss/IndexIVFFlat.h index 7106a2ff6..42899708d 100644 --- a/thirdparty/faiss/faiss/IndexIVFFlat.h +++ b/thirdparty/faiss/faiss/IndexIVFFlat.h @@ -31,6 +31,9 @@ struct IndexIVFFlat : IndexIVF { void restore_codes(const uint8_t* raw_data, const size_t raw_size); + // Be careful with overriding this function, because + // renormalized x may be used inside. + // Overridden by IndexIVFFlatDedup. 
void train(idx_t n, const float* x) override; void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; @@ -50,14 +53,15 @@ struct IndexIVFFlat : IndexIVF { bool include_listnos = false) const override; InvertedListScanner* get_InvertedListScanner( - bool store_pairs) const override; + bool store_pairs, + const IDSelector* sel) const override; void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; - IndexIVFFlat() {} + IndexIVFFlat(); }; struct IndexIVFFlatCC : IndexIVFFlat { @@ -69,7 +73,7 @@ struct IndexIVFFlatCC : IndexIVFFlat { MetricType = METRIC_L2, bool is_cosine = false); - IndexIVFFlatCC() {} + IndexIVFFlatCC(); }; struct IndexIVFFlatDedup : IndexIVFFlat { @@ -100,8 +104,7 @@ struct IndexIVFFlatDedup : IndexIVFFlat { idx_t* labels, bool store_pairs, const IVFSearchParameters* params = nullptr, - IndexIVFStats* stats = nullptr, - const BitsetView bitset = nullptr) const override; + IndexIVFStats* stats = nullptr) const override; size_t remove_ids(const IDSelector& sel) override; @@ -111,7 +114,7 @@ struct IndexIVFFlatDedup : IndexIVFFlat { const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /// not implemented void update_vectors(int nv, const idx_t* idx, const float* v) override; diff --git a/thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.cpp b/thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.cpp new file mode 100644 index 000000000..76ae6718a --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.cpp @@ -0,0 +1,172 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +namespace faiss { + +IndexIVFIndependentQuantizer::IndexIVFIndependentQuantizer( + Index* quantizer, + IndexIVF* index_ivf, + VectorTransform* vt) + : Index(quantizer->d, index_ivf->metric_type), + quantizer(quantizer), + vt(vt), + index_ivf(index_ivf) { + if (vt) { + FAISS_THROW_IF_NOT_MSG( + vt->d_in == d && vt->d_out == index_ivf->d, + "invalid vector dimensions"); + } else { + FAISS_THROW_IF_NOT_MSG(index_ivf->d == d, "invalid vector dimensions"); + } + + if (quantizer->is_trained && quantizer->ntotal != 0) { + FAISS_THROW_IF_NOT(quantizer->ntotal == index_ivf->nlist); + } + if (index_ivf->is_trained && vt) { + FAISS_THROW_IF_NOT(vt->is_trained); + } + ntotal = index_ivf->ntotal; + is_trained = + (quantizer->is_trained && quantizer->ntotal == index_ivf->nlist && + (!vt || vt->is_trained) && index_ivf->is_trained); + + // disable precomputed tables because they use the distances that are + // provided by the coarse quantizer (that are out of sync with the IVFPQ) + if (auto index_ivfpq = dynamic_cast(index_ivf)) { + index_ivfpq->use_precomputed_table = -1; + } +} + +IndexIVFIndependentQuantizer::~IndexIVFIndependentQuantizer() { + if (own_fields) { + delete quantizer; + delete index_ivf; + delete vt; + } +} + +namespace { + +struct VTransformedVectors : TransformedVectors { + VTransformedVectors(const VectorTransform* vt, idx_t n, const float* x) + : TransformedVectors(x, vt ? 
vt->apply(n, x) : x) {} +}; + +struct SubsampledVectors : TransformedVectors { + SubsampledVectors(int d, idx_t* n, idx_t max_n, const float* x) + : TransformedVectors( + x, + fvecs_maybe_subsample(d, (size_t*)n, max_n, x, true)) {} +}; + +} // anonymous namespace + +void IndexIVFIndependentQuantizer::add(idx_t n, const float* x) { + std::vector D(n); + std::vector I(n); + quantizer->search(n, x, 1, D.data(), I.data()); + + VTransformedVectors tv(vt, n, x); + + index_ivf->add_core(n, tv.x, nullptr, nullptr, I.data()); +} + +void IndexIVFIndependentQuantizer::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG(!params, "search parameters not supported"); + int nprobe = index_ivf->nprobe; + std::vector D(n * nprobe); + std::vector I(n * nprobe); + quantizer->search(n, x, nprobe, D.data(), I.data()); + + VTransformedVectors tv(vt, n, x); + + index_ivf->search_preassigned( + n, tv.x, k, I.data(), D.data(), distances, labels, false); +} + +void IndexIVFIndependentQuantizer::reset() { + index_ivf->reset(); + ntotal = 0; +} + +void IndexIVFIndependentQuantizer::train(idx_t n, const float* x) { + // quantizer training + size_t nlist = index_ivf->nlist; + Level1Quantizer l1(quantizer, nlist); + l1.train_q1(n, x, verbose, metric_type); + + // train the VectorTransform + if (vt && !vt->is_trained) { + if (verbose) { + printf("IndexIVFIndependentQuantizer: train the VectorTransform\n"); + } + vt->train(n, x); + } + + // get the centroids from the quantizer, transform them and + // add them to the index_ivf's quantizer + if (verbose) { + printf("IndexIVFIndependentQuantizer: extract the main quantizer centroids\n"); + } + std::vector centroids(nlist * d); + quantizer->reconstruct_n(0, nlist, centroids.data()); + VTransformedVectors tcent(vt, nlist, centroids.data()); + + if (verbose) { + printf("IndexIVFIndependentQuantizer: add centroids to the secondary quantizer\n"); + } + if (!index_ivf->quantizer->is_trained) { + index_ivf->quantizer->train(nlist, tcent.x); + } + index_ivf->quantizer->add(nlist, tcent.x); + + // train the payload + + // optional subsampling + idx_t max_nt = index_ivf->train_encoder_num_vectors(); + if (max_nt <= 0) { + max_nt = (size_t)1 << 35; + } + SubsampledVectors sv(index_ivf->d, &n, max_nt, x); + + // transform subsampled vectors + VTransformedVectors tv(vt, n, sv.x); + + if (verbose) { + printf("IndexIVFIndependentQuantizer: train encoder\n"); + } + + if (index_ivf->by_residual) { + // assign with quantizer + std::vector assign(n); + quantizer->assign(n, sv.x, assign.data()); + + // compute residual with IVF quantizer + std::vector residuals(n * index_ivf->d); + index_ivf->quantizer->compute_residual_n( + n, tv.x, residuals.data(), assign.data()); + + index_ivf->train_encoder(n, residuals.data(), assign.data()); + } else { + index_ivf->train_encoder(n, tv.x, nullptr); + } + index_ivf->is_trained = true; + is_trained = true; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.h b/thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.h new file mode 100644 index 000000000..4fe166661 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexIVFIndependentQuantizer.h @@ -0,0 +1,56 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace faiss { + +/** An IVF index with a quantizer that has a different input dimension from the + * payload size. The vectors to encode are obtained from the input vectors by a + * VectorTransform. + */ +struct IndexIVFIndependentQuantizer : Index { + /// quantizer is fed directly with the input vectors + Index* quantizer = nullptr; + + /// transform before the IVF vectors are applied + VectorTransform* vt = nullptr; + + /// the IVF index, controls nlist and nprobe + IndexIVF* index_ivf = nullptr; + + /// whether *this owns the 3 fields + bool own_fields = false; + + IndexIVFIndependentQuantizer( + Index* quantizer, + IndexIVF* index_ivf, + VectorTransform* vt = nullptr); + + IndexIVFIndependentQuantizer() {} + + void train(idx_t n, const float* x) override; + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void reset() override; + + ~IndexIVFIndependentQuantizer() override; +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFPQ.cpp b/thirdparty/faiss/faiss/IndexIVFPQ.cpp index be6d6f601..d0020b6ec 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQ.cpp +++ b/thirdparty/faiss/faiss/IndexIVFPQ.cpp @@ -9,10 +9,10 @@ #include -#include #include #include #include +#include #include #include @@ -30,6 +30,11 @@ #include #include +#include + +#include + +#include namespace faiss { @@ -45,7 +50,6 @@ IndexIVFPQ::IndexIVFPQ( size_t nbits_per_idx, MetricType metric) : IndexIVF(quantizer, d, nlist, 0, metric), pq(d, M, nbits_per_idx) { - // FAISS_THROW_IF_NOT(nbits_per_idx <= 8); code_size = pq.code_size; invlists->code_size = code_size; is_trained = false; @@ -61,74 +65,16 @@ IndexIVFPQ::IndexIVFPQ( /**************************************************************** * training */ -void IndexIVFPQ::train_residual(idx_t n, const float* x) { - train_residual_o(n, x, nullptr); -} - -void IndexIVFPQ::train_residual_o(idx_t n, const float* x, float* residuals_2) { - const float* x_in = x; - - x = fvecs_maybe_subsample( - d, - (size_t*)&n, - pq.cp.max_points_per_centroid * pq.ksub, - x, - verbose, - pq.cp.seed); - - ScopeDeleter del_x(x_in == x ? 
nullptr : x); - - const float* trainset; - ScopeDeleter del_residuals; - if (by_residual) { - if (verbose) - printf("computing residuals\n"); - idx_t* assign = new idx_t[n]; // assignement to coarse centroids - ScopeDeleter del(assign); - quantizer->assign(n, x, assign); - float* residuals = new float[n * d]; - del_residuals.set(residuals); - for (idx_t i = 0; i < n; i++) - quantizer->compute_residual( - x + i * d, residuals + i * d, assign[i]); - - trainset = residuals; - } else { - trainset = x; - } - if (verbose) - printf("training %zdx%zd product quantizer on %" PRId64 - " vectors in %dD\n", - pq.M, - pq.ksub, - n, - d); - pq.verbose = verbose; - pq.train(n, trainset); +void IndexIVFPQ::train_encoder(idx_t n, const float* x, const idx_t* assign) { + pq.train(n, x); if (do_polysemous_training) { if (verbose) printf("doing polysemous training for PQ\n"); PolysemousTraining default_pt; - PolysemousTraining* pt = polysemous_training; - if (!pt) - pt = &default_pt; - pt->optimize_pq_for_hamming(pq, n, trainset); - } - - // prepare second-level residuals for refine PQ - if (residuals_2) { - uint8_t* train_codes = new uint8_t[pq.code_size * n]; - ScopeDeleter del(train_codes); - pq.compute_codes(trainset, train_codes, n); - - for (idx_t i = 0; i < n; i++) { - const float* xx = trainset + i * d; - float* res = residuals_2 + i * d; - pq.decode(train_codes + i * pq.code_size, res); - for (int j = 0; j < d; j++) - res[j] = xx[j] - res[j]; - } + PolysemousTraining* pt = + polysemous_training ? polysemous_training : &default_pt; + pt->optimize_pq_for_hamming(pq, n, x); } if (by_residual) { @@ -136,6 +82,10 @@ void IndexIVFPQ::train_residual_o(idx_t n, const float* x, float* residuals_2) { } } +idx_t IndexIVFPQ::train_encoder_num_vectors() const { + return pq.cp.max_points_per_centroid * pq.ksub; +} + /**************************************************************** * IVFPQ as codec */ @@ -193,9 +143,9 @@ void IndexIVFPQ::add_core( static float* compute_residuals( const Index* quantizer, - Index::idx_t n, + idx_t n, const float* x, - const Index::idx_t* list_nos) { + const idx_t* list_nos) { size_t d = quantizer->d; float* residuals = new float[n * d]; // TODO: parallelize? 
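With train_residual()/train_residual_o() gone, encoder-side training is reduced to train_encoder() plus the train_encoder_num_vectors() hint, and subsampling and residual preparation are left to the caller. A condensed sketch of that caller-side flow, modeled on IndexIVFIndependentQuantizer::train() earlier in this patch; the function name below is illustrative only, and the actual replacement logic lives in IndexIVF.cpp, outside this hunk:

    #include <vector>
    #include <faiss/IndexIVF.h>

    // hypothetical helper, not the actual IndexIVF::train() body
    void train_payload_sketch(faiss::IndexIVF& index, faiss::idx_t n, const float* x) {
        // optional subsampling, bounded by the encoder's own preference
        // (the patch uses fvecs_maybe_subsample() for this step)
        faiss::idx_t max_nt = index.train_encoder_num_vectors();
        // ... subsample (n, x) down to at most max_nt vectors if needed ...

        if (index.by_residual) {
            // assign to coarse centroids, then train the encoder on residuals
            std::vector<faiss::idx_t> assign(n);
            index.quantizer->assign(n, x, assign.data());

            std::vector<float> residuals(n * index.d);
            index.quantizer->compute_residual_n(
                    n, x, residuals.data(), assign.data());

            index.train_encoder(n, residuals.data(), assign.data());
        } else {
            // train directly on the raw vectors
            index.train_encoder(n, x, nullptr);
        }
    }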
@@ -256,13 +206,16 @@ void IndexIVFPQ::sa_decode(idx_t n, const uint8_t* codes, float* x) const { } } +// block size used in IndexIVFPQ::add_core_o +int index_ivfpq_add_core_o_bs = 32768; + void IndexIVFPQ::add_core_o( idx_t n, const float* x, const idx_t* xids, float* residuals_2, const idx_t* precomputed_idx) { - idx_t bs = 32768; + idx_t bs = index_ivfpq_add_core_o_bs; if (n > bs) { for (idx_t i0 = 0; i0 < n; i0 += bs) { idx_t i1 = std::min(i0 + bs, n); @@ -415,6 +368,7 @@ void initialize_IVFPQ_precomputed_table( const Index* quantizer, const ProductQuantizer& pq, AlignedTable& precomputed_table, + bool by_residual, bool verbose) { size_t nlist = quantizer->ntotal; size_t d = quantizer->d; @@ -426,10 +380,10 @@ void initialize_IVFPQ_precomputed_table( } if (use_precomputed_table == 0) { // then choose the type of table - if (quantizer->metric_type == METRIC_INNER_PRODUCT) { + if (!(quantizer->metric_type == METRIC_L2 && by_residual)) { if (verbose) { printf("IndexIVFPQ::precompute_table: precomputed " - "tables not needed for inner product quantizers\n"); + "tables needed only for L2 metric and by_residual is enabled\n"); } precomputed_table.resize(0); return; @@ -508,13 +462,16 @@ void initialize_IVFPQ_precomputed_table( void IndexIVFPQ::precompute_table() { initialize_IVFPQ_precomputed_table( - use_precomputed_table, quantizer, pq, precomputed_table, verbose); + use_precomputed_table, + quantizer, + pq, + precomputed_table, + by_residual, + verbose); } namespace { -using idx_t = Index::idx_t; - #define TIC t0 = get_cycles() #define TOC get_cycles() - t0 @@ -615,7 +572,7 @@ struct QueryTables { *****************************************************/ // fields specific to list - Index::idx_t key; + idx_t key; float coarse_dis; std::vector q_code; @@ -795,10 +752,13 @@ struct QueryTables { } }; -template +// This way of handling the sleector is not optimal since all distances +// are computed even if the id would filter it out. +template struct KnnSearchResults { idx_t key; const idx_t* ids; + const IDSelector* sel; // heap params size_t k; @@ -807,6 +767,10 @@ struct KnnSearchResults { size_t nup; + inline bool skip_entry(idx_t j) { + return use_sel && !sel->is_member(ids[j]); + } + inline void add(idx_t j, float dis) { if (C::cmp(heap_sim[0], dis)) { idx_t id = ids ? ids[j] : lo_build(key, j); @@ -816,15 +780,20 @@ struct KnnSearchResults { } }; -template +template struct RangeSearchResults { idx_t key; const idx_t* ids; + const IDSelector* sel; // wrapped result structure float radius; RangeQueryResult& rres; + inline bool skip_entry(idx_t j) { + return use_sel && !sel->is_member(ids[j]); + } + inline void add(idx_t j, float dis) { if (C::cmp(radius, dis)) { idx_t id = ids ? ids[j] : lo_build(key, j); @@ -866,26 +835,102 @@ struct IVFPQScannerT : QueryTables { * Scaning the codes: simple PQ scan. *****************************************************/ - /// version of the scan where we use precomputed tables + // This is the baseline version of scan_list_with_tables(). + // It demonstrates what this function actually does. + // + // /// version of the scan where we use precomputed tables. 
+ // template + // void scan_list_with_table( + // size_t ncode, + // const uint8_t* codes, + // SearchResultType& res) const { + // + // for (size_t j = 0; j < ncode; j++, codes += pq.code_size) { + // if (res.skip_entry(j)) { + // continue; + // } + // float dis = dis0 + distance_single_code( + // pq, sim_table, codes); + // res.add(j, dis); + // } + // } + + // This is the modified version of scan_list_with_tables(). + // It was observed that doing manual unrolling of the loop that + // utilizes distance_single_code() speeds up the computations. + + /// version of the scan where we use precomputed tables. template void scan_list_with_table( size_t ncode, const uint8_t* codes, - SearchResultType& res, - const BitsetView bitset = nullptr) const { + SearchResultType& res) const { + int counter = 0; + + size_t saved_j[4] = {0, 0, 0, 0}; for (size_t j = 0; j < ncode; j++) { - if (bitset.empty() || !bitset.test(res.ids[j])) { - PQDecoder decoder(codes, pq.nbits); - codes += pq.code_size; - float dis = dis0; - const float* tab = sim_table; - - for (size_t m = 0; m < pq.M; m++) { - dis += tab[decoder.decode()]; - tab += pq.ksub; - } - res.add(j, dis); + if (res.skip_entry(j)) { + continue; } + + saved_j[0] = (counter == 0) ? j : saved_j[0]; + saved_j[1] = (counter == 1) ? j : saved_j[1]; + saved_j[2] = (counter == 2) ? j : saved_j[2]; + saved_j[3] = (counter == 3) ? j : saved_j[3]; + + counter += 1; + if (counter == 4) { + float distance_0 = 0; + float distance_1 = 0; + float distance_2 = 0; + float distance_3 = 0; + distance_four_codes( + pq.M, + pq.nbits, + sim_table, + codes + saved_j[0] * pq.code_size, + codes + saved_j[1] * pq.code_size, + codes + saved_j[2] * pq.code_size, + codes + saved_j[3] * pq.code_size, + distance_0, + distance_1, + distance_2, + distance_3); + + res.add(saved_j[0], dis0 + distance_0); + res.add(saved_j[1], dis0 + distance_1); + res.add(saved_j[2], dis0 + distance_2); + res.add(saved_j[3], dis0 + distance_3); + counter = 0; + } + } + + if (counter >= 1) { + float dis = dis0 + + distance_single_code( + pq.M, + pq.nbits, + sim_table, + codes + saved_j[0] * pq.code_size); + res.add(saved_j[0], dis); + } + if (counter >= 2) { + float dis = dis0 + + distance_single_code( + pq.M, + pq.nbits, + sim_table, + codes + saved_j[1] * pq.code_size); + res.add(saved_j[1], dis); + } + if (counter >= 3) { + float dis = dis0 + + distance_single_code( + pq.M, + pq.nbits, + sim_table, + codes + saved_j[2] * pq.code_size); + res.add(saved_j[2], dis); } } @@ -895,23 +940,21 @@ struct IVFPQScannerT : QueryTables { void scan_list_with_pointer( size_t ncode, const uint8_t* codes, - SearchResultType& res, - const BitsetView bitset = nullptr) const { - for (size_t j = 0; j < ncode; j++) { - if (bitset.empty() || !bitset.test(res.ids[j])) { - PQDecoder decoder(codes, pq.nbits); - codes += pq.code_size; - - float dis = dis0; - const float* tab = sim_table_2; - - for (size_t m = 0; m < pq.M; m++) { - int ci = decoder.decode(); - dis += sim_table_ptrs[m][ci] - 2 * tab[ci]; - tab += pq.ksub; - } - res.add(j, dis); + SearchResultType& res) const { + for (size_t j = 0; j < ncode; j++, codes += pq.code_size) { + if (res.skip_entry(j)) { + continue; + } + PQDecoder decoder(codes, pq.nbits); + float dis = dis0; + const float* tab = sim_table_2; + + for (size_t m = 0; m < pq.M; m++) { + int ci = decoder.decode(); + dis += sim_table_ptrs[m][ci] - 2 * tab[ci]; + tab += pq.ksub; } + res.add(j, dis); } } @@ -920,8 +963,7 @@ struct IVFPQScannerT : QueryTables { void scan_on_the_fly_dist( size_t ncode, const 
uint8_t* codes, - SearchResultType& res, - const BitsetView bitset = nullptr) const { + SearchResultType& res) const { const float* dvec; float dis0 = 0; if (by_residual) { @@ -937,19 +979,19 @@ struct IVFPQScannerT : QueryTables { dis0 = 0; } - for (size_t j = 0; j < ncode; j++) { - if (bitset.empty() || !bitset.test(res.ids[j])) { - pq.decode(codes, decoded_vec); - codes += pq.code_size; + for (size_t j = 0; j < ncode; j++, codes += pq.code_size) { + if (res.skip_entry(j)) { + continue; + } + pq.decode(codes, decoded_vec); - float dis; - if (METRIC_TYPE == METRIC_INNER_PRODUCT) { - dis = dis0 + fvec_inner_product(decoded_vec, qi, d); - } else { - dis = fvec_L2sqr(decoded_vec, dvec, d); - } - res.add(j, dis); + float dis; + if (METRIC_TYPE == METRIC_INNER_PRODUCT) { + dis = dis0 + fvec_inner_product(decoded_vec, qi, d); + } else { + dis = fvec_L2sqr(decoded_vec, dvec, d); } + res.add(j, dis); } } @@ -957,87 +999,208 @@ struct IVFPQScannerT : QueryTables { * Scanning codes with polysemous filtering *****************************************************/ + // This is the baseline version of scan_list_polysemous_hc(). + // It demonstrates what this function actually does. + + // template + // void scan_list_polysemous_hc( + // size_t ncode, + // const uint8_t* codes, + // SearchResultType& res) const { + // int ht = ivfpq.polysemous_ht; + // size_t n_hamming_pass = 0, nup = 0; + // + // int code_size = pq.code_size; + // + // HammingComputer hc(q_code.data(), code_size); + // + // for (size_t j = 0; j < ncode; j++, codes += code_size) { + // if (res.skip_entry(j)) { + // continue; + // } + // const uint8_t* b_code = codes; + // int hd = hc.hamming(b_code); + // if (hd < ht) { + // n_hamming_pass++; + // + // float dis = + // dis0 + + // distance_single_code( + // pq, sim_table, codes); + // + // res.add(j, dis); + // } + // } + // #pragma omp critical + // { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; } + // } + + // This is the modified version of scan_list_with_tables(). + // It was observed that doing manual unrolling of the loop that + // utilizes distance_single_code() speeds up the computations. + template void scan_list_polysemous_hc( size_t ncode, const uint8_t* codes, - SearchResultType& res, - const BitsetView bitset = nullptr) const { + SearchResultType& res) const { int ht = ivfpq.polysemous_ht; size_t n_hamming_pass = 0, nup = 0; int code_size = pq.code_size; + size_t saved_j[8]; + int counter = 0; + HammingComputer hc(q_code.data(), code_size); - for (size_t j = 0; j < ncode; j++) { - if (bitset.empty() || !bitset.test(res.ids[j])) { - const uint8_t* b_code = codes; - int hd = hc.compute(b_code); - if (hd < ht) { - n_hamming_pass++; - PQDecoder decoder(codes, pq.nbits); - - float dis = dis0; - const float* tab = sim_table; - - for (size_t m = 0; m < pq.M; m++) { - dis += tab[decoder.decode()]; - tab += pq.ksub; - } - res.add(j, dis); - } + for (size_t j = 0; j < (ncode / 4) * 4; j += 4) { + const uint8_t* b_code = codes + j * code_size; + + // Unrolling is a key. Basically, doing multiple popcount + // operations one after another speeds things up. + + // 9999999 is just an arbitrary large number + int hd0 = (res.skip_entry(j + 0)) + ? 99999999 + : hc.compute(b_code + 0 * code_size); + int hd1 = (res.skip_entry(j + 1)) + ? 99999999 + : hc.compute(b_code + 1 * code_size); + int hd2 = (res.skip_entry(j + 2)) + ? 99999999 + : hc.compute(b_code + 2 * code_size); + int hd3 = (res.skip_entry(j + 3)) + ? 
99999999 + : hc.compute(b_code + 3 * code_size); + + saved_j[counter] = j + 0; + counter = (hd0 < ht) ? (counter + 1) : counter; + saved_j[counter] = j + 1; + counter = (hd1 < ht) ? (counter + 1) : counter; + saved_j[counter] = j + 2; + counter = (hd2 < ht) ? (counter + 1) : counter; + saved_j[counter] = j + 3; + counter = (hd3 < ht) ? (counter + 1) : counter; + + if (counter >= 4) { + // process four codes at the same time + n_hamming_pass += 4; + + float distance_0 = dis0; + float distance_1 = dis0; + float distance_2 = dis0; + float distance_3 = dis0; + distance_four_codes( + pq.M, + pq.nbits, + sim_table, + codes + saved_j[0] * pq.code_size, + codes + saved_j[1] * pq.code_size, + codes + saved_j[2] * pq.code_size, + codes + saved_j[3] * pq.code_size, + distance_0, + distance_1, + distance_2, + distance_3); + + res.add(saved_j[0], dis0 + distance_0); + res.add(saved_j[1], dis0 + distance_1); + res.add(saved_j[2], dis0 + distance_2); + res.add(saved_j[3], dis0 + distance_3); + + // + counter -= 4; + saved_j[0] = saved_j[4]; + saved_j[1] = saved_j[5]; + saved_j[2] = saved_j[6]; + saved_j[3] = saved_j[7]; + } + } + + for (size_t kk = 0; kk < counter; kk++) { + n_hamming_pass++; + + float dis = dis0 + + distance_single_code( + pq.M, + pq.nbits, + sim_table, + codes + saved_j[kk] * pq.code_size); + + res.add(saved_j[kk], dis); + } + + // process leftovers + for (size_t j = (ncode / 4) * 4; j < ncode; j++) { + if (res.skip_entry(j)) { + continue; + } + const uint8_t* b_code = codes + j * code_size; + int hd = hc.compute(b_code); + if (hd < ht) { + n_hamming_pass++; + + float dis = dis0 + + distance_single_code( + pq.M, + pq.nbits, + sim_table, + codes + j * code_size); + + res.add(j, dis); } - codes += code_size; } + #pragma omp critical { indexIVFPQ_stats.n_hamming_pass += n_hamming_pass; } } + template + struct Run_scan_list_polysemous_hc { + using T = void; + template + void f(const IVFPQScannerT* scanner, Types... args) { + scanner->scan_list_polysemous_hc( + args...); + } + }; + template void scan_list_polysemous( size_t ncode, const uint8_t* codes, - SearchResultType& res, - const BitsetView bitset = nullptr) const { - switch (pq.code_size) { -#define HANDLE_CODE_SIZE(cs) \ - case cs: \ - scan_list_polysemous_hc( \ - ncode, codes, res, bitset); \ - break - HANDLE_CODE_SIZE(4); - HANDLE_CODE_SIZE(8); - HANDLE_CODE_SIZE(16); - HANDLE_CODE_SIZE(20); - HANDLE_CODE_SIZE(32); - HANDLE_CODE_SIZE(64); -#undef HANDLE_CODE_SIZE - default: - scan_list_polysemous_hc< - HammingComputerDefault, - SearchResultType>(ncode, codes, res, bitset); - break; - } + SearchResultType& res) const { + Run_scan_list_polysemous_hc r; + dispatch_HammingComputer(pq.code_size, r, this, ncode, codes, res); } }; /* We put as many parameters as possible in template. Hopefully the - * gain in runtime is worth the code bloat. C is the comparator < or - * >, it is directly related to METRIC_TYPE. precompute_mode is how - * much we precompute (2 = precompute distance tables, 1 = precompute - * pointers to distances, 0 = compute distances one by one). - * Currently only 2 is supported */ -template -struct IVFPQScanner : IVFPQScannerT, + * gain in runtime is worth the code bloat. + * + * C is the comparator < or >, it is directly related to METRIC_TYPE. + * + * precompute_mode is how much we precompute (2 = precompute distance tables, + * 1 = precompute pointers to distances, 0 = compute distances one by one). 
+ * Currently only 2 is supported + * + * use_sel: store or ignore the IDSelector + */ +template +struct IVFPQScanner : IVFPQScannerT, InvertedListScanner { int precompute_mode; + const IDSelector* sel; - IVFPQScanner(const IndexIVFPQ& ivfpq, bool store_pairs, int precompute_mode) - : IVFPQScannerT( - ivfpq, - nullptr), - precompute_mode(precompute_mode) { + IVFPQScanner( + const IndexIVFPQ& ivfpq, + bool store_pairs, + int precompute_mode, + const IDSelector* sel) + : IVFPQScannerT(ivfpq, nullptr), + precompute_mode(precompute_mode), + sel(sel) { this->store_pairs = store_pairs; } @@ -1052,14 +1215,9 @@ struct IVFPQScanner : IVFPQScannerT, float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); - float dis = this->dis0; - const float* tab = this->sim_table; - PQDecoder decoder(code, this->pq.nbits); - - for (size_t m = 0; m < this->pq.M; m++) { - dis += tab[decoder.decode()]; - tab += this->pq.ksub; - } + float dis = this->dis0 + + distance_single_code( + this->pq.M, this->pq.nbits, this->sim_table, code); return dis; } @@ -1070,11 +1228,11 @@ struct IVFPQScanner : IVFPQScannerT, const idx_t* ids, float* heap_sim, idx_t* heap_ids, - size_t k, - const BitsetView bitset = nullptr) const override { - KnnSearchResults res = { + size_t k) const override { + KnnSearchResults res = { /* key */ this->key, /* ids */ this->store_pairs ? nullptr : ids, + /* sel */ this->sel, /* k */ k, /* heap_sim */ heap_sim, /* heap_ids */ heap_ids, @@ -1082,13 +1240,13 @@ struct IVFPQScanner : IVFPQScannerT, if (this->polysemous_ht > 0) { assert(precompute_mode == 2); - this->scan_list_polysemous(ncode, codes, res, bitset); + this->scan_list_polysemous(ncode, codes, res); } else if (precompute_mode == 2) { - this->scan_list_with_table(ncode, codes, res, bitset); + this->scan_list_with_table(ncode, codes, res); } else if (precompute_mode == 1) { - this->scan_list_with_pointer(ncode, codes, res, bitset); + this->scan_list_with_pointer(ncode, codes, res); } else if (precompute_mode == 0) { - this->scan_on_the_fly_dist(ncode, codes, res, bitset); + this->scan_on_the_fly_dist(ncode, codes, res); } else { FAISS_THROW_MSG("bad precomp mode"); } @@ -1101,55 +1259,76 @@ struct IVFPQScanner : IVFPQScannerT, const float* code_norms, const idx_t* ids, float radius, - RangeQueryResult& rres, - const BitsetView bitset = nullptr) const override { - RangeSearchResults res = { + RangeQueryResult& rres) const override { + RangeSearchResults res = { /* key */ this->key, /* ids */ this->store_pairs ? 
nullptr : ids, + /* sel */ this->sel, /* radius */ radius, /* rres */ rres}; if (this->polysemous_ht > 0) { assert(precompute_mode == 2); - this->scan_list_polysemous(ncode, codes, res, bitset); + this->scan_list_polysemous(ncode, codes, res); } else if (precompute_mode == 2) { - this->scan_list_with_table(ncode, codes, res, bitset); + this->scan_list_with_table(ncode, codes, res); } else if (precompute_mode == 1) { - this->scan_list_with_pointer(ncode, codes, res, bitset); + this->scan_list_with_pointer(ncode, codes, res); } else if (precompute_mode == 0) { - this->scan_on_the_fly_dist(ncode, codes, res, bitset); + this->scan_on_the_fly_dist(ncode, codes, res); } else { FAISS_THROW_MSG("bad precomp mode"); } } }; -template +template InvertedListScanner* get_InvertedListScanner1( const IndexIVFPQ& index, - bool store_pairs) { + bool store_pairs, + const IDSelector* sel) { if (index.metric_type == METRIC_INNER_PRODUCT) { return new IVFPQScanner< METRIC_INNER_PRODUCT, CMin, - PQDecoder>(index, store_pairs, 2); + PQDecoder, + use_sel>(index, store_pairs, 2, sel); } else if (index.metric_type == METRIC_L2) { - return new IVFPQScanner, PQDecoder>( - index, store_pairs, 2); + return new IVFPQScanner< + METRIC_L2, + CMax, + PQDecoder, + use_sel>(index, store_pairs, 2, sel); } return nullptr; } +template +InvertedListScanner* get_InvertedListScanner2( + const IndexIVFPQ& index, + bool store_pairs, + const IDSelector* sel) { + if (index.pq.nbits == 8) { + return get_InvertedListScanner1( + index, store_pairs, sel); + } else if (index.pq.nbits == 16) { + return get_InvertedListScanner1( + index, store_pairs, sel); + } else { + return get_InvertedListScanner1( + index, store_pairs, sel); + } +} + } // anonymous namespace InvertedListScanner* IndexIVFPQ::get_InvertedListScanner( - bool store_pairs) const { - if (pq.nbits == 8) { - return get_InvertedListScanner1(*this, store_pairs); - } else if (pq.nbits == 16) { - return get_InvertedListScanner1(*this, store_pairs); + bool store_pairs, + const IDSelector* sel) const { + if (sel) { + return get_InvertedListScanner2(*this, store_pairs, sel); } else { - return get_InvertedListScanner1(*this, store_pairs); + return get_InvertedListScanner2(*this, store_pairs, sel); } return nullptr; } diff --git a/thirdparty/faiss/faiss/IndexIVFPQ.h b/thirdparty/faiss/faiss/IndexIVFPQ.h index 573eff5da..83a4015ea 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQ.h +++ b/thirdparty/faiss/faiss/IndexIVFPQ.h @@ -32,8 +32,6 @@ FAISS_API extern size_t precomputed_table_max_bytes; * vector is encoded as a product quantizer code. */ struct IndexIVFPQ : IndexIVF { - bool by_residual; ///< Encode residual or plain vector? - ProductQuantizer pq; ///< produces the codes bool do_polysemous_training; ///< reorder PQ centroids after training? 
@@ -87,10 +85,9 @@ struct IndexIVFPQ : IndexIVF { const idx_t* precomputed_idx = nullptr); /// trains the product quantizer - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; - /// same as train_residual, also output 2nd level residuals - void train_residual_o(idx_t n, const float* x, float* residuals_2); + idx_t train_encoder_num_vectors() const override; void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; @@ -135,7 +132,8 @@ struct IndexIVFPQ : IndexIVF { float* x) const; InvertedListScanner* get_InvertedListScanner( - bool store_pairs) const override; + bool store_pairs, + const IDSelector* sel) const override; /// build precomputed table void precompute_table(); @@ -143,6 +141,9 @@ struct IndexIVFPQ : IndexIVF { IndexIVFPQ(); }; +// block size used in IndexIVFPQ::add_core_o +FAISS_API extern int index_ivfpq_add_core_o_bs; + /** Pre-compute distance tables for IVFPQ with by-residual and METRIC_L2 * * @param use_precomputed_table (I/O) @@ -159,6 +160,7 @@ void initialize_IVFPQ_precomputed_table( const Index* quantizer, const ProductQuantizer& pq, AlignedTable& precomputed_table, + bool by_residual, bool verbose); /// statistics are robust to internal threading, but not if diff --git a/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp b/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp index 9342dcc0c..af9346d14 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexIVFPQFastScan.cpp @@ -5,14 +5,12 @@ * LICENSE file in the root directory of this source tree. */ -#include - #include #include #include #include -#include + #include #include @@ -29,6 +27,8 @@ #include #include +#include + namespace faiss { using namespace simd_result_handlers; @@ -42,19 +42,13 @@ IndexIVFPQFastScan::IndexIVFPQFastScan( size_t d, size_t nlist, size_t M, - size_t nbits_per_idx, + size_t nbits, MetricType metric, int bbs) - : IndexIVF(quantizer, d, nlist, 0, metric), - pq(d, M, nbits_per_idx), - bbs(bbs) { - FAISS_THROW_IF_NOT(nbits_per_idx == 4); - M2 = roundup(pq.M, 2); - by_residual = false; // set to false by default because it's much faster - is_trained = false; - code_size = pq.code_size; + : IndexIVFFastScan(quantizer, d, nlist, 0, metric), pq(d, M, nbits) { + by_residual = false; // set to false by default because it's faster - replace_invlists(new BlockInvertedLists(nlist, bbs, bbs * M2 / 2), true); + init_fastscan(M, nbits, nlist, metric, bbs); } IndexIVFPQFastScan::IndexIVFPQFastScan( @@ -67,7 +61,7 @@ IndexIVFPQFastScan::IndexIVFPQFastScan( MetricType metric, int bbs) : IndexIVFPQFastScan(quantizer, d, nlist, M, nbits_per_idx, metric, bbs) { - is_cosine_ = is_cosine; + this->is_cosine = is_cosine; } IndexIVFPQFastScan::IndexIVFPQFastScan() { @@ -77,26 +71,21 @@ IndexIVFPQFastScan::IndexIVFPQFastScan() { } IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs) - : IndexIVF( + : IndexIVFFastScan( orig.quantizer, orig.d, orig.nlist, orig.pq.code_size, orig.metric_type), - pq(orig.pq), - bbs(bbs) { + pq(orig.pq) { FAISS_THROW_IF_NOT(orig.pq.nbits == 4); + init_fastscan(orig.pq.M, orig.pq.nbits, orig.nlist, orig.metric_type, bbs); + by_residual = orig.by_residual; ntotal = orig.ntotal; is_trained = orig.is_trained; nprobe = orig.nprobe; - size_t M = pq.M; - - M2 = roundup(M, 2); - - replace_invlists( - new BlockInvertedLists(orig.nlist, bbs, bbs * M2 / 2), true); precomputed_table.resize(orig.precomputed_table.size()); @@ -132,71 +121,32 
@@ IndexIVFPQFastScan::IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs) * Training *********************************************************/ -void IndexIVFPQFastScan::train_residual(idx_t n, const float* x_in) { - const float* x = fvecs_maybe_subsample( - d, - (size_t*)&n, - pq.cp.max_points_per_centroid * pq.ksub, - x_in, - verbose, - pq.cp.seed); - - std::unique_ptr del_x; - if (x != x_in) { - del_x.reset((float*)x); - } - - const float* trainset; - AlignedTable residuals; - - if (by_residual) { - if (verbose) - printf("computing residuals\n"); - std::vector assign(n); - quantizer->assign(n, x, assign.data()); - residuals.resize(n * d); - for (idx_t i = 0; i < n; i++) { - quantizer->compute_residual( - x + i * d, residuals.data() + i * d, assign[i]); - } - trainset = residuals.data(); - } else { - trainset = x; - } - - if (verbose) { - printf("training %zdx%zd product quantizer on " - "%" PRId64 " vectors in %dD\n", - pq.M, - pq.ksub, - n, - d); - } +void IndexIVFPQFastScan::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { pq.verbose = verbose; - pq.train(n, trainset); + pq.train(n, x); if (by_residual && metric_type == METRIC_L2) { precompute_table(); } } -void IndexIVFPQFastScan::precompute_table() { - initialize_IVFPQ_precomputed_table( - use_precomputed_table, quantizer, pq, precomputed_table, verbose); +idx_t IndexIVFPQFastScan::train_encoder_num_vectors() const { + return pq.cp.max_points_per_centroid * pq.ksub; } -void IndexIVFPQFastScan::train(idx_t n, const float* x) { - if (is_cosine_) { - auto norm_data = std::make_unique(n * d); - std::memcpy(norm_data.get(), x, n * d * sizeof(float)); - knowhere::NormalizeVecs(norm_data.get(), n, d); - IndexIVF::train(n, norm_data.get()); - } else { - IndexIVF::train(n, x); - } +void IndexIVFPQFastScan::precompute_table() { + initialize_IVFPQ_precomputed_table( + use_precomputed_table, + quantizer, + pq, + precomputed_table, + by_residual, + verbose); } - /********************************************************* * Code management functions *********************************************************/ @@ -232,170 +182,16 @@ void IndexIVFPQFastScan::encode_vectors( } } -void IndexIVFPQFastScan::add_with_ids_impl( - idx_t n, - const float* x, - const idx_t* xids) { - // copied from IndexIVF::add_with_ids ---> - - // do some blocking to avoid excessive allocs - idx_t bs = 65536; - if (n > bs) { - for (idx_t i0 = 0; i0 < n; i0 += bs) { - idx_t i1 = std::min(n, i0 + bs); - if (verbose) { - printf(" IndexIVFPQFastScan::add_with_ids %zd: %zd", - size_t(i0), - size_t(i1)); - } - add_with_ids_impl(i1 - i0, x + i0 * d, xids ? 
xids + i0 : nullptr); - } - return; - } - InterruptCallback::check(); - - AlignedTable codes(n * code_size); - - FAISS_THROW_IF_NOT(is_trained); - direct_map.check_can_add(xids); - - std::unique_ptr idx(new idx_t[n]); - quantizer->assign(n, x, idx.get()); - size_t nadd = 0, nminus1 = 0; - - for (size_t i = 0; i < n; i++) { - if (idx[i] < 0) - nminus1++; - } - - AlignedTable flat_codes(n * code_size); - encode_vectors(n, x, idx.get(), flat_codes.get()); - - DirectMapAdd dm_adder(direct_map, n, xids); - - // <--- - - BlockInvertedLists* bil = dynamic_cast(invlists); - FAISS_THROW_IF_NOT_MSG(bil, "only block inverted lists supported"); - - // prepare batches - std::vector order(n); - for (idx_t i = 0; i < n; i++) { - order[i] = i; - } - - // TODO should not need stable - std::stable_sort(order.begin(), order.end(), [&idx](idx_t a, idx_t b) { - return idx[a] < idx[b]; - }); - - // TODO parallelize - idx_t i0 = 0; - while (i0 < n) { - idx_t list_no = idx[order[i0]]; - idx_t i1 = i0 + 1; - while (i1 < n && idx[order[i1]] == list_no) { - i1++; - } - - if (list_no == -1) { - i0 = i1; - continue; - } - - // make linear array - AlignedTable list_codes((i1 - i0) * code_size); - size_t list_size = bil->list_size(list_no); - - bil->resize(list_no, list_size + i1 - i0); - - for (idx_t i = i0; i < i1; i++) { - size_t ofs = list_size + i - i0; - idx_t id = xids ? xids[order[i]] : ntotal + order[i]; - dm_adder.add(order[i], list_no, ofs); - bil->ids[list_no][ofs] = id; - memcpy(list_codes.data() + (i - i0) * code_size, - flat_codes.data() + order[i] * code_size, - code_size); - nadd++; - } - pq4_pack_codes_range( - list_codes.data(), - pq.M, - list_size, - list_size + i1 - i0, - bbs, - M2, - bil->codes[list_no].data()); - - i0 = i1; - } - - ntotal += n; -} - -void IndexIVFPQFastScan::add_with_ids( - idx_t n, - const float* x, - const idx_t* xids) { - if (is_cosine_) { - auto norm_data = std::make_unique(n * d); - std::memcpy(norm_data.get(), x, n * d * sizeof(float)); - norms = std::move(knowhere::NormalizeVecs(norm_data.get(), n, d)); - add_with_ids_impl(n, norm_data.get(), xids); - } else { - add_with_ids_impl(n, x, xids); - } -} - /********************************************************* - * search + * Look-Up Table functions *********************************************************/ -namespace { - -// from impl/ProductQuantizer.cpp -template -void pq_estimators_from_tables_generic( - const ProductQuantizer& pq, - size_t nbits, - const uint8_t* codes, - size_t ncodes, - const dis_t* dis_table, - const int64_t* ids, - float dis0, - size_t k, - typename C::T* heap_dis, - int64_t* heap_ids) { - using accu_t = typename C::T; - const size_t M = pq.M; - const size_t ksub = pq.ksub; - for (size_t j = 0; j < ncodes; ++j) { - PQDecoderGeneric decoder(codes + j * pq.code_size, nbits); - accu_t dis = dis0; - const dis_t* dt = dis_table; - for (size_t m = 0; m < M; m++) { - uint64_t c = decoder.decode(); - dis += dt[c]; - dt += ksub; - } - - if (C::cmp(heap_dis[0], dis)) { - heap_pop(k, heap_dis, heap_ids); - heap_push(k, heap_dis, heap_ids, dis, ids[j]); - } - } -} - -using idx_t = Index::idx_t; -using namespace quantize_lut; - -void fvec_madd_avx( +void fvec_madd_avx_internal( size_t n, - const float* a, + const float* __restrict a, float bf, - const float* b, - float* c) { + const float* __restrict b, + float* __restrict c) { assert(is_aligned_pointer(a)); assert(is_aligned_pointer(b)); assert(is_aligned_pointer(c)); @@ -414,11 +210,9 @@ void fvec_madd_avx( } } -} // anonymous namespace - 
-/********************************************************* - * Look-Up Table functions - *********************************************************/ +bool IndexIVFPQFastScan::lookup_table_is_3d() const { + return by_residual && metric_type == METRIC_L2; +} void IndexIVFPQFastScan::compute_LUT( size_t n, @@ -427,16 +221,14 @@ void IndexIVFPQFastScan::compute_LUT( const float* coarse_dis, AlignedTable& dis_tables, AlignedTable& biases) const { - const IndexIVFPQFastScan& ivfpq = *this; size_t dim12 = pq.ksub * pq.M; size_t d = pq.d; - size_t nprobe = ivfpq.nprobe; - if (ivfpq.by_residual) { - if (ivfpq.metric_type == METRIC_L2) { + if (by_residual) { + if (metric_type == METRIC_L2) { dis_tables.resize(n * nprobe * dim12); - if (ivfpq.use_precomputed_table == 1) { + if (use_precomputed_table == 1) { biases.resize(n * nprobe); memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe); @@ -450,7 +242,7 @@ void IndexIVFPQFastScan::compute_LUT( idx_t cij = coarse_ids[ij]; if (cij >= 0) { - fvec_madd_avx( + fvec_madd_avx_internal( dim12, precomputed_table.get() + cij * dim12, -2, @@ -475,7 +267,7 @@ void IndexIVFPQFastScan::compute_LUT( idx_t cij = coarse_ids[ij]; if (cij >= 0) { - ivfpq.quantizer->compute_residual(x + i * d, xij, cij); + quantizer->compute_residual(x + i * d, xij, cij); } else { // will fill with NaNs memset(xij, -1, sizeof(float) * d); @@ -486,7 +278,7 @@ void IndexIVFPQFastScan::compute_LUT( n * nprobe, xrel.get(), dis_tables.get()); } - } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) { + } else if (metric_type == METRIC_INNER_PRODUCT) { dis_tables.resize(n * dim12); pq.compute_inner_prod_tables(n, x, dis_tables.get()); // compute_inner_prod_tables(pq, n, x, dis_tables.get()); @@ -494,936 +286,24 @@ void IndexIVFPQFastScan::compute_LUT( biases.resize(n * nprobe); memcpy(biases.get(), coarse_dis, sizeof(float) * n * nprobe); } else { - FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type); + FAISS_THROW_FMT("metric %d not supported", metric_type); } } else { dis_tables.resize(n * dim12); - if (ivfpq.metric_type == METRIC_L2) { + if (metric_type == METRIC_L2) { pq.compute_distance_tables(n, x, dis_tables.get()); - } else if (ivfpq.metric_type == METRIC_INNER_PRODUCT) { + } else if (metric_type == METRIC_INNER_PRODUCT) { pq.compute_inner_prod_tables(n, x, dis_tables.get()); } else { - FAISS_THROW_FMT("metric %d not supported", ivfpq.metric_type); - } - } -} - -void IndexIVFPQFastScan::compute_LUT_uint8( - size_t n, - const float* x, - const idx_t* coarse_ids, - const float* coarse_dis, - AlignedTable& dis_tables, - AlignedTable& biases, - float* normalizers) const { - const IndexIVFPQFastScan& ivfpq = *this; - AlignedTable dis_tables_float; - AlignedTable biases_float; - - compute_LUT(n, x, coarse_ids, coarse_dis, dis_tables_float, biases_float); - - bool lut_is_3d = ivfpq.by_residual && ivfpq.metric_type == METRIC_L2; - size_t dim123 = pq.ksub * pq.M; - size_t dim123_2 = pq.ksub * M2; - if (lut_is_3d) { - dim123 *= nprobe; - dim123_2 *= nprobe; - } - dis_tables.resize(n * dim123_2); - if (biases_float.get()) { - biases.resize(n * nprobe); - } - uint64_t t1 = get_cy(); - -#pragma omp parallel for if (n > 100) - for (int64_t i = 0; i < n; i++) { - const float* t_in = dis_tables_float.get() + i * dim123; - const float* b_in = nullptr; - uint8_t* t_out = dis_tables.get() + i * dim123_2; - uint16_t* b_out = nullptr; - if (biases_float.get()) { - b_in = biases_float.get() + i * nprobe; - b_out = biases.get() + i * nprobe; - } - - quantize_LUT_and_bias( - nprobe, - 
pq.M, - pq.ksub, - lut_is_3d, - t_in, - b_in, - t_out, - M2, - b_out, - normalizers + 2 * i, - normalizers + 2 * i + 1); - } -} - -/********************************************************* - * Search functions - *********************************************************/ - -template -void IndexIVFPQFastScan::range_search_dispatch_implem( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - const IVFSearchParameters* params, - const BitsetView bitset) const { - idx_t nprobe = params ? params->nprobe : this->nprobe; - - using Cfloat = typename std::conditional< - is_max, - CMax, - CMin>::type; - - if (n == 0) { - return; - } - size_t ndis = 0, nlist_visited = 0; - range_search_implem_12( - n, - x, - radius, - result, - &ndis, - &nlist_visited, - nprobe, - bitset); -} - -template -void IndexIVFPQFastScan::search_dispatch_implem( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const IVFSearchParameters* params, - const BitsetView bitset) const { - idx_t nprobe = params ? params->nprobe : this->nprobe; - - using Cfloat = typename std::conditional< - is_max, - CMax, - CMin>::type; - - using C = typename std::conditional< - is_max, - CMax, - CMin>::type; - - if (n == 0) { - return; - } - - // actual implementation used - int impl = implem; - - if (impl == 0) { - if (bbs == 32) { - impl = 12; - } else { - impl = 10; - } - if (k > 20) { - impl++; - } - } - - if (impl == 1) { - search_implem_1(n, x, k, distances, labels, nprobe, bitset); - } else if (impl == 2) { - search_implem_2(n, x, k, distances, labels, nprobe, bitset); - - } else if (impl >= 10 && impl <= 13) { - size_t ndis = 0, nlist_visited = 0; - - if (n < 2) { - if (impl == 12 || impl == 13) { - search_implem_12( - n, - x, - k, - distances, - labels, - impl, - &ndis, - &nlist_visited, - nprobe, - bitset); - } else { - search_implem_10( - n, - x, - k, - distances, - labels, - impl, - &ndis, - &nlist_visited, - nprobe, - bitset); - } - } else { - // explicitly slice over threads - int nslice; - if (n <= omp_get_max_threads()) { - nslice = n; - } else if (by_residual && metric_type == METRIC_L2) { - // make sure we don't make too big LUT tables - size_t lut_size_per_query = pq.M * pq.ksub * nprobe * - (sizeof(float) + sizeof(uint8_t)); - - size_t max_lut_size = precomputed_table_max_bytes; - // how many queries we can handle within mem budget - size_t nq_ok = - std::max(max_lut_size / lut_size_per_query, size_t(1)); - nslice = - roundup(std::max(size_t(n / nq_ok), size_t(1)), - omp_get_max_threads()); - } else { - // LUTs unlikely to be a limiting factor - nslice = omp_get_max_threads(); - } - -#pragma omp parallel for reduction(+ : ndis, nlist_visited) - for (int slice = 0; slice < nslice; slice++) { - idx_t i0 = n * slice / nslice; - idx_t i1 = n * (slice + 1) / nslice; - float* dis_i = distances + i0 * k; - idx_t* lab_i = labels + i0 * k; - if (impl == 12 || impl == 13) { - search_implem_12( - i1 - i0, - x + i0 * d, - k, - dis_i, - lab_i, - impl, - &ndis, - &nlist_visited, - nprobe, - bitset); - } else { - search_implem_10( - i1 - i0, - x + i0 * d, - k, - dis_i, - lab_i, - impl, - &ndis, - &nlist_visited, - nprobe, - bitset); - } - } - } - indexIVF_stats.nq += n; - indexIVF_stats.ndis += ndis; - indexIVF_stats.nlist += nlist_visited; - } else { - FAISS_THROW_FMT("implem %d does not exist", implem); - } -} - -void IndexIVFPQFastScan::search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); - - if 
(metric_type == METRIC_L2) { - search_dispatch_implem(n, x, k, distances, labels, nullptr, bitset); - } else { - search_dispatch_implem(n, x, k, distances, labels, nullptr, bitset); - } -} - -void IndexIVFPQFastScan::search_thread_safe( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const size_t nprobe, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); - const size_t final_nprobe = std::min(nlist, nprobe); - FAISS_THROW_IF_NOT(final_nprobe > 0); - IVFSearchParameters params; - params.nprobe = final_nprobe; - - if (metric_type == METRIC_L2) { - search_dispatch_implem(n, x, k, distances, labels, ¶ms, bitset); - } else { - search_dispatch_implem(n, x, k, distances, labels, ¶ms, bitset); - } -} - -void IndexIVFPQFastScan::range_search_thread_safe( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - const size_t nprobe, - const BitsetView bitset) const { - const size_t final_nprobe = std::min(nlist, nprobe); - FAISS_THROW_IF_NOT(final_nprobe > 0); - IVFSearchParameters params; - params.nprobe = final_nprobe; - - if (metric_type == METRIC_L2) { - range_search_dispatch_implem( - n, x, radius, result, ¶ms, bitset); - } else { - range_search_dispatch_implem( - n, x, radius, result, ¶ms, bitset); - } -} - -template -void IndexIVFPQFastScan::search_implem_1( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - idx_t nprobe, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(orig_invlists); - - std::unique_ptr coarse_ids(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new float[n * nprobe]); - - quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get()); - - size_t dim12 = pq.ksub * pq.M; - AlignedTable dis_tables; - AlignedTable biases; - - compute_LUT(n, x, coarse_ids.get(), coarse_dis.get(), dis_tables, biases); - - bool single_LUT = !(by_residual && metric_type == METRIC_L2); - - size_t ndis = 0, nlist_visited = 0; - -#pragma omp parallel for reduction(+ : ndis, nlist_visited) - for (idx_t i = 0; i < n; i++) { - int64_t* heap_ids = labels + i * k; - float* heap_dis = distances + i * k; - heap_heapify(k, heap_dis, heap_ids); - float* LUT = nullptr; - - if (single_LUT) { - LUT = dis_tables.get() + i * dim12; - } - for (idx_t j = 0; j < nprobe; j++) { - if (!single_LUT) { - LUT = dis_tables.get() + (i * nprobe + j) * dim12; - } - idx_t list_no = coarse_ids[i * nprobe + j]; - if (list_no < 0) - continue; - size_t ls = orig_invlists->list_size(list_no); - if (ls == 0) - continue; - InvertedLists::ScopedCodes codes(orig_invlists, list_no); - InvertedLists::ScopedIds ids(orig_invlists, list_no); - - float bias = biases.get() ? 
biases[i * nprobe + j] : 0; - - pq_estimators_from_tables_generic( - pq, - pq.nbits, - codes.get(), - ls, - LUT, - ids.get(), - bias, - k, - heap_dis, - heap_ids); - nlist_visited++; - ndis++; - } - heap_reorder(k, heap_dis, heap_ids); - } - indexIVF_stats.nq += n; - indexIVF_stats.ndis += ndis; - indexIVF_stats.nlist += nlist_visited; -} - -template -void IndexIVFPQFastScan::search_implem_2( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - idx_t nprobe, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(orig_invlists); - - std::unique_ptr coarse_ids(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new float[n * nprobe]); - - quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get()); - - size_t dim12 = pq.ksub * M2; - AlignedTable dis_tables; - AlignedTable biases; - std::unique_ptr normalizers(new float[2 * n]); - - compute_LUT_uint8( - n, - x, - coarse_ids.get(), - coarse_dis.get(), - dis_tables, - biases, - normalizers.get()); - - bool single_LUT = !(by_residual && metric_type == METRIC_L2); - - size_t ndis = 0, nlist_visited = 0; - -#pragma omp parallel for reduction(+ : ndis, nlist_visited) - for (idx_t i = 0; i < n; i++) { - std::vector tmp_dis(k); - int64_t* heap_ids = labels + i * k; - uint16_t* heap_dis = tmp_dis.data(); - heap_heapify(k, heap_dis, heap_ids); - const uint8_t* LUT = nullptr; - - if (single_LUT) { - LUT = dis_tables.get() + i * dim12; - } - for (idx_t j = 0; j < nprobe; j++) { - if (!single_LUT) { - LUT = dis_tables.get() + (i * nprobe + j) * dim12; - } - idx_t list_no = coarse_ids[i * nprobe + j]; - if (list_no < 0) - continue; - size_t ls = orig_invlists->list_size(list_no); - if (ls == 0) - continue; - InvertedLists::ScopedCodes codes(orig_invlists, list_no); - InvertedLists::ScopedIds ids(orig_invlists, list_no); - - uint16_t bias = biases.get() ? 
biases[i * nprobe + j] : 0; - - pq_estimators_from_tables_generic( - pq, - pq.nbits, - codes.get(), - ls, - LUT, - ids.get(), - bias, - k, - heap_dis, - heap_ids); - - nlist_visited++; - ndis += ls; - } - heap_reorder(k, heap_dis, heap_ids); - // convert distances to float - { - float one_a = 1 / normalizers[2 * i], b = normalizers[2 * i + 1]; - if (skip & 16) { - one_a = 1; - b = 0; - } - float* heap_dis_float = distances + i * k; - for (int j = 0; j < k; j++) { - heap_dis_float[j] = b + heap_dis[j] * one_a; - } - } - } - indexIVF_stats.nq += n; - indexIVF_stats.ndis += ndis; - indexIVF_stats.nlist += nlist_visited; -} - -template -void IndexIVFPQFastScan::search_implem_10( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl, - size_t* ndis_out, - size_t* nlist_out, - idx_t nprobe, - const BitsetView bitset) const { - memset(distances, -1, sizeof(float) * k * n); - memset(labels, -1, sizeof(idx_t) * k * n); - - using HeapHC = HeapHandler; - using ReservoirHC = ReservoirHandler; - using SingleResultHC = SingleResultHandler; - - std::unique_ptr coarse_ids(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new float[n * nprobe]); - - uint64_t times[10]; - memset(times, 0, sizeof(times)); - int ti = 0; -#define TIC times[ti++] = get_cy() - TIC; - - quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get()); - - TIC; - - size_t dim12 = pq.ksub * M2; - AlignedTable dis_tables; - AlignedTable biases; - std::unique_ptr normalizers(new float[2 * n]); - - compute_LUT_uint8( - n, - x, - coarse_ids.get(), - coarse_dis.get(), - dis_tables, - biases, - normalizers.get()); - - TIC; - - bool single_LUT = !(by_residual && metric_type == METRIC_L2); - - TIC; - size_t ndis = 0, nlist_visited = 0; - - { - AlignedTable tmp_distances(k); - for (idx_t i = 0; i < n; i++) { - const uint8_t* LUT = nullptr; - int qmap1[1] = {0}; - std::unique_ptr> handler; - - if (k == 1) { - handler.reset(new SingleResultHC(1, 0, bitset)); - } else if (impl == 10) { - handler.reset(new HeapHC( - 1, tmp_distances.get(), labels + i * k, k, 0, bitset)); - } else if (impl == 11) { - handler.reset(new ReservoirHC(1, 0, k, 2 * k, bitset)); - } else { - FAISS_THROW_MSG("invalid"); - } - - handler->q_map = qmap1; - - if (single_LUT) { - LUT = dis_tables.get() + i * dim12; - } - for (idx_t j = 0; j < nprobe; j++) { - size_t ij = i * nprobe + j; - if (!single_LUT) { - LUT = dis_tables.get() + ij * dim12; - } - if (biases.get()) { - handler->dbias = biases.get() + ij; - } - - idx_t list_no = coarse_ids[ij]; - if (list_no < 0) - continue; - size_t ls = invlists->list_size(list_no); - if (ls == 0) - continue; - - InvertedLists::ScopedCodes codes(invlists, list_no); - InvertedLists::ScopedIds ids(invlists, list_no); - - handler->ntotal = ls; - handler->id_map = ids.get(); - -#define DISPATCH(classHC) \ - if (dynamic_cast(handler.get())) { \ - auto* res = static_cast(handler.get()); \ - pq4_accumulate_loop( \ - 1, roundup(ls, bbs), bbs, M2, codes.get(), LUT, *res); \ - } - DISPATCH(HeapHC) - else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC) -#undef DISPATCH - - nlist_visited++; - ndis++; - } - - handler->to_flat_arrays( - distances + i * k, - labels + i * k, - skip & 16 ? 
nullptr : normalizers.get() + i * 2); - } - } - *ndis_out = ndis; - *nlist_out = nlist; -} - -template -void IndexIVFPQFastScan::search_implem_12( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl, - size_t* ndis_out, - size_t* nlist_out, - idx_t nprobe, - const BitsetView bitset) const { - if (n == 0) { // does not work well with reservoir - return; - } - FAISS_THROW_IF_NOT(bbs == 32); - - std::unique_ptr coarse_ids(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new float[n * nprobe]); - - uint64_t times[10]; - memset(times, 0, sizeof(times)); - int ti = 0; -#define TIC times[ti++] = get_cy() - TIC; - - quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get()); - - TIC; - - size_t dim12 = pq.ksub * M2; - AlignedTable dis_tables; - AlignedTable biases; - std::unique_ptr normalizers(new float[2 * n]); - - compute_LUT_uint8( - n, - x, - coarse_ids.get(), - coarse_dis.get(), - dis_tables, - biases, - normalizers.get()); - - TIC; - - struct QC { - int qno; // sequence number of the query - int list_no; // list to visit - int rank; // this is the rank'th result of the coarse quantizer - }; - bool single_LUT = !(by_residual && metric_type == METRIC_L2); - - std::vector qcs; - { - int ij = 0; - for (int i = 0; i < n; i++) { - for (int j = 0; j < nprobe; j++) { - if (coarse_ids[ij] >= 0) { - qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)}); - } - ij++; - } - } - std::sort(qcs.begin(), qcs.end(), [](const QC& a, const QC& b) { - return a.list_no < b.list_no; - }); - } - TIC; - - // prepare the result handlers - - std::unique_ptr> handler; - AlignedTable tmp_distances; - - using HeapHC = HeapHandler; - using ReservoirHC = ReservoirHandler; - using SingleResultHC = SingleResultHandler; - - if (k == 1) { - handler.reset(new SingleResultHC(n, 0, bitset)); - } else if (impl == 12) { - tmp_distances.resize(n * k); - handler.reset(new HeapHC(n, tmp_distances.get(), labels, k, 0, bitset)); - } else if (impl == 13) { - handler.reset(new ReservoirHC(n, 0, k, 2 * k, bitset)); - } - - int qbs2 = this->qbs2 ? this->qbs2 : 11; - - std::vector tmp_bias; - if (biases.get()) { - tmp_bias.resize(qbs2); - handler->dbias = tmp_bias.data(); - } - TIC; - - size_t ndis = 0; - - size_t i0 = 0; - uint64_t t_copy_pack = 0, t_scan = 0; - while (i0 < qcs.size()) { - uint64_t tt0 = get_cy(); - - // find all queries that access this inverted list - int list_no = qcs[i0].list_no; - size_t i1 = i0 + 1; - - while (i1 < qcs.size() && i1 < i0 + qbs2) { - if (qcs[i1].list_no != list_no) { - break; - } - i1++; - } - - size_t list_size = invlists->list_size(list_no); - - if (list_size == 0) { - i0 = i1; - continue; + FAISS_THROW_FMT("metric %d not supported", metric_type); } - - // re-organize LUTs and biases into the right order - int nc = i1 - i0; - - std::vector q_map(nc), lut_entries(nc); - AlignedTable LUT(nc * dim12); - memset(LUT.get(), -1, nc * dim12); - int qbs = pq4_preferred_qbs(nc); - - for (size_t i = i0; i < i1; i++) { - const QC& qc = qcs[i]; - q_map[i - i0] = qc.qno; - int ij = qc.qno * nprobe + qc.rank; - lut_entries[i - i0] = single_LUT ? 
qc.qno : ij; - if (biases.get()) { - tmp_bias[i - i0] = biases[ij]; - } - } - pq4_pack_LUT_qbs_q_map( - qbs, M2, dis_tables.get(), lut_entries.data(), LUT.get()); - - // access the inverted list - - ndis += (i1 - i0) * list_size; - - InvertedLists::ScopedCodes codes(invlists, list_no); - InvertedLists::ScopedIds ids(invlists, list_no); - - // prepare the handler - - handler->ntotal = list_size; - handler->q_map = q_map.data(); - handler->id_map = ids.get(); - uint64_t tt1 = get_cy(); - -#define DISPATCH(classHC) \ - if (dynamic_cast(handler.get())) { \ - auto* res = static_cast(handler.get()); \ - pq4_accumulate_loop_qbs( \ - qbs, list_size, M2, codes.get(), LUT.get(), *res); \ } - DISPATCH(HeapHC) - else DISPATCH(ReservoirHC) else DISPATCH(SingleResultHC) - - // prepare for next loop - i0 = i1; - - uint64_t tt2 = get_cy(); - t_copy_pack += tt1 - tt0; - t_scan += tt2 - tt1; - } - TIC; - - // labels is in-place for HeapHC - handler->to_flat_arrays( - distances, labels, skip & 16 ? nullptr : normalizers.get()); - - TIC; - - *ndis_out = ndis; - *nlist_out = nlist; } -template -void IndexIVFPQFastScan::range_search_implem_12( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - size_t* ndis_out, - size_t* nlist_out, - idx_t nprobe, - const BitsetView bitset) const { - if (n == 0) { // does not work well with reservoir - return; - } - FAISS_THROW_IF_NOT(n == 1); // in knowhere, all request will make nq=1 - FAISS_THROW_IF_NOT(bbs == 32); - - std::unique_ptr coarse_ids(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new float[n * nprobe]); - - uint64_t times[10]; - memset(times, 0, sizeof(times)); - int ti = 0; -#define TIC times[ti++] = get_cy() - TIC; - - quantizer->search(n, x, nprobe, coarse_dis.get(), coarse_ids.get()); - - TIC; - - size_t dim12 = pq.ksub * M2; - AlignedTable dis_tables; - AlignedTable biases; - std::unique_ptr normalizers(new float[2 * n]); - - compute_LUT_uint8( - n, - x, - coarse_ids.get(), - coarse_dis.get(), - dis_tables, - biases, - normalizers.get()); - - TIC; - - struct QC { - int qno; // sequence number of the query - int list_no; // list to visit - int rank; // this is the rank'th result of the coarse quantizer - }; - bool single_LUT = !(by_residual && metric_type == METRIC_L2); - - std::vector qcs; - { - int ij = 0; - for (int i = 0; i < n; i++) { - for (int j = 0; j < nprobe; j++) { - if (coarse_ids[ij] >= 0) { - qcs.push_back(QC{i, int(coarse_ids[ij]), int(j)}); - } - ij++; - } - } - std::sort(qcs.begin(), qcs.end(), [](const QC& a, const QC& b) { - return a.list_no < b.list_no; - }); - } - TIC; - - // prepare the result handlers - std::unique_ptr> handler(new RangeSearchResultHandler(result, radius, 0, bitset)); - handler->normalizers = normalizers.get(); - int qbs2 = this->qbs2 ? 
this->qbs2 : 11; - - std::vector tmp_bias; - if (biases.get()) { - tmp_bias.resize(qbs2); - handler->dbias = tmp_bias.data(); - } - TIC; - - size_t ndis = 0; - - size_t i0 = 0; - uint64_t t_copy_pack = 0, t_scan = 0; - while (i0 < qcs.size()) { - uint64_t tt0 = get_cy(); - - // find all queries that access this inverted list - int list_no = qcs[i0].list_no; - size_t i1 = i0 + 1; - - while (i1 < qcs.size() && i1 < i0 + qbs2) { - if (qcs[i1].list_no != list_no) { - break; - } - i1++; - } - - size_t list_size = invlists->list_size(list_no); - - if (list_size == 0) { - i0 = i1; - continue; - } - - // re-organize LUTs and biases into the right order - int nc = i1 - i0; - - std::vector q_map(nc), lut_entries(nc); - AlignedTable LUT(nc * dim12); - memset(LUT.get(), -1, nc * dim12); - int qbs = pq4_preferred_qbs(nc); - - for (size_t i = i0; i < i1; i++) { - const QC& qc = qcs[i]; - q_map[i - i0] = qc.qno; - int ij = qc.qno * nprobe + qc.rank; - lut_entries[i - i0] = single_LUT ? qc.qno : ij; - if (biases.get()) { - tmp_bias[i - i0] = biases[ij]; - } - } - pq4_pack_LUT_qbs_q_map( - qbs, M2, dis_tables.get(), lut_entries.data(), - LUT.get()); - - // access the inverted list - - ndis += (i1 - i0) * list_size; - - InvertedLists::ScopedCodes codes(invlists, list_no); - InvertedLists::ScopedIds ids(invlists, list_no); - - // prepare the handler - - handler->ntotal = list_size; - handler->q_map = q_map.data(); - handler->id_map = ids.get(); - handler->in_range_num = 0; - uint64_t tt1 = get_cy(); - - pq4_accumulate_loop_qbs(qbs, list_size, M2, codes.get(), LUT.get(), *(handler.get())); - if (handler->in_range_num <= 0) { - break; - } - - // prepare for next loop - i0 = i1; - - uint64_t tt2 = get_cy(); - t_copy_pack += tt1 - tt0; - t_scan += tt2 - tt1; - } - TIC; - - handler->to_result(); - - TIC; - - *ndis_out = ndis; - *nlist_out = nlist; +void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x) + const { + pq.decode(bytes, x, n); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFPQFastScan.h b/thirdparty/faiss/faiss/IndexIVFPQFastScan.h index a57d60b90..66915ad7b 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQFastScan.h +++ b/thirdparty/faiss/faiss/IndexIVFPQFastScan.h @@ -9,6 +9,7 @@ #include +#include #include #include #include @@ -31,39 +32,21 @@ namespace faiss { * 13: idem, collect results in reservoir */ -struct IndexIVFPQFastScan : IndexIVF { - bool by_residual; ///< Encode residual or plain vector? 
+struct IndexIVFPQFastScan : IndexIVFFastScan { ProductQuantizer pq; ///< produces the codes - // size of the kernel - int bbs; // set at build time - - // M rounded up to a multiple of 2 - size_t M2; - /// precomputed tables management int use_precomputed_table = 0; /// if use_precompute_table size (nlist, pq.M, pq.ksub) AlignedTable precomputed_table; - // search-time implementation - int implem = 0; - // skip some parts of the computation (for timing) - int skip = 0; - - // batching factors at search time (0 = default) - int qbs = 0; - size_t qbs2 = 0; - - bool is_cosine_ = false; - std::vector norms; - + // todo agzuhva: add back cosine support from knowhere IndexIVFPQFastScan( Index* quantizer, size_t d, size_t nlist, size_t M, - size_t nbits_per_idx, + size_t nbits, MetricType metric = METRIC_L2, int bbs = 32); @@ -72,7 +55,7 @@ struct IndexIVFPQFastScan : IndexIVF { size_t d, size_t nlist, size_t M, - size_t nbits_per_idx, + size_t nbits, bool is_cosine, MetricType metric = METRIC_L2, int bbs = 32); @@ -82,16 +65,9 @@ struct IndexIVFPQFastScan : IndexIVF { // built from an IndexIVFPQ explicit IndexIVFPQFastScan(const IndexIVFPQ& orig, int bbs = 32); - /// orig's inverted lists (for debugging) - InvertedLists* orig_invlists = nullptr; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; - void train_residual(idx_t n, const float* x) override; - - void train(idx_t n, const float* x) override; - - void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; - - void add_with_ids_impl(idx_t n, const float* x, const idx_t* xids); + idx_t train_encoder_num_vectors() const override; /// build precomputed table, possibly updating use_precomputed_table void precompute_table(); @@ -105,129 +81,19 @@ struct IndexIVFPQFastScan : IndexIVF { uint8_t* codes, bool include_listno = false) const override; - void search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const BitsetView bitset = nullptr) const override; - - void search_thread_safe( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const size_t nprobe, - const BitsetView bitset = nullptr) const; - - void range_search_thread_safe( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - const size_t nprobe, - const BitsetView bitset = nullptr) const; - // prepare look-up tables + bool lookup_table_is_3d() const override; + void compute_LUT( size_t n, const float* x, const idx_t* coarse_ids, const float* coarse_dis, AlignedTable& dis_tables, - AlignedTable& biases) const; + AlignedTable& biases) const override; - void compute_LUT_uint8( - size_t n, - const float* x, - const idx_t* coarse_ids, - const float* coarse_dis, - AlignedTable& dis_tables, - AlignedTable& biases, - float* normalizers) const; - - // internal search funcs - - template - void search_dispatch_implem( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const IVFSearchParameters* params = nullptr, - const BitsetView bitset = nullptr) const; - - template - void range_search_dispatch_implem( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - const IVFSearchParameters* params = nullptr, - const BitsetView bitset = nullptr) const; - - template - void search_implem_1( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - idx_t nprobe, - const BitsetView bitset = nullptr) const; - - template - void search_implem_2( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, 
- idx_t nprobe, - const BitsetView bitset = nullptr) const; - - // implem 10 and 12 are not multithreaded internally, so - // export search stats - template - void search_implem_10( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl, - size_t* ndis_out, - size_t* nlist_out, - idx_t nprobe, - const BitsetView bitset = nullptr) const; - - template - void search_implem_12( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl, - size_t* ndis_out, - size_t* nlist_out, - idx_t nprobe, - const BitsetView bitset = nullptr) const; - - template - void range_search_implem_12( - idx_t n, - const float* x, - float radius, - RangeSearchResult* result, - size_t* ndis_out, - size_t* nlist_out, - idx_t nprobe, - const BitsetView bitset = nullptr) const; + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexIVFPQR.cpp b/thirdparty/faiss/faiss/IndexIVFPQR.cpp index 596ded87c..c9d81f9c0 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQR.cpp +++ b/thirdparty/faiss/faiss/IndexIVFPQR.cpp @@ -36,10 +36,12 @@ IndexIVFPQR::IndexIVFPQR( refine_pq(d, M_refine, nbits_per_idx_refine), k_factor(4) { by_residual = true; + refine_pq.cp.max_points_per_centroid = 1000; } IndexIVFPQR::IndexIVFPQR() : k_factor(1) { by_residual = true; + refine_pq.cp.max_points_per_centroid = 1000; } void IndexIVFPQR::reset() { @@ -47,24 +49,39 @@ void IndexIVFPQR::reset() { refine_codes.clear(); } -void IndexIVFPQR::train_residual(idx_t n, const float* x) { - float* residual_2 = new float[n * d]; - ScopeDeleter del(residual_2); - - train_residual_o(n, x, residual_2); - - if (verbose) +void IndexIVFPQR::train_encoder(idx_t n, const float* x, const idx_t* assign) { + IndexIVFPQ::train_encoder(n, x, assign); + if (verbose) { printf("training %zdx%zd 2nd level PQ quantizer on %" PRId64 " %dD-vectors\n", refine_pq.M, refine_pq.ksub, n, d); - - refine_pq.cp.max_points_per_centroid = 1000; + } refine_pq.cp.verbose = verbose; - refine_pq.train(n, residual_2); + // 2nd level residual + std::vector residual_2(n * d); + std::vector train_codes(pq.code_size * n); + pq.compute_codes(x, train_codes.data(), n); + + for (idx_t i = 0; i < n; i++) { + const float* xx = x + i * d; + float* res = residual_2.data() + i * d; + pq.decode(train_codes.data() + i * pq.code_size, res); + for (int j = 0; j < d; j++) { + res[j] = xx[j] - res[j]; + } + } + + refine_pq.train(n, residual_2.data()); +} + +idx_t IndexIVFPQR::train_encoder_num_vectors() const { + return std::max( + pq.cp.max_points_per_centroid * pq.ksub, + refine_pq.cp.max_points_per_centroid * refine_pq.ksub); } void IndexIVFPQR::add_with_ids(idx_t n, const float* x, const idx_t* xids) { @@ -102,8 +119,7 @@ void IndexIVFPQR::search_preassigned( idx_t* labels, bool store_pairs, const IVFSearchParameters* params, - IndexIVFStats* stats, - const BitsetView bitset) const { + IndexIVFStats* stats) const { uint64_t t0; TIC; size_t k_coarse = long(k * k_factor); @@ -122,9 +138,7 @@ void IndexIVFPQR::search_preassigned( coarse_distances, coarse_labels, true, - params, - stats, - bitset); + params); } indexIVFPQ_stats.search_cycles += TOC; @@ -206,11 +220,11 @@ void IndexIVFPQR::reconstruct_from_offset( } } -void IndexIVFPQR::merge_from(IndexIVF& other_in, idx_t add_id) { - IndexIVFPQR* other = dynamic_cast(&other_in); +void IndexIVFPQR::merge_from(Index& otherIndex, idx_t add_id) { + IndexIVFPQR* other = dynamic_cast(&otherIndex); FAISS_THROW_IF_NOT(other); - 
IndexIVF::merge_from(other_in, add_id); + IndexIVF::merge_from(otherIndex, add_id); refine_codes.insert( refine_codes.end(), diff --git a/thirdparty/faiss/faiss/IndexIVFPQR.h b/thirdparty/faiss/faiss/IndexIVFPQR.h index 85f403ad4..1002524d6 100644 --- a/thirdparty/faiss/faiss/IndexIVFPQR.h +++ b/thirdparty/faiss/faiss/IndexIVFPQR.h @@ -37,7 +37,9 @@ struct IndexIVFPQR : IndexIVFPQ { size_t remove_ids(const IDSelector& sel) override; /// trains the two product quantizers - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void add_with_ids(idx_t n, const float* x, const idx_t* xids) override; @@ -52,7 +54,7 @@ struct IndexIVFPQR : IndexIVFPQ { void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; - void merge_from(IndexIVF& other, idx_t add_id) override; + void merge_from(Index& otherIndex, idx_t add_id) override; void search_preassigned( idx_t n, @@ -64,8 +66,7 @@ struct IndexIVFPQR : IndexIVFPQ { idx_t* labels, bool store_pairs, const IVFSearchParameters* params = nullptr, - IndexIVFStats* stats = nullptr, - const BitsetView bitset = nullptr) const override; + IndexIVFStats* stats = nullptr) const override; IndexIVFPQR(); }; diff --git a/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp b/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp index 64ba0c045..e5d9a8e82 100644 --- a/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp +++ b/thirdparty/faiss/faiss/IndexIVFSpectralHash.cpp @@ -9,8 +9,8 @@ #include -#include #include +#include #include #include @@ -31,22 +31,17 @@ IndexIVFSpectralHash::IndexIVFSpectralHash( float period) : IndexIVF(quantizer, d, nlist, (nbit + 7) / 8, METRIC_L2), nbit(nbit), - period(period), - threshold_type(Thresh_global) { + period(period) { RandomRotationMatrix* rr = new RandomRotationMatrix(d, nbit); rr->init(1234); vt = rr; - own_fields = true; is_trained = false; + by_residual = false; } -IndexIVFSpectralHash::IndexIVFSpectralHash() - : IndexIVF(), - vt(nullptr), - own_fields(false), - nbit(0), - period(0), - threshold_type(Thresh_global) {} +IndexIVFSpectralHash::IndexIVFSpectralHash() : IndexIVF() { + by_residual = false; +} IndexIVFSpectralHash::~IndexIVFSpectralHash() { if (own_fields) { @@ -67,10 +62,14 @@ float median(size_t n, float* x) { } // namespace -void IndexIVFSpectralHash::train_residual(idx_t n, const float* x) { +void IndexIVFSpectralHash::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { if (!vt->is_trained) { vt->train(n, x); } + FAISS_THROW_IF_NOT(!by_residual); if (threshold_type == Thresh_global) { // nothing to do @@ -167,6 +166,7 @@ void IndexIVFSpectralHash::encode_vectors( uint8_t* codes, bool include_listnos) const { FAISS_THROW_IF_NOT(is_trained); + FAISS_THROW_IF_NOT(!by_residual); float freq = 2.0 / period; size_t coarse_size = include_listnos ? 
coarse_code_size() : 0; @@ -213,9 +213,7 @@ struct IVFScanner : InvertedListScanner { std::vector qcode; HammingComputer hc; - using idx_t = Index::idx_t; - - IVFScanner(const IndexIVFSpectralHash* index, bool store_pairs) + IVFScanner(const IndexIVFSpectralHash* index, bool store_pairs, const IDSelector* sel = nullptr) : index(index), nbit(index->nbit), period(index->period), @@ -225,6 +223,7 @@ struct IVFScanner : InvertedListScanner { qcode(index->code_size), hc(qcode.data(), index->code_size) { this->store_pairs = store_pairs; + this->sel = sel; this->code_size = index->code_size; } @@ -259,11 +258,10 @@ struct IVFScanner : InvertedListScanner { const idx_t* ids, float* simi, idx_t* idxi, - size_t k, - const BitsetView bitset) const override { + size_t k) const override { size_t nup = 0; for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { + if (!sel || sel->is_member(ids[j])) { float dis = hc.compute(codes); if (dis < simi[0]) { @@ -283,10 +281,9 @@ struct IVFScanner : InvertedListScanner { const float* code_norms, const idx_t* ids, float radius, - RangeQueryResult& res, - const BitsetView bitset) const override { + RangeQueryResult& res) const override { for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { + if (!sel || sel->is_member(ids[j])) { float dis = hc.compute(codes); if (dis < radius) { int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; @@ -298,24 +295,23 @@ struct IVFScanner : InvertedListScanner { } }; +struct BuildScanner { + using T = InvertedListScanner*; + + template + static T f(const IndexIVFSpectralHash* index, bool store_pairs, const IDSelector* sel) { + return new IVFScanner(index, store_pairs, sel); + } +}; + } // anonymous namespace InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner( - bool store_pairs) const { - switch (code_size) { -#define HANDLE_CODE_SIZE(cs) \ - case cs: \ - return new IVFScanner(this, store_pairs) - HANDLE_CODE_SIZE(4); - HANDLE_CODE_SIZE(8); - HANDLE_CODE_SIZE(16); - HANDLE_CODE_SIZE(20); - HANDLE_CODE_SIZE(32); - HANDLE_CODE_SIZE(64); -#undef HANDLE_CODE_SIZE - default: - return new IVFScanner(this, store_pairs); - } + bool store_pairs, + const IDSelector* sel) const { + FAISS_THROW_IF_NOT(!sel); + BuildScanner bs; + return dispatch_HammingComputer(code_size, bs, this, store_pairs, sel); } void IndexIVFSpectralHash::replace_vt(VectorTransform* vt_in, bool own) { diff --git a/thirdparty/faiss/faiss/IndexIVFSpectralHash.h b/thirdparty/faiss/faiss/IndexIVFSpectralHash.h index 50da7a915..ae7df58e4 100644 --- a/thirdparty/faiss/faiss/IndexIVFSpectralHash.h +++ b/thirdparty/faiss/faiss/IndexIVFSpectralHash.h @@ -30,14 +30,14 @@ struct IndexPreTransform; */ struct IndexIVFSpectralHash : IndexIVF { /// transformation from d to nbit dim - VectorTransform* vt; + VectorTransform* vt = nullptr; /// own the vt - bool own_fields; + bool own_fields = true; /// nb of bits of the binary signature - int nbit; + int nbit = 0; /// interval size for 0s and 1s - float period; + float period = 0; enum ThresholdType { Thresh_global, ///< global threshold at 0 @@ -45,7 +45,7 @@ struct IndexIVFSpectralHash : IndexIVF { Thresh_centroid_half, ///< central interval around centroid Thresh_median ///< median of training set }; - ThresholdType threshold_type; + ThresholdType threshold_type = Thresh_global; /// Trained threshold. 
/// size nlist * nbit or 0 if Thresh_global @@ -60,7 +60,7 @@ struct IndexIVFSpectralHash : IndexIVF { IndexIVFSpectralHash(); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; void encode_vectors( idx_t n, @@ -70,7 +70,8 @@ struct IndexIVFSpectralHash : IndexIVF { bool include_listnos = false) const override; InvertedListScanner* get_InvertedListScanner( - bool store_pairs) const override; + bool store_pairs, + const IDSelector* sel) const override; /** replace the vector transform for an empty (and possibly untrained) index */ diff --git a/thirdparty/faiss/faiss/IndexIVFThreadSafe.cpp b/thirdparty/faiss/faiss/IndexIVFThreadSafe.cpp deleted file mode 100644 index a272ebdfe..000000000 --- a/thirdparty/faiss/faiss/IndexIVFThreadSafe.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright (C) 2019-2023 Zilliz. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); you may not -// use this file except in compliance with the License. You may obtain a copy of -// the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations under -// the License - -#include - -#include - -#include -#include -#include -#include -namespace faiss { - -namespace { -IVFSearchParameters gen_search_param( - const size_t& nprobe, - const int parallel_mode, - const size_t& max_codes) { - IVFSearchParameters params; - params.nprobe = nprobe; - params.max_codes = max_codes; - params.parallel_mode = parallel_mode; - return params; -} -} // namespace - -void IndexIVF::search_thread_safe( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const size_t nprobe, - const size_t max_codes, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); - const size_t final_nprobe = std::min(nlist, nprobe); - FAISS_THROW_IF_NOT(final_nprobe > 0); - IVFSearchParameters params = gen_search_param(final_nprobe, 0, max_codes); - - // search function for a subset of queries - auto sub_search_func = [this, k, final_nprobe, bitset, ¶ms]( - idx_t n, - const float* x, - float* distances, - idx_t* labels, - IndexIVFStats* ivf_stats) { - std::unique_ptr idx(new idx_t[n * final_nprobe]); - std::unique_ptr coarse_dis(new float[n * final_nprobe]); - - double t0 = getmillisecs(); - quantizer->search(n, x, final_nprobe, coarse_dis.get(), idx.get()); - - double t1 = getmillisecs(); - invlists->prefetch_lists(idx.get(), n * final_nprobe); - - search_preassigned( - n, - x, - k, - idx.get(), - coarse_dis.get(), - distances, - labels, - false, - ¶ms, - ivf_stats, - bitset); - double t2 = getmillisecs(); - ivf_stats->quantization_time += t1 - t0; - ivf_stats->search_time += t2 - t0; - }; - - if ((parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT) == 0) { - int nt = std::min(omp_get_max_threads(), int(n)); - std::vector stats(nt); - std::mutex exception_mutex; - std::string exception_string; - -#pragma omp parallel for if (nt > 1) - for (idx_t slice = 0; slice < nt; slice++) { - IndexIVFStats local_stats; - idx_t i0 = n * slice / nt; - idx_t i1 = n * (slice + 1) / nt; - if (i1 > i0) { - try { - sub_search_func( - i1 - i0, - x + i0 * d, - distances + i0 * k, - labels + i0 * k, - &stats[slice]); - } catch (const 
std::exception& e) { - std::lock_guard lock(exception_mutex); - exception_string = e.what(); - } - } - } - - if (!exception_string.empty()) { - FAISS_THROW_MSG(exception_string.c_str()); - } - - // collect stats - for (idx_t slice = 0; slice < nt; slice++) { - indexIVF_stats.add(stats[slice]); - } - } else { - // handle paralellization at level below (or don't run in parallel at - // all) - sub_search_func(n, x, distances, labels, &indexIVF_stats); - } -} - -void IndexIVF::range_search_thread_safe( - idx_t nx, - const float* x, - float radius, - RangeSearchResult* result, - const size_t nprobe, - const size_t max_codes, - const BitsetView bitset) const { - const size_t final_nprobe = std::min(nlist, nprobe); - std::unique_ptr keys(new idx_t[nx * final_nprobe]); - std::unique_ptr coarse_dis(new float[nx * final_nprobe]); - - double t0 = getmillisecs(); - quantizer->search(nx, x, final_nprobe, coarse_dis.get(), keys.get()); - indexIVF_stats.quantization_time += getmillisecs() - t0; - - t0 = getmillisecs(); - invlists->prefetch_lists(keys.get(), nx * final_nprobe); - - IVFSearchParameters params = gen_search_param(final_nprobe, 0, max_codes); - - range_search_preassigned( - nx, - x, - radius, - keys.get(), - coarse_dis.get(), - result, - false, - ¶ms, - &indexIVF_stats, - bitset); - - indexIVF_stats.search_time += getmillisecs() - t0; -} - -} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexLSH.cpp b/thirdparty/faiss/faiss/IndexLSH.cpp index cb18660cf..840e4e310 100644 --- a/thirdparty/faiss/faiss/IndexLSH.cpp +++ b/thirdparty/faiss/faiss/IndexLSH.cpp @@ -57,7 +57,7 @@ const float* IndexLSH::apply_preprocess(idx_t n, const float* x) const { } if (train_thresholds) { - if (xt == NULL) { + if (xt == nullptr) { xt = new float[nbits * n]; memcpy(xt, x, sizeof(*x) * n * nbits); } @@ -105,9 +105,10 @@ void IndexLSH::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + // FAISS_THROW_IF_NOT_MSG( + // !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); const float* xt = apply_preprocess(n, x); ScopeDeleter del(xt == x ? nullptr : xt); @@ -122,8 +123,14 @@ void IndexLSH::search( int_maxheap_array_t res = {size_t(n), size_t(k), labels, idistances}; - binary_knn_hc(faiss::METRIC_Hamming, &res, (const uint8_t*)&qcodes, - codes.data(), ntotal, code_size, bitset); + binary_knn_hc( + faiss::METRIC_Hamming, + &res, + qcodes, + codes.data(), + ntotal, + code_size, + (params == nullptr) ? 
nullptr : params->sel); // convert distances to floats for (int i = 0; i < k * n; i++) diff --git a/thirdparty/faiss/faiss/IndexLSH.h b/thirdparty/faiss/faiss/IndexLSH.h index 5600939e2..bba0f9748 100644 --- a/thirdparty/faiss/faiss/IndexLSH.h +++ b/thirdparty/faiss/faiss/IndexLSH.h @@ -50,7 +50,7 @@ struct IndexLSH : IndexFlatCodes { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /// transfer the thresholds to a pre-processing stage (and unset /// train_thresholds) diff --git a/thirdparty/faiss/faiss/IndexLattice.cpp b/thirdparty/faiss/faiss/IndexLattice.cpp index de64b4be7..ffa294aa9 100644 --- a/thirdparty/faiss/faiss/IndexLattice.cpp +++ b/thirdparty/faiss/faiss/IndexLattice.cpp @@ -7,12 +7,13 @@ // -*- c++ -*- -#include #include #include #include #include // for the bitstring routines +#include "simd/hook.h" + namespace faiss { IndexLattice::IndexLattice(idx_t d, int nsq, int scale_nbit, int r2) @@ -120,7 +121,12 @@ void IndexLattice::add(idx_t, const float*) { } void IndexLattice::search( - idx_t, const float*, idx_t, float*, idx_t*, const BitsetView) const { + idx_t, + const float*, + idx_t, + float*, + idx_t*, + const SearchParameters*) const { FAISS_THROW_MSG("not implemented"); } diff --git a/thirdparty/faiss/faiss/IndexLattice.h b/thirdparty/faiss/faiss/IndexLattice.h index 0bfb0ed2f..a9eb62b6d 100644 --- a/thirdparty/faiss/faiss/IndexLattice.h +++ b/thirdparty/faiss/faiss/IndexLattice.h @@ -55,7 +55,8 @@ struct IndexLattice : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; + void reset() override; }; diff --git a/thirdparty/faiss/faiss/IndexNNDescent.cpp b/thirdparty/faiss/faiss/IndexNNDescent.cpp index 629ce1e30..8cdc0c4ab 100644 --- a/thirdparty/faiss/faiss/IndexNNDescent.cpp +++ b/thirdparty/faiss/faiss/IndexNNDescent.cpp @@ -50,7 +50,6 @@ int sgemm_( namespace faiss { -using idx_t = Index::idx_t; using storage_idx_t = NNDescent::storage_idx_t; /************************************************************** @@ -89,7 +88,7 @@ struct NegativeDistanceComputer : DistanceComputer { }; DistanceComputer* storage_distance_computer(const Index* storage) { - if (storage->metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); } else { return storage->get_distance_computer(); @@ -136,9 +135,9 @@ void IndexNNDescent::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const - -{ + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT_MSG( storage, "Please use IndexNNDescentFlat (or variants) " diff --git a/thirdparty/faiss/faiss/IndexNNDescent.h b/thirdparty/faiss/faiss/IndexNNDescent.h index ee4d922f8..9b2532054 100644 --- a/thirdparty/faiss/faiss/IndexNNDescent.h +++ b/thirdparty/faiss/faiss/IndexNNDescent.h @@ -25,7 +25,6 @@ struct IndexNNDescent : Index { using storage_idx_t = NNDescent::storage_idx_t; /// Faiss results are 64-bit - using idx_t = Index::idx_t; // the link strcuture NNDescent nndescent; @@ -54,7 +53,7 @@ struct IndexNNDescent : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, float* recons) 
const override; diff --git a/thirdparty/faiss/faiss/IndexNSG.cpp b/thirdparty/faiss/faiss/IndexNSG.cpp index 08ab35c37..23710c4f5 100644 --- a/thirdparty/faiss/faiss/IndexNSG.cpp +++ b/thirdparty/faiss/faiss/IndexNSG.cpp @@ -23,39 +23,22 @@ namespace faiss { -using idx_t = Index::idx_t; using namespace nsg; /************************************************************** * IndexNSG implementation **************************************************************/ -IndexNSG::IndexNSG(int d, int R, MetricType metric) - : Index(d, metric), - nsg(R), - own_fields(false), - storage(nullptr), - is_built(false), - GK(64), - build_type(0) { - nndescent_S = 10; - nndescent_R = 100; +IndexNSG::IndexNSG(int d, int R, MetricType metric) : Index(d, metric), nsg(R) { nndescent_L = GK + 50; - nndescent_iter = 10; } IndexNSG::IndexNSG(Index* storage, int R) : Index(storage->d, storage->metric_type), nsg(R), - own_fields(false), storage(storage), - is_built(false), - GK(64), build_type(1) { - nndescent_S = 10; - nndescent_R = 100; nndescent_L = GK + 50; - nndescent_iter = 10; } IndexNSG::~IndexNSG() { @@ -79,9 +62,9 @@ void IndexNSG::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const - -{ + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT_MSG( storage, "Please use IndexNSGFlat (or variants) instead of IndexNSG directly"); @@ -113,7 +96,7 @@ void IndexNSG::search( InterruptCallback::check(); } - if (metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(metric_type)) { // we need to revert the negated distances for (size_t i = 0; i < k * n; i++) { distances[i] = -distances[i]; @@ -166,7 +149,7 @@ void IndexNSG::add(idx_t n, const float* x) { FAISS_THROW_IF_NOT(ntotal == n); knng.resize(ntotal * (GK + 1)); - storage->assign(ntotal, x, knng.data()/*, GK + 1*/); + storage->assign(ntotal, x, knng.data(), GK + 1); // Remove itself // - For metric distance, we just need to remove the first neighbor @@ -299,4 +282,37 @@ IndexNSGFlat::IndexNSGFlat(int d, int R, MetricType metric) is_trained = true; } +/************************************************************** + * IndexNSGPQ implementation + **************************************************************/ + +IndexNSGPQ::IndexNSGPQ() = default; + +IndexNSGPQ::IndexNSGPQ(int d, int pq_m, int M, int pq_nbits) + : IndexNSG(new IndexPQ(d, pq_m, pq_nbits), M) { + own_fields = true; + is_trained = false; +} + +void IndexNSGPQ::train(idx_t n, const float* x) { + IndexNSG::train(n, x); + (dynamic_cast(storage))->pq.compute_sdc_table(); +} + +/************************************************************** + * IndexNSGSQ implementation + **************************************************************/ + +IndexNSGSQ::IndexNSGSQ( + int d, + ScalarQuantizer::QuantizerType qtype, + int M, + MetricType metric) + : IndexNSG(new IndexScalarQuantizer(d, qtype, metric), M) { + is_trained = false; + own_fields = true; +} + +IndexNSGSQ::IndexNSGSQ() = default; + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexNSG.h b/thirdparty/faiss/faiss/IndexNSG.h index 1ade79d50..172b10c98 100644 --- a/thirdparty/faiss/faiss/IndexNSG.h +++ b/thirdparty/faiss/faiss/IndexNSG.h @@ -13,6 +13,8 @@ #include #include +#include +#include #include #include @@ -26,25 +28,25 @@ struct IndexNSG : Index { NSG nsg; /// the sequential storage - bool own_fields; - Index* storage; + bool own_fields = false; + Index* storage = nullptr; /// the index is built or not - bool 
is_built; + bool is_built = false; /// K of KNN graph for building - int GK; + int GK = 64; /// indicate how to build a knn graph /// - 0: build NSG with brute force search /// - 1: build NSG with NNDescent - char build_type; + char build_type = 0; /// parameters for nndescent - int nndescent_S; - int nndescent_R; - int nndescent_L; - int nndescent_iter; + int nndescent_S = 10; + int nndescent_R = 100; + int nndescent_L; // set to GK + 50 + int nndescent_iter = 10; explicit IndexNSG(int d = 0, int R = 32, MetricType metric = METRIC_L2); explicit IndexNSG(Index* storage, int R = 32); @@ -65,7 +67,7 @@ struct IndexNSG : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, float* recons) const override; @@ -83,4 +85,25 @@ struct IndexNSGFlat : IndexNSG { IndexNSGFlat(int d, int R, MetricType metric = METRIC_L2); }; +/** PQ index topped with with a NSG structure to access elements + * more efficiently. + */ +struct IndexNSGPQ : IndexNSG { + IndexNSGPQ(); + IndexNSGPQ(int d, int pq_m, int M, int pq_nbits = 8); + void train(idx_t n, const float* x) override; +}; + +/** SQ index topped with with a NSG structure to access elements + * more efficiently. + */ +struct IndexNSGSQ : IndexNSG { + IndexNSGSQ(); + IndexNSGSQ( + int d, + ScalarQuantizer::QuantizerType qtype, + int M, + MetricType metric = METRIC_L2); +}; + } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexPQ.cpp b/thirdparty/faiss/faiss/IndexPQ.cpp index 5b84c8965..3a5fe6042 100644 --- a/thirdparty/faiss/faiss/IndexPQ.cpp +++ b/thirdparty/faiss/faiss/IndexPQ.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -17,11 +15,12 @@ #include -#include +#include #include -#include #include +#include + namespace faiss { /********************************************************* @@ -71,31 +70,28 @@ void IndexPQ::train(idx_t n, const float* x) { is_trained = true; } +size_t IndexPQ::cal_size() const { + return codes.size() * sizeof(uint8_t) + pq.cal_size(); +} + namespace { template -struct PQDistanceComputer : DistanceComputer { +struct PQDistanceComputer : FlatCodesDistanceComputer { size_t d; MetricType metric; - Index::idx_t nb; - const uint8_t* codes; - size_t code_size; + idx_t nb; const ProductQuantizer& pq; const float* sdc; std::vector precomputed_table; size_t ndis; - float operator()(idx_t i) override { - const uint8_t* code = codes + i * code_size; - const float* dt = precomputed_table.data(); - PQDecoder decoder(code, pq.nbits); - float accu = 0; - for (int j = 0; j < pq.M; j++) { - accu += dt[decoder.decode()]; - dt += 1 << decoder.nbits; - } + float distance_to_code(const uint8_t* code) final { ndis++; - return accu; + + float dis = distance_single_code( + pq.M, pq.nbits, precomputed_table.data(), code); + return dis; } float symmetric_dis(idx_t i, idx_t j) override { @@ -113,13 +109,15 @@ struct PQDistanceComputer : DistanceComputer { return accu; } - explicit PQDistanceComputer(const IndexPQ& storage) : pq(storage.pq) { + explicit PQDistanceComputer(const IndexPQ& storage) + : FlatCodesDistanceComputer( + storage.codes.data(), + storage.code_size), + pq(storage.pq) { precomputed_table.resize(pq.M * pq.ksub); nb = storage.ntotal; d = storage.d; metric = storage.metric_type; - codes = storage.codes.data(); - code_size = pq.code_size; if (pq.sdc_table.size() == pq.ksub * pq.ksub * pq.M) { sdc = pq.sdc_table.data(); } else 
{ @@ -139,7 +137,7 @@ struct PQDistanceComputer : DistanceComputer { } // namespace -DistanceComputer* IndexPQ::get_distance_computer() const { +FlatCodesDistanceComputer* IndexPQ::get_FlatCodesDistanceComputer() const { if (pq.nbits == 8) { return new PQDistanceComputer(*this); } else if (pq.nbits == 16) { @@ -159,10 +157,20 @@ void IndexPQ::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* iparams) const { FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); + + const SearchParametersPQ* params = nullptr; + Search_type_t search_type = this->search_type; + + if (iparams) { + params = dynamic_cast(iparams); + FAISS_THROW_IF_NOT_MSG(params, "invalid search params"); + FAISS_THROW_IF_NOT_MSG(!params->sel, "selector not supported"); + search_type = params->search_type; + } + if (search_type == ST_PQ) { // Simple PQ search if (metric_type == METRIC_L2) { @@ -181,8 +189,16 @@ void IndexPQ::search( search_type == ST_polysemous || search_type == ST_polysemous_generalize) { FAISS_THROW_IF_NOT(metric_type == METRIC_L2); - - search_core_polysemous(n, x, k, distances, labels); + int polysemous_ht = + params ? params->polysemous_ht : this->polysemous_ht; + search_core_polysemous( + n, + x, + k, + distances, + labels, + polysemous_ht, + search_type == ST_polysemous_generalize); } else { // code-to-code distances @@ -217,14 +233,15 @@ void IndexPQ::search( size_t(n), size_t(k), labels, idistances}; if (search_type == ST_HE) { - binary_knn_hc( - METRIC_Hamming, + // todo aguzhva: baseline knowhere had bitset here (binary_knn_hc call) + hammings_knn_hc( &res, - (const uint8_t*)q_codes, + q_codes, codes.data(), ntotal, pq.code_size, - bitset); + true); + } else if (search_type == ST_generalized_HE) { generalized_hammings_knn_hc( &res, @@ -251,21 +268,23 @@ void IndexPQStats::reset() { IndexPQStats indexPQ_stats; +namespace { + template -static size_t polysemous_inner_loop( - const IndexPQ& index, +size_t polysemous_inner_loop( + const IndexPQ* index, const float* dis_table_qi, const uint8_t* q_code, size_t k, float* heap_dis, - int64_t* heap_ids) { - int M = index.pq.M; - int code_size = index.pq.code_size; - int ksub = index.pq.ksub; - size_t ntotal = index.ntotal; - int ht = index.polysemous_ht; + int64_t* heap_ids, + int ht) { + int M = index->pq.M; + int code_size = index->pq.code_size; + int ksub = index->pq.ksub; + size_t ntotal = index->ntotal; - const uint8_t* b_code = index.codes.data(); + const uint8_t* b_code = index->codes.data(); size_t n_pass_i = 0; @@ -293,16 +312,31 @@ static size_t polysemous_inner_loop( return n_pass_i; } +struct Run_polysemous_inner_loop { + using T = size_t; + template + size_t f(Types... 
args) { + return polysemous_inner_loop(args...); + } +}; + +} // anonymous namespace + void IndexPQ::search_core_polysemous( idx_t n, const float* x, idx_t k, float* distances, - idx_t* labels) const { + idx_t* labels, + int polysemous_ht, + bool generalized_hamming) const { FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(pq.nbits == 8); + if (polysemous_ht == 0) { + polysemous_ht = pq.nbits * pq.M + 1; + } + // PQ distance tables float* dis_tables = new float[n * pq.ksub * pq.M]; ScopeDeleter del(dis_tables); @@ -325,7 +359,9 @@ void IndexPQ::search_core_polysemous( size_t n_pass = 0; -#pragma omp parallel for reduction(+ : n_pass) + int bad_code_size = 0; + +#pragma omp parallel for reduction(+ : n_pass, bad_code_size) for (idx_t qi = 0; qi < n; qi++) { const uint8_t* q_code = q_codes + qi * pq.code_size; @@ -335,78 +371,60 @@ void IndexPQ::search_core_polysemous( float* heap_dis = distances + qi * k; maxheap_heapify(k, heap_dis, heap_ids); - if (search_type == ST_polysemous) { - switch (pq.code_size) { - case 4: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - case 8: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - case 16: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - case 32: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - case 20: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - default: - if (pq.code_size % 4 == 0) { - n_pass += polysemous_inner_loop( - *this, - dis_table_qi, - q_code, - k, - heap_dis, - heap_ids); - } else { - FAISS_THROW_FMT( - "code size %zd not supported for polysemous", - pq.code_size); - } - break; - } - } else { + if (!generalized_hamming) { + Run_polysemous_inner_loop r; + n_pass += dispatch_HammingComputer( + pq.code_size, + r, + this, + dis_table_qi, + q_code, + k, + heap_dis, + heap_ids, + polysemous_ht); + + } else { // generalized hamming switch (pq.code_size) { - case 8: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - case 16: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; - case 32: - n_pass += polysemous_inner_loop( - *this, dis_table_qi, q_code, k, heap_dis, heap_ids); - break; +#define DISPATCH(cs) \ + case cs: \ + n_pass += polysemous_inner_loop( \ + this, \ + dis_table_qi, \ + q_code, \ + k, \ + heap_dis, \ + heap_ids, \ + polysemous_ht); \ + break; + DISPATCH(8) + DISPATCH(16) + DISPATCH(32) default: if (pq.code_size % 8 == 0) { n_pass += polysemous_inner_loop( - *this, + this, dis_table_qi, q_code, k, heap_dis, - heap_ids); + heap_ids, + polysemous_ht); } else { - FAISS_THROW_FMT( - "code size %zd not supported for polysemous", - pq.code_size); + bad_code_size++; } break; +#undef DISPATCH } } maxheap_reorder(k, heap_dis, heap_ids); } + if (bad_code_size) { + FAISS_THROW_FMT( + "code size %zd not supported for polysemous", pq.code_size); + } + indexPQ_stats.nq += n; indexPQ_stats.ncode += n * ntotal; indexPQ_stats.n_hamming_pass += n_pass; @@ -617,7 +635,7 @@ struct SemiSortedArray { int N; // type of the heap: CMax = sort ascending - typedef CMax HC; + using HC = CMax; std::vector perm; int k; // k elements are sorted @@ -711,7 +729,7 @@ struct MinSumK { * We use a heap to maintain a queue of sums, with the associated * terms involved in the sum. 
*/ - typedef CMin HC; + using HC = CMin; size_t heap_capacity, heap_size; T* bh_val; int64_t* bh_ids; @@ -805,7 +823,7 @@ struct MinSumK { // enqueue followers int64_t ii = ti; for (int m = 0; m < M; m++) { - int64_t n = ii & ((1L << nbit) - 1); + int64_t n = ii & (((int64_t)1 << nbit) - 1); ii >>= nbit; if (n + 1 >= N) continue; @@ -829,7 +847,7 @@ struct MinSumK { } int64_t ti = 0; for (int m = 0; m < M; m++) { - int64_t n = ii & ((1L << nbit) - 1); + int64_t n = ii & (((int64_t)1 << nbit) - 1); ti += int64_t(ssx[m].get_ord(n)) << (nbit * m); ii >>= nbit; } @@ -867,20 +885,25 @@ void MultiIndexQuantizer::train(idx_t n, const float* x) { ntotal *= pq.ksub; } +// block size used in MultiIndexQuantizer::search +int multi_index_quantizer_search_bs = 32768; + void MultiIndexQuantizer::search( idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { - if (n == 0) + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + if (n == 0) { return; - + } FAISS_THROW_IF_NOT(k > 0); // the allocation just below can be severe... - idx_t bs = 32768; + idx_t bs = multi_index_quantizer_search_bs; if (n > bs) { for (idx_t i0 = 0; i0 < n; i0 += bs) { idx_t i1 = std::min(i0 + bs, n); @@ -948,7 +971,7 @@ void MultiIndexQuantizer::search( void MultiIndexQuantizer::reconstruct(idx_t key, float* recons) const { int64_t jj = key; for (int m = 0; m < pq.M; m++) { - int64_t n = jj & ((1L << pq.nbits) - 1); + int64_t n = jj & (((int64_t)1 << pq.nbits) - 1); jj >>= pq.nbits; memcpy(recons, pq.get_centroids(m, n), sizeof(recons[0]) * pq.dsub); recons += pq.dsub; @@ -1016,9 +1039,13 @@ void MultiIndexQuantizer2::search( idx_t K, float* distances, idx_t* labels, - const BitsetView bitset) const { - if (n == 0) + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + + if (n == 0) { return; + } int k2 = std::min(K, int64_t(pq.ksub)); FAISS_THROW_IF_NOT(k2); @@ -1076,7 +1103,7 @@ void MultiIndexQuantizer2::search( const idx_t* idmap0 = sub_ids.data() + i * k2; int64_t ld_idmap = k2 * n; - int64_t mask1 = ksub - 1L; + int64_t mask1 = ksub - (int64_t)1; for (int k = 0; k < K; k++) { const idx_t* idmap = idmap0; diff --git a/thirdparty/faiss/faiss/IndexPQ.h b/thirdparty/faiss/faiss/IndexPQ.h index 5e65c00a6..32810a249 100644 --- a/thirdparty/faiss/faiss/IndexPQ.h +++ b/thirdparty/faiss/faiss/IndexPQ.h @@ -46,14 +46,14 @@ struct IndexPQ : IndexFlatCodes { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /* The standalone codec interface */ void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; - DistanceComputer* get_distance_computer() const override; + FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; /****************************************************** * Polysemous codes implementation @@ -88,7 +88,9 @@ struct IndexPQ : IndexFlatCodes { const float* x, idx_t k, float* distances, - idx_t* labels) const; + idx_t* labels, + int polysemous_ht, + bool generalized_hamming) const; /// prepare query for a polysemous search, but instead of /// computing the result, just get the histogram of Hamming @@ -109,9 +111,13 @@ struct IndexPQ : IndexFlatCodes { */ void hamming_distance_table(idx_t n, const float* x, int32_t* dis) const; - 
size_t cal_size() { - return codes.size() * sizeof(uint8_t) + pq.cal_size(); - } + size_t cal_size() const; +}; + +/// override search parameters from the class +struct SearchParametersPQ : SearchParameters { + IndexPQ::Search_type_t search_type; + int polysemous_ht; }; /// statistics are robust to internal threading, but not if @@ -148,7 +154,7 @@ struct MultiIndexQuantizer : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /// add and reset will crash at runtime void add(idx_t n, const float* x) override; @@ -159,6 +165,9 @@ struct MultiIndexQuantizer : Index { void reconstruct(idx_t key, float* recons) const override; }; +// block size used in MultiIndexQuantizer::search +FAISS_API extern int multi_index_quantizer_search_bs; + /** MultiIndexQuantizer where the PQ assignmnet is performed by sub-indexes */ struct MultiIndexQuantizer2 : MultiIndexQuantizer { @@ -182,7 +191,7 @@ struct MultiIndexQuantizer2 : MultiIndexQuantizer { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexPQFastScan.cpp b/thirdparty/faiss/faiss/IndexPQFastScan.cpp index 3da3c252e..b8a6cdbee 100644 --- a/thirdparty/faiss/faiss/IndexPQFastScan.cpp +++ b/thirdparty/faiss/faiss/IndexPQFastScan.cpp @@ -7,24 +7,18 @@ #include -#include #include +#include #include #include #include -#include -#include - #include -#include -#include +#include namespace faiss { -using namespace simd_result_handlers; - inline size_t roundup(size_t a, size_t b) { return (a + b - 1) / b * b; } @@ -35,37 +29,19 @@ IndexPQFastScan::IndexPQFastScan( size_t nbits, MetricType metric, int bbs) - : Index(d, metric), - pq(d, M, nbits), - bbs(bbs), - ntotal2(0), - M2(roundup(M, 2)) { - FAISS_THROW_IF_NOT(nbits == 4); - is_trained = false; + : pq(d, M, nbits) { + init_fastscan(d, M, nbits, metric, bbs); } -IndexPQFastScan::IndexPQFastScan() : bbs(0), ntotal2(0), M2(0) {} - -IndexPQFastScan::IndexPQFastScan(const IndexPQ& orig, int bbs) - : Index(orig.d, orig.metric_type), pq(orig.pq), bbs(bbs) { - FAISS_THROW_IF_NOT(orig.pq.nbits == 4); +IndexPQFastScan::IndexPQFastScan(const IndexPQ& orig, int bbs) : pq(orig.pq) { + init_fastscan(orig.d, pq.M, pq.nbits, orig.metric_type, bbs); ntotal = orig.ntotal; + ntotal2 = roundup(ntotal, bbs); is_trained = orig.is_trained; orig_codes = orig.codes.data(); - qbs = 0; // means use default - // pack the codes - - size_t M = pq.M; - - FAISS_THROW_IF_NOT(bbs % 32 == 0); - M2 = roundup(M, 2); - ntotal2 = roundup(ntotal, bbs); - codes.resize(ntotal2 * M2 / 2); - - // printf("M=%d M2=%d code_size=%d\n", M, M2, pq.code_size); pq4_pack_codes(orig.codes.data(), ntotal, M, ntotal2, bbs, M2, codes.get()); } @@ -77,427 +53,22 @@ void IndexPQFastScan::train(idx_t n, const float* x) { is_trained = true; } -void IndexPQFastScan::add(idx_t n, const float* x) { - FAISS_THROW_IF_NOT(is_trained); - AlignedTable tmp_codes(n * pq.code_size); - pq.compute_codes(x, tmp_codes.get(), n); - ntotal2 = roundup(ntotal + n, bbs); - size_t new_size = ntotal2 * M2 / 2; - size_t old_size = codes.size(); - if (new_size > old_size) { - codes.resize(new_size); - memset(codes.get() + old_size, 0, new_size - old_size); - } - pq4_pack_codes_range( - tmp_codes.get(), pq.M, ntotal, ntotal + n, bbs, M2, codes.get()); - ntotal += n; -} - -void IndexPQFastScan::reset() { - 
codes.resize(0); - ntotal = 0; +void IndexPQFastScan::compute_codes(uint8_t* codes, idx_t n, const float* x) + const { + pq.compute_codes(x, codes, n); } -namespace { - -// from impl/ProductQuantizer.cpp -template -void pq_estimators_from_tables_generic( - const ProductQuantizer& pq, - size_t nbits, - const uint8_t* codes, - size_t ncodes, - const dis_t* dis_table, - size_t k, - typename C::T* heap_dis, - int64_t* heap_ids) { - using accu_t = typename C::T; - const size_t M = pq.M; - const size_t ksub = pq.ksub; - for (size_t j = 0; j < ncodes; ++j) { - PQDecoderGeneric decoder(codes + j * pq.code_size, nbits); - accu_t dis = 0; - const dis_t* __restrict dt = dis_table; - for (size_t m = 0; m < M; m++) { - uint64_t c = decoder.decode(); - dis += dt[c]; - dt += ksub; - } - - if (C::cmp(heap_dis[0], dis)) { - heap_pop(k, heap_dis, heap_ids); - heap_push(k, heap_dis, heap_ids, dis, j); - } - } -} - -} // anonymous namespace - -using namespace quantize_lut; - -void IndexPQFastScan::compute_quantized_LUT( - idx_t n, - const float* x, - uint8_t* lut, - float* normalizers) const { - size_t dim12 = pq.ksub * pq.M; - std::unique_ptr dis_tables(new float[n * dim12]); +void IndexPQFastScan::compute_float_LUT(float* lut, idx_t n, const float* x) + const { if (metric_type == METRIC_L2) { - pq.compute_distance_tables(n, x, dis_tables.get()); + pq.compute_distance_tables(n, x, lut); } else { - pq.compute_inner_prod_tables(n, x, dis_tables.get()); - } - - for (uint64_t i = 0; i < n; i++) { - round_uint8_per_column( - dis_tables.get() + i * dim12, - pq.M, - pq.ksub, - &normalizers[2 * i], - &normalizers[2 * i + 1]); - } - - for (uint64_t i = 0; i < n; i++) { - const float* t_in = dis_tables.get() + i * dim12; - uint8_t* t_out = lut + i * M2 * pq.ksub; - - for (int j = 0; j < dim12; j++) { - t_out[j] = int(t_in[j]); - } - memset(t_out + dim12, 0, (M2 - pq.M) * pq.ksub); + pq.compute_inner_prod_tables(n, x, lut); } } -/****************************************************************************** - * Search driver routine - ******************************************************************************/ - -void IndexPQFastScan::search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); - - if (metric_type == METRIC_L2) { - search_dispatch_implem(n, x, k, distances, labels); - } else { - search_dispatch_implem(n, x, k, distances, labels); - } -} - -template -void IndexPQFastScan::search_dispatch_implem( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels) const { - using Cfloat = typename std::conditional< - is_max, - CMax, - CMin>::type; - - using C = typename std:: - conditional, CMin>::type; - - if (n == 0) { - return; - } - - // actual implementation used - int impl = implem; - - if (impl == 0) { - if (bbs == 32) { - impl = 12; - } else { - impl = 14; - } - if (k > 20) { - impl++; - } - } - - if (implem == 1) { - FAISS_THROW_IF_NOT(orig_codes); - FAISS_THROW_IF_NOT(is_max); - float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; - pq.search(x, n, orig_codes, ntotal, &res, true); - } else if (implem == 2 || implem == 3 || implem == 4) { - FAISS_THROW_IF_NOT(orig_codes); - - size_t dim12 = pq.ksub * pq.M; - std::unique_ptr dis_tables(new float[n * dim12]); - if (is_max) { - pq.compute_distance_tables(n, x, dis_tables.get()); - } else { - pq.compute_inner_prod_tables(n, x, dis_tables.get()); - } - - std::vector normalizers(n * 2); - - if (implem == 2) { - // default float - } else 
if (implem == 3 || implem == 4) { - for (uint64_t i = 0; i < n; i++) { - round_uint8_per_column( - dis_tables.get() + i * dim12, - pq.M, - pq.ksub, - &normalizers[2 * i], - &normalizers[2 * i + 1]); - } - } - - for (int64_t i = 0; i < n; i++) { - int64_t* heap_ids = labels + i * k; - float* heap_dis = distances + i * k; - - heap_heapify(k, heap_dis, heap_ids); - - pq_estimators_from_tables_generic( - pq, - pq.nbits, - orig_codes, - ntotal, - dis_tables.get() + i * dim12, - k, - heap_dis, - heap_ids); - - heap_reorder(k, heap_dis, heap_ids); - - if (implem == 4) { - float a = normalizers[2 * i]; - float b = normalizers[2 * i + 1]; - - for (int j = 0; j < k; j++) { - heap_dis[j] = heap_dis[j] / a + b; - } - } - } - } else if (impl >= 12 && impl <= 15) { - FAISS_THROW_IF_NOT(ntotal < INT_MAX); - int nt = std::min(omp_get_max_threads(), int(n)); - if (nt < 2) { - if (impl == 12 || impl == 13) { - search_implem_12(n, x, k, distances, labels, impl); - } else { - search_implem_14(n, x, k, distances, labels, impl); - } - } else { - // explicitly slice over threads -#pragma omp parallel for num_threads(nt) - for (int slice = 0; slice < nt; slice++) { - idx_t i0 = n * slice / nt; - idx_t i1 = n * (slice + 1) / nt; - float* dis_i = distances + i0 * k; - idx_t* lab_i = labels + i0 * k; - if (impl == 12 || impl == 13) { - search_implem_12( - i1 - i0, x + i0 * d, k, dis_i, lab_i, impl); - } else { - search_implem_14( - i1 - i0, x + i0 * d, k, dis_i, lab_i, impl); - } - } - } - } else { - FAISS_THROW_FMT("invalid implem %d impl=%d", implem, impl); - } -} - -template -void IndexPQFastScan::search_implem_12( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl) const { - FAISS_THROW_IF_NOT(bbs == 32); - - // handle qbs2 blocking by recursive call - int64_t qbs2 = this->qbs == 0 ? 
11 : pq4_qbs_to_nq(this->qbs); - if (n > qbs2) { - for (int64_t i0 = 0; i0 < n; i0 += qbs2) { - int64_t i1 = std::min(i0 + qbs2, n); - search_implem_12( - i1 - i0, - x + d * i0, - k, - distances + i0 * k, - labels + i0 * k, - impl); - } - return; - } - - size_t dim12 = pq.ksub * M2; - AlignedTable quantized_dis_tables(n * dim12); - std::unique_ptr normalizers(new float[2 * n]); - - if (skip & 1) { - quantized_dis_tables.clear(); - } else { - compute_quantized_LUT( - n, x, quantized_dis_tables.get(), normalizers.get()); - } - - AlignedTable LUT(n * dim12); - - // block sizes are encoded in qbs, 4 bits at a time - - // caution: we override an object field - int qbs = this->qbs; - - if (n != pq4_qbs_to_nq(qbs)) { - qbs = pq4_preferred_qbs(n); - } - - int LUT_nq = - pq4_pack_LUT_qbs(qbs, M2, quantized_dis_tables.get(), LUT.get()); - FAISS_THROW_IF_NOT(LUT_nq == n); - - if (k == 1) { - SingleResultHandler handler(n, ntotal); - if (skip & 4) { - // pass - } else { - handler.disable = bool(skip & 2); - pq4_accumulate_loop_qbs( - qbs, ntotal2, M2, codes.get(), LUT.get(), handler); - } - - handler.to_flat_arrays(distances, labels, normalizers.get()); - - } else if (impl == 12) { - std::vector tmp_dis(n * k); - std::vector tmp_ids(n * k); - - if (skip & 4) { - // skip - } else { - HeapHandler handler( - n, tmp_dis.data(), tmp_ids.data(), k, ntotal); - handler.disable = bool(skip & 2); - - pq4_accumulate_loop_qbs( - qbs, ntotal2, M2, codes.get(), LUT.get(), handler); - - if (!(skip & 8)) { - handler.to_flat_arrays(distances, labels, normalizers.get()); - } - } - - } else { // impl == 13 - - ReservoirHandler handler(n, ntotal, k, 2 * k); - handler.disable = bool(skip & 2); - - if (skip & 4) { - // skip - } else { - pq4_accumulate_loop_qbs( - qbs, ntotal2, M2, codes.get(), LUT.get(), handler); - } - - if (!(skip & 8)) { - handler.to_flat_arrays(distances, labels, normalizers.get()); - } - } -} - -template -void IndexPQFastScan::search_implem_14( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl) const { - FAISS_THROW_IF_NOT(bbs % 32 == 0); - - int qbs2 = qbs == 0 ? 
4 : qbs; - - // handle qbs2 blocking by recursive call - if (n > qbs2) { - for (int64_t i0 = 0; i0 < n; i0 += qbs2) { - int64_t i1 = std::min(i0 + qbs2, n); - search_implem_14( - i1 - i0, - x + d * i0, - k, - distances + i0 * k, - labels + i0 * k, - impl); - } - return; - } - - size_t dim12 = pq.ksub * M2; - AlignedTable quantized_dis_tables(n * dim12); - std::unique_ptr normalizers(new float[2 * n]); - - if (skip & 1) { - quantized_dis_tables.clear(); - } else { - compute_quantized_LUT( - n, x, quantized_dis_tables.get(), normalizers.get()); - } - - AlignedTable LUT(n * dim12); - pq4_pack_LUT(n, M2, quantized_dis_tables.get(), LUT.get()); - - if (k == 1) { - SingleResultHandler handler(n, ntotal); - if (skip & 4) { - // pass - } else { - handler.disable = bool(skip & 2); - pq4_accumulate_loop( - n, ntotal2, bbs, M2, codes.get(), LUT.get(), handler); - } - handler.to_flat_arrays(distances, labels, normalizers.get()); - - } else if (impl == 14) { - std::vector tmp_dis(n * k); - std::vector tmp_ids(n * k); - - if (skip & 4) { - // skip - } else if (k > 1) { - HeapHandler handler( - n, tmp_dis.data(), tmp_ids.data(), k, ntotal); - handler.disable = bool(skip & 2); - - pq4_accumulate_loop( - n, ntotal2, bbs, M2, codes.get(), LUT.get(), handler); - - if (!(skip & 8)) { - handler.to_flat_arrays(distances, labels, normalizers.get()); - } - } - - } else { // impl == 15 - - ReservoirHandler handler(n, ntotal, k, 2 * k); - handler.disable = bool(skip & 2); - - if (skip & 4) { - // skip - } else { - pq4_accumulate_loop( - n, ntotal2, bbs, M2, codes.get(), LUT.get(), handler); - } - - if (!(skip & 8)) { - handler.to_flat_arrays(distances, labels, normalizers.get()); - } - } +void IndexPQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x) const { + pq.decode(bytes, x, n); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexPQFastScan.h b/thirdparty/faiss/faiss/IndexPQFastScan.h index 501772d5c..220da378c 100644 --- a/thirdparty/faiss/faiss/IndexPQFastScan.h +++ b/thirdparty/faiss/faiss/IndexPQFastScan.h @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -25,27 +26,9 @@ namespace faiss { * 15: no qbs with reservoir accumulator */ -struct IndexPQFastScan : Index { +struct IndexPQFastScan : IndexFastScan { ProductQuantizer pq; - // implementation to select - int implem = 0; - // skip some parts of the computation (for timing) - int skip = 0; - - // size of the kernel - int bbs; // set at build time - int qbs = 0; // query block size 0 = use default - - // packed version of the codes - size_t ntotal2; - size_t M2; - - AlignedTable codes; - - // this is for testing purposes only (set when initialized by IndexPQ) - const uint8_t* orig_codes = nullptr; - IndexPQFastScan( int d, size_t M, @@ -53,62 +36,27 @@ struct IndexPQFastScan : Index { MetricType metric = METRIC_L2, int bbs = 32); - IndexPQFastScan(); + IndexPQFastScan() = default; /// build from an existing IndexPQ explicit IndexPQFastScan(const IndexPQ& orig, int bbs = 32); void train(idx_t n, const float* x) override; - void add(idx_t n, const float* x) override; - void reset() override; - void search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const BitsetView bitset = nullptr) const override; - - // called by search function - void compute_quantized_LUT( - idx_t n, - const float* x, - uint8_t* lut, - float* normalizers) const; - - template - void search_dispatch_implem( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels) const; - template - void 
search_implem_2( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels) const; + void compute_codes(uint8_t* codes, idx_t n, const float* x) const override; - template - void search_implem_12( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl) const; + void compute_float_LUT(float* lut, idx_t n, const float* x) const override; - template - void search_implem_14( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - int impl) const; + /** Decode a set of vectors. + * + * NOTE: The codes in the IndexPQFastScan object are non-contiguous. + * But this method requires a contiguous representation. + * + * @param n number of vectors + * @param bytes input encoded vectors, size n * code_size + * @param x output vectors, size n * d + */ + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexPreTransform.cpp b/thirdparty/faiss/faiss/IndexPreTransform.cpp index 1110cb189..cde857c8e 100644 --- a/thirdparty/faiss/faiss/IndexPreTransform.cpp +++ b/thirdparty/faiss/faiss/IndexPreTransform.cpp @@ -15,6 +15,7 @@ #include #include +#include #include namespace faiss { @@ -140,9 +141,8 @@ void IndexPreTransform::reverse_chain(idx_t n, const float* xt, float* x) void IndexPreTransform::add(idx_t n, const float* x) { FAISS_THROW_IF_NOT(is_trained); - const float* xt = apply_chain(n, x); - ScopeDeleter del(xt == x ? nullptr : xt); - index->add(n, xt); + TransformedVectors tv(x, apply_chain(n, x)); + index->add(n, tv.x); ntotal = index->ntotal; } @@ -151,25 +151,34 @@ void IndexPreTransform::add_with_ids( const float* x, const idx_t* xids) { FAISS_THROW_IF_NOT(is_trained); - const float* xt = apply_chain(n, x); - ScopeDeleter del(xt == x ? nullptr : xt); - index->add_with_ids(n, xt, xids); + TransformedVectors tv(x, apply_chain(n, x)); + index->add_with_ids(n, tv.x, xids); ntotal = index->ntotal; } +namespace { + +const SearchParameters* extract_index_search_params( + const SearchParameters* params_in) { + auto params = dynamic_cast(params_in); + return params ? params->index_params : params_in; +} + +} // namespace + void IndexPreTransform::search( idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); const float* xt = apply_chain(n, x); ScopeDeleter del(xt == x ? nullptr : xt); - index->search(n, xt, k, distances, labels); + index->search( + n, xt, k, distances, labels, extract_index_search_params(params)); } void IndexPreTransform::range_search( @@ -177,11 +186,11 @@ void IndexPreTransform::range_search( const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { + const SearchParameters* params) const { FAISS_THROW_IF_NOT(is_trained); - const float* xt = apply_chain(n, x); - ScopeDeleter del(xt == x ? nullptr : xt); - index->range_search(n, xt, radius, result); + TransformedVectors tv(x, apply_chain(n, x)); + index->range_search( + n, tv.x, radius, result, extract_index_search_params(params)); } void IndexPreTransform::reset() { @@ -221,17 +230,23 @@ void IndexPreTransform::search_and_reconstruct( idx_t k, float* distances, idx_t* labels, - float* recons) const { + float* recons, + const SearchParameters* params) const { FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); - const float* xt = apply_chain(n, x); - ScopeDeleter del((xt == x) ? 
nullptr : xt); + TransformedVectors trans(x, apply_chain(n, x)); float* recons_temp = chain.empty() ? recons : new float[n * k * index->d]; ScopeDeleter del2((recons_temp == recons) ? nullptr : recons_temp); - index->search_and_reconstruct(n, xt, k, distances, labels, recons_temp); + index->search_and_reconstruct( + n, + trans.x, + k, + distances, + labels, + recons_temp, + extract_index_search_params(params)); // Revert transformations from last to first reverse_chain(n * k, recons_temp, recons); @@ -243,13 +258,8 @@ size_t IndexPreTransform::sa_code_size() const { void IndexPreTransform::sa_encode(idx_t n, const float* x, uint8_t* bytes) const { - if (chain.empty()) { - index->sa_encode(n, x, bytes); - } else { - const float* xt = apply_chain(n, x); - ScopeDeleter del(xt == x ? nullptr : xt); - index->sa_encode(n, xt, bytes); - } + TransformedVectors tv(x, apply_chain(n, x)); + index->sa_encode(n, tv.x, bytes); } void IndexPreTransform::sa_decode(idx_t n, const uint8_t* bytes, float* x) @@ -264,6 +274,24 @@ void IndexPreTransform::sa_decode(idx_t n, const uint8_t* bytes, float* x) } } +void IndexPreTransform::merge_from(Index& otherIndex, idx_t add_id) { + check_compatible_for_merge(otherIndex); + auto other = static_cast(&otherIndex); + index->merge_from(*other->index, add_id); + ntotal = index->ntotal; +} + +void IndexPreTransform::check_compatible_for_merge( + const Index& otherIndex) const { + auto other = dynamic_cast(&otherIndex); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(chain.size() == other->chain.size()); + for (int i = 0; i < chain.size(); i++) { + chain[i]->check_identical(*other->chain[i]); + } + index->check_compatible_for_merge(*other->index); +} + namespace { struct PreTransformDistanceComputer : DistanceComputer { diff --git a/thirdparty/faiss/faiss/IndexPreTransform.h b/thirdparty/faiss/faiss/IndexPreTransform.h index 40694e91d..3ad7b28fb 100644 --- a/thirdparty/faiss/faiss/IndexPreTransform.h +++ b/thirdparty/faiss/faiss/IndexPreTransform.h @@ -14,6 +14,12 @@ namespace faiss { +struct SearchParametersPreTransform : SearchParameters { + // nothing to add here. + // as such, encapsulating the search params is considered optional + SearchParameters* index_params = nullptr; +}; + /** Index that applies a LinearTransform transform on vectors before * handing them over to a sub-index */ struct IndexPreTransform : Index { @@ -49,7 +55,7 @@ struct IndexPreTransform : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /* range search, no attempt is done to change the radius */ void range_search( @@ -57,7 +63,7 @@ struct IndexPreTransform : Index { const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void reconstruct(idx_t key, float* recons) const override; @@ -69,7 +75,8 @@ struct IndexPreTransform : Index { idx_t k, float* distances, idx_t* labels, - float* recons) const override; + float* recons, + const SearchParameters* params = nullptr) const override; /// apply the transforms in the chain. The returned float * may be /// equal to x, otherwise it should be deallocated. 
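For context on the IndexPreTransform hunks above: the new SearchParametersPreTransform wrapper and the extract_index_search_params() helper let a caller hand parameters to the wrapped sub-index either directly or through an explicit envelope. Below is a minimal usage sketch added for illustration only; it is not part of the patch. The class names come from Faiss or from this diff, while the function name, dataset sizes and parameter values are made up.

#include <cstddef>
#include <vector>

#include <faiss/IndexFlat.h>
#include <faiss/IndexIVFFlat.h>
#include <faiss/IndexPreTransform.h>
#include <faiss/VectorTransform.h>

// Sketch: forward IVF search parameters through an IndexPreTransform.
void pretransform_params_sketch() {
    const int d = 64, d_out = 32, nlist = 16, nb = 1000, k = 5;

    faiss::IndexFlatL2 quantizer(d_out);
    faiss::IndexIVFFlat ivf(&quantizer, d_out, nlist);
    faiss::PCAMatrix pca(d, d_out);
    faiss::IndexPreTransform index(&pca, &ivf);

    // Arbitrary training/database data.
    std::vector<float> xb(size_t(nb) * d);
    for (size_t i = 0; i < xb.size(); i++) {
        xb[i] = float((i * 2654435761UL) % 1000) / 1000.0f;
    }
    index.train(nb, xb.data());
    index.add(nb, xb.data());

    // Parameters meant for the wrapped IVF index.
    faiss::SearchParametersIVF ivf_params;
    ivf_params.nprobe = 4;

    std::vector<float> distances(k);
    std::vector<faiss::idx_t> labels(k);

    // Option 1: wrap them in the envelope struct added by this patch.
    faiss::SearchParametersPreTransform pt_params;
    pt_params.index_params = &ivf_params;
    index.search(1, xb.data(), k, distances.data(), labels.data(), &pt_params);

    // Option 2: pass the sub-index parameters directly;
    // extract_index_search_params() forwards the pointer unchanged.
    index.search(1, xb.data(), k, distances.data(), labels.data(), &ivf_params);
}

Either call reaches the wrapped IVF index with nprobe = 4; the envelope only matters if pre-transform-level options are added to it later.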
@@ -86,6 +93,9 @@ struct IndexPreTransform : Index { void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; + void merge_from(Index& otherIndex, idx_t add_id = 0) override; + void check_compatible_for_merge(const Index& otherIndex) const override; + ~IndexPreTransform() override; }; diff --git a/thirdparty/faiss/faiss/IndexRefine.cpp b/thirdparty/faiss/faiss/IndexRefine.cpp index 412211460..2d17d33c4 100644 --- a/thirdparty/faiss/faiss/IndexRefine.cpp +++ b/thirdparty/faiss/faiss/IndexRefine.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace faiss { @@ -62,7 +63,7 @@ void IndexRefine::reset() { namespace { -typedef faiss::Index::idx_t idx_t; +using idx_t = faiss::idx_t; template static void reorder_2_heaps( @@ -96,9 +97,14 @@ void IndexRefine::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + + FAISS_THROW_IF_NOT(base_index); + FAISS_THROW_IF_NOT(refine_index); + FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(is_trained); idx_t k_base = idx_t(k * k_factor); idx_t* base_labels = labels; @@ -127,13 +133,33 @@ void IndexRefine::search( for (idx_t i = 0; i < n; i++) { dc->set_query(x + i * d); idx_t ij = i * k_base; - for (idx_t j = 0; j < k_base; j++) { - idx_t idx = base_labels[ij]; - if (idx < 0) - break; - base_distances[ij] = (*dc)(idx); - ij++; - } + + // // baseline + // for (idx_t j = 0; j < k_base; j++) { + // idx_t idx = base_labels[ij]; + // if (idx < 0) + // break; + // base_distances[ij] = (*dc)(idx); + // ij++; + // } + + // the lambda that filters acceptable elements. + auto filter = [&](const size_t j) -> std::optional { + // stop iterating if idx < 0 + if (base_labels[j + i * k_base] < 0) { + return std::nullopt; + } + // go ahead + return true; + }; + + // the lambda that applies a filtered element. 
+ auto apply = [&](const float dis, const idx_t j) { + base_distances[j + i * k_base] = dis; + }; + + distance_compute_by_idx_if( + base_labels + i * k_base, k_base, dc.get(), filter, apply); } } @@ -208,7 +234,7 @@ IndexRefineFlat::IndexRefineFlat(Index* base_index) IndexRefineFlat::IndexRefineFlat(Index* base_index, const float* xb) : IndexRefine(base_index, nullptr) { - is_trained = base_index->is_trained; + is_trained = base_index->is_trained; refine_index = new IndexFlat(base_index->d, base_index->metric_type); own_refine_index = true; refine_index->add(base_index->ntotal, xb); @@ -224,9 +250,13 @@ void IndexRefineFlat::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + FAISS_THROW_IF_NOT(base_index); + FAISS_THROW_IF_NOT(refine_index); + FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(is_trained); idx_t k_base = idx_t(k * k_factor); idx_t* base_labels = labels; diff --git a/thirdparty/faiss/faiss/IndexRefine.h b/thirdparty/faiss/faiss/IndexRefine.h index 218106030..79b671b56 100644 --- a/thirdparty/faiss/faiss/IndexRefine.h +++ b/thirdparty/faiss/faiss/IndexRefine.h @@ -45,7 +45,7 @@ struct IndexRefine : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; // reconstruct is routed to the refine_index void reconstruct(idx_t key, float* recons) const override; @@ -78,7 +78,7 @@ struct IndexRefineFlat : IndexRefine { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexReplicas.cpp b/thirdparty/faiss/faiss/IndexReplicas.cpp index a924b21e6..8295f34a6 100644 --- a/thirdparty/faiss/faiss/IndexReplicas.cpp +++ b/thirdparty/faiss/faiss/IndexReplicas.cpp @@ -12,17 +12,34 @@ namespace faiss { +namespace { + +// IndexBinary needs to update the code_size when d is set... + +void sync_d(Index* index) {} + +void sync_d(IndexBinary* index) { + FAISS_THROW_IF_NOT(index->d % 8 == 0); + index->code_size = index->d / 8; +} + +} // anonymous namespace + template IndexReplicasTemplate::IndexReplicasTemplate(bool threaded) : ThreadedIndex(threaded) {} template IndexReplicasTemplate::IndexReplicasTemplate(idx_t d, bool threaded) - : ThreadedIndex(d, threaded) {} + : ThreadedIndex(d, threaded) { + sync_d(this); +} template IndexReplicasTemplate::IndexReplicasTemplate(int d, bool threaded) - : ThreadedIndex(d, threaded) {} + : ThreadedIndex(d, threaded) { + sync_d(this); +} template void IndexReplicasTemplate::onAfterAddIndex(IndexT* index) { @@ -109,9 +126,10 @@ void IndexReplicasTemplate::search( idx_t k, distance_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT_MSG(this->count() > 0, "no replicas in index"); if (n == 0) { @@ -122,14 +140,13 @@ void IndexReplicasTemplate::search( size_t componentsPerVec = sizeof(component_t) == 1 ? 
(dim + 7) / 8 : dim; // Partition the query by the number of indices we have - faiss::Index::idx_t queriesPerIndex = - (faiss::Index::idx_t)(n + this->count() - 1) / - (faiss::Index::idx_t)this->count(); + faiss::idx_t queriesPerIndex = + (faiss::idx_t)(n + this->count() - 1) / (faiss::idx_t)this->count(); FAISS_ASSERT(n / queriesPerIndex <= this->count()); auto fn = [queriesPerIndex, componentsPerVec, n, x, k, distances, labels]( int i, const IndexT* index) { - faiss::Index::idx_t base = (faiss::Index::idx_t)i * queriesPerIndex; + faiss::idx_t base = (faiss::idx_t)i * queriesPerIndex; if (base < n) { auto numForIndex = std::min(queriesPerIndex, n - base); @@ -168,6 +185,8 @@ void IndexReplicasTemplate::syncWithSubIndexes() { } auto firstIndex = this->at(0); + this->d = firstIndex->d; + sync_d(this); this->metric_type = firstIndex->metric_type; this->is_trained = firstIndex->is_trained; this->ntotal = firstIndex->ntotal; @@ -181,30 +200,8 @@ void IndexReplicasTemplate::syncWithSubIndexes() { } } -// No metric_type for IndexBinary -template <> -void IndexReplicasTemplate::syncWithSubIndexes() { - if (!this->count()) { - this->is_trained = false; - this->ntotal = 0; - - return; - } - - auto firstIndex = this->at(0); - this->is_trained = firstIndex->is_trained; - this->ntotal = firstIndex->ntotal; - - for (int i = 1; i < this->count(); ++i) { - auto index = this->at(i); - FAISS_THROW_IF_NOT(this->d == index->d); - FAISS_THROW_IF_NOT(this->is_trained == index->is_trained); - FAISS_THROW_IF_NOT(this->ntotal == index->ntotal); - } -} - // explicit instantiations -template class IndexReplicasTemplate; -template class IndexReplicasTemplate; +template struct IndexReplicasTemplate; +template struct IndexReplicasTemplate; } // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexReplicas.h b/thirdparty/faiss/faiss/IndexReplicas.h index b090d2266..84b12a7b8 100644 --- a/thirdparty/faiss/faiss/IndexReplicas.h +++ b/thirdparty/faiss/faiss/IndexReplicas.h @@ -20,7 +20,6 @@ namespace faiss { template class IndexReplicasTemplate : public ThreadedIndex { public: - using idx_t = typename IndexT::idx_t; using component_t = typename IndexT::component_t; using distance_t = typename IndexT::distance_t; @@ -66,7 +65,7 @@ class IndexReplicasTemplate : public ThreadedIndex { idx_t k, distance_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; /// reconstructs from the first index void reconstruct(idx_t, component_t* v) const override; diff --git a/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp b/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp new file mode 100644 index 000000000..045bc3061 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexRowwiseMinMax.cpp @@ -0,0 +1,445 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#include +#include + +namespace faiss { + +namespace { + +using idx_t = faiss::idx_t; + +struct StorageMinMaxFP16 { + uint16_t scaler; + uint16_t minv; + + inline void from_floats(const float float_scaler, const float float_minv) { + scaler = encode_fp16(float_scaler); + minv = encode_fp16(float_minv); + } + + inline void to_floats(float& float_scaler, float& float_minv) const { + float_scaler = decode_fp16(scaler); + float_minv = decode_fp16(minv); + } +}; + +struct StorageMinMaxFP32 { + float scaler; + float minv; + + inline void from_floats(const float float_scaler, const float float_minv) { + scaler = float_scaler; + minv = float_minv; + } + + inline void to_floats(float& float_scaler, float& float_minv) const { + float_scaler = scaler; + float_minv = minv; + } +}; + +template +void sa_encode_impl( + const IndexRowwiseMinMaxBase* const index, + const idx_t n_input, + const float* x_input, + uint8_t* bytes_output) { + // process chunks + const size_t chunk_size = rowwise_minmax_sa_encode_bs; + + // useful variables + const Index* const sub_index = index->index; + const int d = index->d; + + // the code size of the subindex + const size_t old_code_size = sub_index->sa_code_size(); + // the code size of the index + const size_t new_code_size = index->sa_code_size(); + + // allocate tmp buffers + std::vector tmp(chunk_size * d); + std::vector minmax(chunk_size); + + // all the elements to process + size_t n_left = n_input; + + const float* __restrict x = x_input; + uint8_t* __restrict bytes = bytes_output; + + while (n_left > 0) { + // current portion to be processed + const idx_t n = std::min(n_left, chunk_size); + + // allocate a temporary buffer and do the rescale + for (idx_t i = 0; i < n; i++) { + // compute min & max values + float minv = std::numeric_limits::max(); + float maxv = std::numeric_limits::lowest(); + + const float* const vec_in = x + i * d; + for (idx_t j = 0; j < d; j++) { + minv = std::min(minv, vec_in[j]); + maxv = std::max(maxv, vec_in[j]); + } + + // save the coefficients + const float scaler = maxv - minv; + minmax[i].from_floats(scaler, minv); + + // and load them back, because the coefficients might + // be modified. 
+ float actual_scaler = 0; + float actual_minv = 0; + minmax[i].to_floats(actual_scaler, actual_minv); + + float* const vec_out = tmp.data() + i * d; + if (actual_scaler == 0) { + for (idx_t j = 0; j < d; j++) { + vec_out[j] = 0; + } + } else { + float inv_actual_scaler = 1.0f / actual_scaler; + for (idx_t j = 0; j < d; j++) { + vec_out[j] = (vec_in[j] - actual_minv) * inv_actual_scaler; + } + } + } + + // do the coding + sub_index->sa_encode(n, tmp.data(), bytes); + + // rearrange + for (idx_t i = n; (i--) > 0;) { + // move a single index + std::memmove( + bytes + i * new_code_size + (new_code_size - old_code_size), + bytes + i * old_code_size, + old_code_size); + + // save min & max values + StorageMinMaxT* fpv = reinterpret_cast( + bytes + i * new_code_size); + *fpv = minmax[i]; + } + + // next chunk + x += n * d; + bytes += n * new_code_size; + + n_left -= n; + } +} + +template +void sa_decode_impl( + const IndexRowwiseMinMaxBase* const index, + const idx_t n_input, + const uint8_t* bytes_input, + float* x_output) { + // process chunks + const size_t chunk_size = rowwise_minmax_sa_decode_bs; + + // useful variables + const Index* const sub_index = index->index; + const int d = index->d; + + // the code size of the subindex + const size_t old_code_size = sub_index->sa_code_size(); + // the code size of the index + const size_t new_code_size = index->sa_code_size(); + + // allocate tmp buffers + std::vector tmp( + (chunk_size < n_input ? chunk_size : n_input) * old_code_size); + std::vector minmax( + (chunk_size < n_input ? chunk_size : n_input)); + + // all the elements to process + size_t n_left = n_input; + + const uint8_t* __restrict bytes = bytes_input; + float* __restrict x = x_output; + + while (n_left > 0) { + // current portion to be processed + const idx_t n = std::min(n_left, chunk_size); + + // rearrange + for (idx_t i = 0; i < n; i++) { + std::memcpy( + tmp.data() + i * old_code_size, + bytes + i * new_code_size + (new_code_size - old_code_size), + old_code_size); + } + + // decode + sub_index->sa_decode(n, tmp.data(), x); + + // scale back + for (idx_t i = 0; i < n; i++) { + const uint8_t* const vec_in = bytes + i * new_code_size; + StorageMinMaxT fpv = + *(reinterpret_cast(vec_in)); + + float scaler = 0; + float minv = 0; + fpv.to_floats(scaler, minv); + + float* const __restrict vec = x + d * i; + + for (idx_t j = 0; j < d; j++) { + vec[j] = vec[j] * scaler + minv; + } + } + + // next chunk + bytes += n * new_code_size; + x += n * d; + + n_left -= n; + } +} + +// +template +void train_inplace_impl( + IndexRowwiseMinMaxBase* const index, + idx_t n, + float* x) { + // useful variables + Index* const sub_index = index->index; + const int d = index->d; + + // save normalizing coefficients + std::vector minmax(n); + + // normalize +#pragma omp for + for (idx_t i = 0; i < n; i++) { + // compute min & max values + float minv = std::numeric_limits::max(); + float maxv = std::numeric_limits::lowest(); + + float* const vec = x + i * d; + for (idx_t j = 0; j < d; j++) { + minv = std::min(minv, vec[j]); + maxv = std::max(maxv, vec[j]); + } + + // save the coefficients + const float scaler = maxv - minv; + minmax[i].from_floats(scaler, minv); + + // and load them back, because the coefficients might + // be modified. 
+ float actual_scaler = 0; + float actual_minv = 0; + minmax[i].to_floats(actual_scaler, actual_minv); + + if (actual_scaler == 0) { + for (idx_t j = 0; j < d; j++) { + vec[j] = 0; + } + } else { + float inv_actual_scaler = 1.0f / actual_scaler; + for (idx_t j = 0; j < d; j++) { + vec[j] = (vec[j] - actual_minv) * inv_actual_scaler; + } + } + } + + // train the subindex + sub_index->train(n, x); + + // rescale data back + for (idx_t i = 0; i < n; i++) { + float scaler = 0; + float minv = 0; + minmax[i].to_floats(scaler, minv); + + float* const vec = x + i * d; + + for (idx_t j = 0; j < d; j++) { + vec[j] = vec[j] * scaler + minv; + } + } +} + +// +template +void train_impl(IndexRowwiseMinMaxBase* const index, idx_t n, const float* x) { + // the default training that creates a copy of the input data + + // useful variables + Index* const sub_index = index->index; + const int d = index->d; + + // temp buffer + std::vector tmp(n * d); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + // compute min & max values + float minv = std::numeric_limits::max(); + float maxv = std::numeric_limits::lowest(); + + const float* const __restrict vec_in = x + i * d; + for (idx_t j = 0; j < d; j++) { + minv = std::min(minv, vec_in[j]); + maxv = std::max(maxv, vec_in[j]); + } + + const float scaler = maxv - minv; + + // save the coefficients + StorageMinMaxT storage; + storage.from_floats(scaler, minv); + + // and load them back, because the coefficients might + // be modified. + float actual_scaler = 0; + float actual_minv = 0; + storage.to_floats(actual_scaler, actual_minv); + + float* const __restrict vec_out = tmp.data() + i * d; + if (actual_scaler == 0) { + for (idx_t j = 0; j < d; j++) { + vec_out[j] = 0; + } + } else { + float inv_actual_scaler = 1.0f / actual_scaler; + for (idx_t j = 0; j < d; j++) { + vec_out[j] = (vec_in[j] - actual_minv) * inv_actual_scaler; + } + } + } + + sub_index->train(n, tmp.data()); +} + +} // namespace + +// block size for performing sa_encode and sa_decode +int rowwise_minmax_sa_encode_bs = 16384; +int rowwise_minmax_sa_decode_bs = 16384; + +/********************************************************* + * IndexRowwiseMinMaxBase implementation + ********************************************************/ + +IndexRowwiseMinMaxBase::IndexRowwiseMinMaxBase(Index* index) + : Index(index->d, index->metric_type), + index{index}, + own_fields{false} {} + +IndexRowwiseMinMaxBase::IndexRowwiseMinMaxBase() + : index{nullptr}, own_fields{false} {} + +IndexRowwiseMinMaxBase::~IndexRowwiseMinMaxBase() { + if (own_fields) { + delete index; + index = nullptr; + } +} + +void IndexRowwiseMinMaxBase::add(idx_t, const float*) { + FAISS_THROW_MSG("add not implemented for this type of index"); +} + +void IndexRowwiseMinMaxBase::search( + idx_t, + const float*, + idx_t, + float*, + idx_t*, + const SearchParameters*) const { + FAISS_THROW_MSG("search not implemented for this type of index"); +} + +void IndexRowwiseMinMaxBase::reset() { + FAISS_THROW_MSG("reset not implemented for this type of index"); +} + +/********************************************************* + * IndexRowwiseMinMaxFP16 implementation + ********************************************************/ + +IndexRowwiseMinMaxFP16::IndexRowwiseMinMaxFP16(Index* index) + : IndexRowwiseMinMaxBase(index) {} + +IndexRowwiseMinMaxFP16::IndexRowwiseMinMaxFP16() : IndexRowwiseMinMaxBase() {} + +size_t IndexRowwiseMinMaxFP16::sa_code_size() const { + return index->sa_code_size() + 2 * sizeof(uint16_t); +} + +void 
IndexRowwiseMinMaxFP16::sa_encode( + idx_t n_input, + const float* x_input, + uint8_t* bytes_output) const { + sa_encode_impl(this, n_input, x_input, bytes_output); +} + +void IndexRowwiseMinMaxFP16::sa_decode( + idx_t n_input, + const uint8_t* bytes_input, + float* x_output) const { + sa_decode_impl(this, n_input, bytes_input, x_output); +} + +void IndexRowwiseMinMaxFP16::train(idx_t n, const float* x) { + train_impl(this, n, x); +} + +void IndexRowwiseMinMaxFP16::train_inplace(idx_t n, float* x) { + train_inplace_impl(this, n, x); +} + +/********************************************************* + * IndexRowwiseMinMax implementation + ********************************************************/ + +IndexRowwiseMinMax::IndexRowwiseMinMax(Index* index) + : IndexRowwiseMinMaxBase(index) {} + +IndexRowwiseMinMax::IndexRowwiseMinMax() : IndexRowwiseMinMaxBase() {} + +size_t IndexRowwiseMinMax::sa_code_size() const { + return index->sa_code_size() + 2 * sizeof(float); +} + +void IndexRowwiseMinMax::sa_encode( + idx_t n_input, + const float* x_input, + uint8_t* bytes_output) const { + sa_encode_impl(this, n_input, x_input, bytes_output); +} + +void IndexRowwiseMinMax::sa_decode( + idx_t n_input, + const uint8_t* bytes_input, + float* x_output) const { + sa_decode_impl(this, n_input, bytes_input, x_output); +} + +void IndexRowwiseMinMax::train(idx_t n, const float* x) { + train_impl(this, n, x); +} + +void IndexRowwiseMinMax::train_inplace(idx_t n, float* x) { + train_inplace_impl(this, n, x); +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexRowwiseMinMax.h b/thirdparty/faiss/faiss/IndexRowwiseMinMax.h new file mode 100644 index 000000000..5e16da4b4 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexRowwiseMinMax.h @@ -0,0 +1,99 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace faiss { + +/// Index wrapper that performs rowwise normalization to [0,1], preserving +/// the coefficients. This is a vector codec index only. +/// +/// Basically, this index performs a rowwise scaling to [0,1] of every row +/// in an input dataset before calling subindex::train() and +/// subindex::sa_encode(). sa_encode() call stores the scaling coefficients +/// (scaler and minv) in the very beginning of every output code. The format: +/// [scaler][minv][subindex::sa_encode() output] +/// The de-scaling in sa_decode() is done using: +/// output_rescaled = scaler * output + minv +/// +/// An additional ::train_inplace() function is provided in order to do +/// an inplace scaling before calling subindex::train() and, thus, avoiding +/// the cloning of the input dataset, but modifying the input dataset because +/// of the scaling and the scaling back. It is up to user to call +/// this function instead of ::train() +/// +/// Derived classes provide different data types for scaling coefficients. +/// Currently, versions with fp16 and fp32 scaling coefficients are available. +/// * fp16 version adds 4 extra bytes per encoded vector +/// * fp32 version adds 8 extra bytes per encoded vector + +/// Provides base functions for rowwise normalizing indices. +struct IndexRowwiseMinMaxBase : Index { + /// sub-index + Index* index; + + /// whether the subindex needs to be freed in the destructor. 
+ bool own_fields; + + explicit IndexRowwiseMinMaxBase(Index* index); + + IndexRowwiseMinMaxBase(); + ~IndexRowwiseMinMaxBase() override; + + void add(idx_t n, const float* x) override; + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; + + void reset() override; + + virtual void train_inplace(idx_t n, float* x) = 0; +}; + +/// Stores scaling coefficients as fp16 values. +struct IndexRowwiseMinMaxFP16 : IndexRowwiseMinMaxBase { + explicit IndexRowwiseMinMaxFP16(Index* index); + + IndexRowwiseMinMaxFP16(); + + void train(idx_t n, const float* x) override; + void train_inplace(idx_t n, float* x) override; + + size_t sa_code_size() const override; + void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; +}; + +/// Stores scaling coefficients as fp32 values. +struct IndexRowwiseMinMax : IndexRowwiseMinMaxBase { + explicit IndexRowwiseMinMax(Index* index); + + IndexRowwiseMinMax(); + + void train(idx_t n, const float* x) override; + void train_inplace(idx_t n, float* x) override; + + size_t sa_code_size() const override; + void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; + void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; +}; + +/// block size for performing sa_encode and sa_decode +FAISS_API extern int rowwise_minmax_sa_encode_bs; +FAISS_API extern int rowwise_minmax_sa_decode_bs; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexScaNN.cpp b/thirdparty/faiss/faiss/IndexScaNN.cpp index f2ec79b5d..1d48cb42c 100644 --- a/thirdparty/faiss/faiss/IndexScaNN.cpp +++ b/thirdparty/faiss/faiss/IndexScaNN.cpp @@ -60,8 +60,6 @@ void IndexScaNN::reset() { namespace { -typedef faiss::Index::idx_t idx_t; - template static void reorder_2_heaps( idx_t n, @@ -107,37 +105,43 @@ int64_t IndexScaNN::size() { return (capacity + centroid_table + precomputed_table + raw_data); } -void IndexScaNN::search_thread_safe( +void IndexScaNN::search( idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - const size_t nprobe, - const size_t reorder_k, - const BitsetView bitset) const { + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(k > 0); - FAISS_THROW_IF_NOT(is_trained); + const IndexScaNNSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexScaNN params have incorrect type"); + } + + idx_t k_base = (params != nullptr) ? params->reorder_k : idx_t(k * k_factor); + SearchParameters* base_index_params = + (params != nullptr) ? 
params->base_index_params : nullptr; + + FAISS_THROW_IF_NOT(k_base >= k); + auto base = dynamic_cast(base_index); FAISS_THROW_IF_NOT(base); // nothing to refine, directly return result if (refine_index == nullptr) { - base->search_thread_safe( + base->search( n, x, k, distances, labels, - nprobe, - bitset); + base_index_params); return; } - idx_t k_base = idx_t(reorder_k); - FAISS_THROW_IF_NOT(k_base >= k); idx_t* base_labels = labels; float* base_distances = distances; ScopeDeleter del1; @@ -150,14 +154,13 @@ void IndexScaNN::search_thread_safe( del2.set(base_distances); } - base->search_thread_safe( + base->search( n, x, k_base, base_distances, base_labels, - nprobe, - bitset); + base_index_params); for (idx_t i = 0; i < n * k_base; i++) assert(base_labels[i] >= -1 && base_labels[i] < ntotal); @@ -167,7 +170,7 @@ void IndexScaNN::search_thread_safe( rf->compute_distance_subset(n, x, k_base, base_distances, base_labels); - if (base->is_cosine_) { + if (base->is_cosine) { for (idx_t i = 0; i < n * k_base; i++) { if (base_labels[i] >= 0) { base_distances[i] /= base->norms[base_labels[i]]; @@ -189,19 +192,34 @@ void IndexScaNN::search_thread_safe( } } -void IndexScaNN::range_search_thread_safe( +void IndexScaNN::range_search( idx_t n, const float* x, float radius, RangeSearchResult* result, - const BitsetView bitset) const { + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(n == 1); // currently knowhere will split nq to 1 FAISS_THROW_IF_NOT(is_trained); + + const IndexScaNNSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexScaNN params have incorrect type"); + } + + SearchParameters* base_index_params = + (params != nullptr) ? params->base_index_params : nullptr; + + auto base = dynamic_cast(base_index); FAISS_THROW_IF_NOT(base); - base->range_search_thread_safe(n, x, radius, result, base->nlist, bitset); + IVFSearchParameters ivf_search_params; + ivf_search_params.nprobe = base->nlist; + // todo aguzhva: this is somewhat hacky + ivf_search_params.sel = base_index_params->sel; + base->range_search(n, x, radius, result, &ivf_search_params); // nothing to refine, directly return the result if (refine_index == nullptr) { @@ -216,7 +234,7 @@ void IndexScaNN::range_search_thread_safe( idx_t current = 0; for (idx_t i = 0; i < result->lims[1]; ++i) { - if (base->is_cosine_) { + if (base->is_cosine) { result->distances[i] /= base->norms[result->labels[i]]; } if (metric_type == METRIC_L2) { diff --git a/thirdparty/faiss/faiss/IndexScaNN.h b/thirdparty/faiss/faiss/IndexScaNN.h index 3727cba61..d1a2f6a13 100644 --- a/thirdparty/faiss/faiss/IndexScaNN.h +++ b/thirdparty/faiss/faiss/IndexScaNN.h @@ -5,6 +5,13 @@ namespace faiss { +struct IndexScaNNSearchParameters : SearchParameters { + size_t reorder_k = 1; + SearchParameters* base_index_params = nullptr; // non-owning + + virtual ~IndexScaNNSearchParameters() = default; +}; + struct IndexScaNN : IndexRefine { explicit IndexScaNN(Index* base_index); IndexScaNN(Index* base_index, const float* xb); @@ -23,22 +30,20 @@ struct IndexScaNN : IndexRefine { int64_t size(); - void search_thread_safe( + void search( idx_t n, const float* x, idx_t k, float* distances, idx_t* labels, - const size_t nprobe, - const size_t reorder_k, - const BitsetView bitset = nullptr) const; + const SearchParameters* params = nullptr) const override; - void range_search_thread_safe( + void range_search( idx_t n, const float* x, float radius, RangeSearchResult* result, - const BitsetView 
bitset = nullptr) const; + const SearchParameters* params = nullptr) const override; }; } // namespace faiss \ No newline at end of file diff --git a/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp b/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp index 47640bfe8..e30158c07 100644 --- a/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp +++ b/thirdparty/faiss/faiss/IndexScalarQuantizer.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -27,17 +28,16 @@ namespace faiss { IndexScalarQuantizer::IndexScalarQuantizer( int d, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, MetricType metric) : IndexFlatCodes(0, d, metric), sq(d, qtype) { - is_trained = - qtype == QuantizerType::QT_fp16 || - qtype == QuantizerType::QT_8bit_direct; + is_trained = qtype == ScalarQuantizer::QT_fp16 || + qtype == ScalarQuantizer::QT_8bit_direct; code_size = sq.code_size; } IndexScalarQuantizer::IndexScalarQuantizer() - : IndexScalarQuantizer(0, QuantizerType::QT_8bit) {} + : IndexScalarQuantizer(0, ScalarQuantizer::QT_8bit) {} void IndexScalarQuantizer::train(idx_t n, const float* x) { sq.train(n, x); @@ -50,9 +50,10 @@ void IndexScalarQuantizer::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { - FAISS_THROW_IF_NOT(k > 0); + const SearchParameters* params) const { + const IDSelector* sel = params ? params->sel : nullptr; + FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(is_trained); FAISS_THROW_IF_NOT( metric_type == METRIC_L2 || metric_type == METRIC_INNER_PRODUCT); @@ -60,7 +61,8 @@ void IndexScalarQuantizer::search( #pragma omp parallel { InvertedListScanner* scanner = - sq.select_InvertedListScanner(metric_type, nullptr, true); + sq.select_InvertedListScanner(metric_type, nullptr, true, sel); + ScopeDeleter1 del(scanner); scanner->list_no = 0; // directly the list number @@ -75,8 +77,7 @@ void IndexScalarQuantizer::search( minheap_heapify(k, D, I); } scanner->set_query(x + i * d); - scanner->scan_codes( - ntotal, codes.data(), nullptr, nullptr, D, I, k); + scanner->scan_codes(ntotal, codes.data(), nullptr, nullptr, D, I, k); // re-order heap if (metric_type == METRIC_L2) { @@ -88,8 +89,10 @@ void IndexScalarQuantizer::search( } } -DistanceComputer* IndexScalarQuantizer::get_distance_computer() const { - SQDistanceComputer* dc = sq.get_distance_computer(metric_type); +FlatCodesDistanceComputer* IndexScalarQuantizer::get_FlatCodesDistanceComputer() + const { + ScalarQuantizer::SQDistanceComputer* dc = + sq.get_distance_computer(metric_type); dc->code_size = sq.code_size; dc->codes = codes.data(); return dc; @@ -109,6 +112,10 @@ void IndexScalarQuantizer::sa_decode(idx_t n, const uint8_t* bytes, float* x) sq.decode(bytes, x, n); } +size_t IndexScalarQuantizer::cal_size() const { + return codes.size() * sizeof(uint8_t) + sizeof(size_t) + sq.cal_size(); +} + /******************************************************************* * IndexIVFScalarQuantizer implementation ********************************************************************/ @@ -117,23 +124,30 @@ IndexIVFScalarQuantizer::IndexIVFScalarQuantizer( Index* quantizer, size_t d, size_t nlist, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, MetricType metric, - bool encode_residual) - : IndexIVF(quantizer, d, nlist, 0, metric), - sq(d, qtype), - by_residual(encode_residual) { + bool by_residual) + : IndexIVF(quantizer, d, nlist, 0, metric), sq(d, qtype) { code_size = sq.code_size; + this->by_residual = by_residual; // was not known at construction time invlists->code_size = code_size; 
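For reference, a minimal sketch of how a caller can use the IDSelector-based filtering that replaces the former BitsetView argument of IndexScalarQuantizer::search. This is illustrative only: the helper name, dimensions and id range are made up; SearchParameters, IDSelectorRange and ScalarQuantizer::QT_8bit are the standard Faiss 1.7.4 types used in the hunks above.

    #include <faiss/IndexScalarQuantizer.h>
    #include <faiss/impl/IDSelector.h>
    #include <vector>

    // search an SQ8 flat index, keeping only results whose ids fall in [100, 200)
    void filtered_search_sketch(const float* xb, const float* xq) {
        const int d = 64;
        const faiss::idx_t nb = 1000, nq = 1, k = 10;

        faiss::IndexScalarQuantizer index(d, faiss::ScalarQuantizer::QT_8bit);
        index.train(nb, xb);
        index.add(nb, xb);

        faiss::IDSelectorRange sel(100, 200);   // admissible ids
        faiss::SearchParameters params;
        params.sel = &sel;                      // filtering is passed via params->sel

        std::vector<float> distances(nq * k);
        std::vector<faiss::idx_t> labels(nq * k);
        index.search(nq, xq, k, distances.data(), labels.data(), &params);
    }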
is_trained = false; } -IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() - : IndexIVF(), by_residual(true) {} +IndexIVFScalarQuantizer::IndexIVFScalarQuantizer() : IndexIVF() { + by_residual = true; +} + +void IndexIVFScalarQuantizer::train_encoder( + idx_t n, + const float* x, + const idx_t* assign) { + sq.train(n, x); +} -void IndexIVFScalarQuantizer::train_residual(idx_t n, const float* x) { - sq.train_residual(n, x, quantizer, by_residual, verbose); +idx_t IndexIVFScalarQuantizer::train_encoder_num_vectors() const { + return 100000; } void IndexIVFScalarQuantizer::encode_vectors( @@ -142,7 +156,7 @@ void IndexIVFScalarQuantizer::encode_vectors( const idx_t* list_nos, uint8_t* codes, bool include_listnos) const { - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr squant(sq.select_quantizer()); size_t coarse_size = include_listnos ? coarse_code_size() : 0; memset(codes, 0, (code_size + coarse_size) * n); @@ -171,7 +185,7 @@ void IndexIVFScalarQuantizer::encode_vectors( void IndexIVFScalarQuantizer::sa_decode(idx_t n, const uint8_t* codes, float* x) const { - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr squant(sq.select_quantizer()); size_t coarse_size = coarse_code_size(); #pragma omp parallel if (n > 1000) @@ -203,7 +217,7 @@ void IndexIVFScalarQuantizer::add_core( FAISS_THROW_IF_NOT(is_trained); size_t nadd = 0; - std::unique_ptr squant(sq.select_quantizer()); + std::unique_ptr squant(sq.select_quantizer()); DirectMapAdd dm_add(direct_map, n, xids); @@ -229,8 +243,7 @@ void IndexIVFScalarQuantizer::add_core( memset(one_code.data(), 0, code_size); squant->encode_vector(xi, one_code.data()); - size_t ofs = invlists->add_entry( - list_no, id, one_code.data()); + size_t ofs = invlists->add_entry(list_no, id, one_code.data()); dm_add.add(i, list_no, ofs); nadd++; @@ -245,22 +258,28 @@ void IndexIVFScalarQuantizer::add_core( } InvertedListScanner* IndexIVFScalarQuantizer::get_InvertedListScanner( - bool store_pairs) const { + bool store_pairs, + const IDSelector* sel) const { return sq.select_InvertedListScanner( - metric_type, quantizer, store_pairs, by_residual); + metric_type, quantizer, store_pairs, sel, by_residual); } void IndexIVFScalarQuantizer::reconstruct_from_offset( int64_t list_no, int64_t offset, float* recons) const { - std::vector centroid(d); - quantizer->reconstruct(list_no, centroid.data()); - const uint8_t* code = invlists->get_single_code(list_no, offset); - sq.decode(code, recons, 1); - for (int i = 0; i < d; ++i) { - recons[i] += centroid[i]; + + if (by_residual) { + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + + sq.decode(code, recons, 1); + for (int i = 0; i < d; ++i) { + recons[i] += centroid[i]; + } + } else { + sq.decode(code, recons, 1); } } diff --git a/thirdparty/faiss/faiss/IndexScalarQuantizer.h b/thirdparty/faiss/faiss/IndexScalarQuantizer.h index 4b9be8aa7..fe6598046 100644 --- a/thirdparty/faiss/faiss/IndexScalarQuantizer.h +++ b/thirdparty/faiss/faiss/IndexScalarQuantizer.h @@ -21,11 +21,8 @@ namespace faiss { /** - * The uniform quantizer has a range [vmin, vmax]. The range can be - * the same for all dimensions (uniform) or specific per dimension - * (default). + * Flat index built on a scalar quantizer. 
*/ - struct IndexScalarQuantizer : IndexFlatCodes { /// Used to encode the vectors ScalarQuantizer sq; @@ -38,7 +35,7 @@ struct IndexScalarQuantizer : IndexFlatCodes { */ IndexScalarQuantizer( int d, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2); IndexScalarQuantizer(); @@ -51,18 +48,16 @@ struct IndexScalarQuantizer : IndexFlatCodes { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; - DistanceComputer* get_distance_computer() const override; + FlatCodesDistanceComputer* get_FlatCodesDistanceComputer() const override; /* standalone codec interface */ void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; void sa_decode(idx_t n, const uint8_t* bytes, float* x) const override; - size_t cal_size() { - return codes.size() * sizeof(uint8_t) + sizeof(size_t) + sq.cal_size(); - } + size_t cal_size() const; }; /** An IVF implementation where the components of the residuals are @@ -73,19 +68,20 @@ struct IndexScalarQuantizer : IndexFlatCodes { struct IndexIVFScalarQuantizer : IndexIVF { ScalarQuantizer sq; - bool by_residual; IndexIVFScalarQuantizer( Index* quantizer, size_t d, size_t nlist, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, MetricType metric = METRIC_L2, - bool encode_residual = true); + bool by_residual = true); IndexIVFScalarQuantizer(); - void train_residual(idx_t n, const float* x) override; + void train_encoder(idx_t n, const float* x, const idx_t* assign) override; + + idx_t train_encoder_num_vectors() const override; void encode_vectors( idx_t n, @@ -102,7 +98,8 @@ struct IndexIVFScalarQuantizer : IndexIVF { const idx_t* precomputed_idx) override; InvertedListScanner* get_InvertedListScanner( - bool store_pairs) const override; + bool store_pairs, + const IDSelector* sel) const override; void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) const override; diff --git a/thirdparty/faiss/faiss/IndexShards.cpp b/thirdparty/faiss/faiss/IndexShards.cpp index a8c746d5c..716e69a59 100644 --- a/thirdparty/faiss/faiss/IndexShards.cpp +++ b/thirdparty/faiss/faiss/IndexShards.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -22,98 +20,26 @@ namespace faiss { // subroutines namespace { -typedef Index::idx_t idx_t; +// IndexBinary needs to update the code_size when d is set... + +void sync_d(Index* index) {} + +void sync_d(IndexBinary* index) { + FAISS_THROW_IF_NOT(index->d % 8 == 0); + index->code_size = index->d / 8; +} // add translation to all valid labels -void translate_labels(long n, idx_t* labels, long translation) { +void translate_labels(int64_t n, idx_t* labels, int64_t translation) { if (translation == 0) return; - for (long i = 0; i < n; i++) { + for (int64_t i = 0; i < n; i++) { if (labels[i] < 0) continue; labels[i] += translation; } } -/** merge result tables from several shards. 
- * @param all_distances size nshard * n * k - * @param all_labels idem - * @param translartions label translations to apply, size nshard - */ - -template -void merge_tables( - long n, - long k, - long nshard, - typename IndexClass::distance_t* distances, - idx_t* labels, - const std::vector& all_distances, - const std::vector& all_labels, - const std::vector& translations) { - if (k == 0) { - return; - } - using distance_t = typename IndexClass::distance_t; - - long stride = n * k; -#pragma omp parallel - { - std::vector buf(2 * nshard); - int* pointer = buf.data(); - int* shard_ids = pointer + nshard; - std::vector buf2(nshard); - distance_t* heap_vals = buf2.data(); -#pragma omp for - for (long i = 0; i < n; i++) { - // the heap maps values to the shard where they are - // produced. - const distance_t* D_in = all_distances.data() + i * k; - const idx_t* I_in = all_labels.data() + i * k; - int heap_size = 0; - - for (long s = 0; s < nshard; s++) { - pointer[s] = 0; - if (I_in[stride * s] >= 0) { - heap_push( - ++heap_size, - heap_vals, - shard_ids, - D_in[stride * s], - s); - } - } - - distance_t* D = distances + i * k; - idx_t* I = labels + i * k; - - for (int j = 0; j < k; j++) { - if (heap_size == 0) { - I[j] = -1; - D[j] = C::neutral(); - } else { - // pop best element - int s = shard_ids[0]; - int& p = pointer[s]; - D[j] = heap_vals[0]; - I[j] = I_in[stride * s + p] + translations[s]; - - heap_pop(heap_size--, heap_vals, shard_ids); - p++; - if (p < k && I_in[stride * s + p] >= 0) { - heap_push( - ++heap_size, - heap_vals, - shard_ids, - D_in[stride * s + p], - s); - } - } - } - } - } -} - } // anonymous namespace template @@ -121,20 +47,26 @@ IndexShardsTemplate::IndexShardsTemplate( idx_t d, bool threaded, bool successive_ids) - : ThreadedIndex(d, threaded), successive_ids(successive_ids) {} + : ThreadedIndex(d, threaded), successive_ids(successive_ids) { + sync_d(this); +} template IndexShardsTemplate::IndexShardsTemplate( int d, bool threaded, bool successive_ids) - : ThreadedIndex(d, threaded), successive_ids(successive_ids) {} + : ThreadedIndex(d, threaded), successive_ids(successive_ids) { + sync_d(this); +} template IndexShardsTemplate::IndexShardsTemplate( bool threaded, bool successive_ids) - : ThreadedIndex(threaded), successive_ids(successive_ids) {} + : ThreadedIndex(threaded), successive_ids(successive_ids) { + sync_d(this); +} template void IndexShardsTemplate::onAfterAddIndex(IndexT* index /* unused */) { @@ -159,6 +91,8 @@ void IndexShardsTemplate::syncWithSubIndexes() { } auto firstIndex = this->at(0); + this->d = firstIndex->d; + sync_d(this); this->metric_type = firstIndex->metric_type; this->is_trained = firstIndex->is_trained; this->ntotal = firstIndex->ntotal; @@ -173,29 +107,6 @@ void IndexShardsTemplate::syncWithSubIndexes() { } } -// No metric_type for IndexBinary -template <> -void IndexShardsTemplate::syncWithSubIndexes() { - if (!this->count()) { - this->is_trained = false; - this->ntotal = 0; - - return; - } - - auto firstIndex = this->at(0); - this->is_trained = firstIndex->is_trained; - this->ntotal = firstIndex->ntotal; - - for (int i = 1; i < this->count(); ++i) { - auto index = this->at(i); - FAISS_THROW_IF_NOT(this->d == index->d); - FAISS_THROW_IF_NOT(this->is_trained == index->is_trained); - - this->ntotal += index->ntotal; - } -} - template void IndexShardsTemplate::train(idx_t n, const component_t* x) { auto fn = [n, x](int no, IndexT* index) { @@ -236,7 +147,7 @@ void IndexShardsTemplate::add_with_ids( "request them to be shifted"); 
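// (Illustrative aside: with successive_ids == true the ids are implicit and
//  follow shard order. For example, assuming two shards that end up holding
//  100 and 250 vectors, shard 0 owns ids 0..99 and shard 1 owns ids 100..349;
//  at search time translate_labels() adds per-shard offsets computed from the
//  cumulative ntotal values. This is why explicit xids cannot be combined
//  with successive_ids, as the check above enforces.)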
FAISS_THROW_IF_NOT_MSG( this->ntotal == 0, - "when adding to IndexShards with sucessive_ids, " + "when adding to IndexShards with successive_ids, " "only add() in a single pass is supported"); } @@ -247,11 +158,9 @@ void IndexShardsTemplate::add_with_ids( if (!ids && !successive_ids) { aids.resize(n); - for (idx_t i = 0; i < n; i++) { aids[i] = this->ntotal + i; } - ids = aids.data(); } @@ -289,15 +198,28 @@ void IndexShardsTemplate::search( idx_t k, distance_t* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT(k > 0); - long nshard = this->count(); + int64_t nshard = this->count(); std::vector all_distances(nshard * k * n); std::vector all_labels(nshard * k * n); + std::vector translations(nshard, 0); + + // Because we just called runOnIndex above, it is safe to access the + // sub-index ntotal here + if (successive_ids) { + translations[0] = 0; + + for (int s = 0; s + 1 < nshard; s++) { + translations[s + 1] = translations[s] + this->at(s)->ntotal; + } + } - auto fn = [n, k, x, &all_distances, &all_labels]( + auto fn = [n, k, x, &all_distances, &all_labels, &translations]( int no, const IndexT* index) { if (index->verbose) { printf("begin query shard %d on %" PRId64 " points\n", no, n); @@ -310,6 +232,9 @@ void IndexShardsTemplate::search( all_distances.data() + no * k * n, all_labels.data() + no * k * n); + translate_labels( + n * k, all_labels.data() + no * k * n, translations[no]); + if (index->verbose) { printf("end query shard %d\n", no); } @@ -317,38 +242,24 @@ void IndexShardsTemplate::search( this->runOnIndex(fn); - std::vector translations(nshard, 0); - - // Because we just called runOnIndex above, it is safe to access the - // sub-index ntotal here - if (successive_ids) { - translations[0] = 0; - - for (int s = 0; s + 1 < nshard; s++) { - translations[s + 1] = translations[s] + this->at(s)->ntotal; - } - } - if (this->metric_type == METRIC_L2) { - merge_tables>( + merge_knn_results>( n, k, nshard, + all_distances.data(), + all_labels.data(), distances, - labels, - all_distances, - all_labels, - translations); + labels); } else { - merge_tables>( + merge_knn_results>( n, k, nshard, + all_distances.data(), + all_labels.data(), distances, - labels, - all_distances, - all_labels, - translations); + labels); } } diff --git a/thirdparty/faiss/faiss/IndexShards.h b/thirdparty/faiss/faiss/IndexShards.h index cf883ce07..278d5724a 100644 --- a/thirdparty/faiss/faiss/IndexShards.h +++ b/thirdparty/faiss/faiss/IndexShards.h @@ -18,7 +18,6 @@ namespace faiss { */ template struct IndexShardsTemplate : public ThreadedIndex { - using idx_t = typename IndexT::idx_t; using component_t = typename IndexT::component_t; using distance_t = typename IndexT::distance_t; @@ -72,7 +71,7 @@ struct IndexShardsTemplate : public ThreadedIndex { * Cases (successive_ids, xids): * - true, non-NULL ERROR: it makes no sense to pass in ids and * request them to be shifted - * - true, NULL OK, but should be called only once (calls add() + * - true, NULL OK: but should be called only once (calls add() * on sub-indexes). 
* - false, non-NULL OK: will call add_with_ids with passed in xids * distributed evenly over shards @@ -88,7 +87,7 @@ struct IndexShardsTemplate : public ThreadedIndex { idx_t k, distance_t* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; void train(idx_t n, const component_t* x) override; @@ -96,7 +95,7 @@ struct IndexShardsTemplate : public ThreadedIndex { /// Synchronize the top-level index (IndexShards) with data in the /// sub-indices - void syncWithSubIndexes(); + virtual void syncWithSubIndexes(); protected: /// Called just after an index is added diff --git a/thirdparty/faiss/faiss/IndexShardsIVF.cpp b/thirdparty/faiss/faiss/IndexShardsIVF.cpp new file mode 100644 index 000000000..4381dbb9d --- /dev/null +++ b/thirdparty/faiss/faiss/IndexShardsIVF.cpp @@ -0,0 +1,246 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace faiss { + +// subroutines +namespace { + +// add translation to all valid labels +void translate_labels(int64_t n, idx_t* labels, int64_t translation) { + if (translation == 0) { + return; + } + for (int64_t i = 0; i < n; i++) { + if (labels[i] < 0) { + continue; + } + labels[i] += translation; + } +} + +} // anonymous namespace + +/************************************************************ + * IndexShardsIVF + ************************************************************/ + +IndexShardsIVF::IndexShardsIVF( + Index* quantizer, + size_t nlist, + bool threaded, + bool successive_ids) + : IndexShardsTemplate(quantizer->d, threaded, successive_ids), + Level1Quantizer(quantizer, nlist) { + is_trained = quantizer->is_trained && quantizer->ntotal == nlist; +} + +void IndexShardsIVF::addIndex(Index* index) { + auto index_ivf = dynamic_cast(index); + FAISS_THROW_IF_NOT_MSG(index_ivf, "can only add IndexIVFs"); + FAISS_THROW_IF_NOT(index_ivf->nlist == nlist); + IndexShardsTemplate::addIndex(index); +} + +void IndexShardsIVF::train(idx_t n, const component_t* x) { + if (verbose) { + printf("Training level-1 quantizer\n"); + } + train_q1(n, x, verbose, metric_type); + + // set the sub-quantizer codebooks + std::vector centroids(nlist * d); + quantizer->reconstruct_n(0, nlist, centroids.data()); + + // probably not worth running in parallel + for (size_t i = 0; i < indices_.size(); i++) { + Index* index = indices_[i].first; + auto index_ivf = dynamic_cast(index); + Index* quantizer = index_ivf->quantizer; + if (!quantizer->is_trained) { + quantizer->train(nlist, centroids.data()); + } + quantizer->add(nlist, centroids.data()); + // finish training + index->train(n, x); + } + + is_trained = true; +} + +void IndexShardsIVF::add_with_ids( + idx_t n, + const component_t* x, + const idx_t* xids) { + // IndexIVF exposes add_core that we can use to factorize the + bool all_index_ivf = true; + for (size_t i = 0; i < indices_.size(); i++) { + Index* index = indices_[i].first; + all_index_ivf = all_index_ivf && dynamic_cast(index); + } + if (!all_index_ivf) { + IndexShardsTemplate::add_with_ids(n, x, xids); + return; + } + FAISS_THROW_IF_NOT_MSG( + !(successive_ids && xids), + "It makes no sense to pass in ids and " + "request them to be shifted"); + + if (successive_ids) { + FAISS_THROW_IF_NOT_MSG( + !xids, + "It makes no sense to pass in ids 
and " + "request them to be shifted"); + FAISS_THROW_IF_NOT_MSG( + this->ntotal == 0, + "when adding to IndexShards with successive_ids, " + "only add() in a single pass is supported"); + } + + // perform coarse quantization + std::vector Iq(n); + std::vector Dq(n); + quantizer->search(n, x, 1, Dq.data(), Iq.data()); + + // possibly shift ids + idx_t nshard = this->count(); + const idx_t* ids = xids; + std::vector aids; + if (!ids && !successive_ids) { + aids.resize(n); + + for (idx_t i = 0; i < n; i++) { + aids[i] = this->ntotal + i; + } + ids = aids.data(); + } + idx_t d = this->d; + + auto fn = [n, ids, x, nshard, d, Iq](int no, Index* index) { + idx_t i0 = (idx_t)no * n / nshard; + idx_t i1 = ((idx_t)no + 1) * n / nshard; + const float* x0 = x + i0 * d; + auto index_ivf = dynamic_cast(index); + + if (index->verbose) { + printf("begin add shard %d on %" PRId64 " points\n", no, n); + } + + index_ivf->add_core( + i1 - i0, x + i0 * d, nullptr, ids ? ids + i0 : nullptr, Iq.data() + i0); + + if (index->verbose) { + printf("end add shard %d on %" PRId64 " points\n", no, i1 - i0); + } + }; + + this->runOnIndex(fn); + syncWithSubIndexes(); +} + +void IndexShardsIVF::search( + idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels, + const SearchParameters* params_in) const { + FAISS_THROW_IF_NOT(k > 0); + FAISS_THROW_IF_NOT(count() > 0); + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); + } + + auto index0 = dynamic_cast(at(0)); + idx_t nprobe = params ? params->nprobe : index0->nprobe; + + // coarse quantization (TODO: support tiling with search_precomputed) + std::vector Dq(n * nprobe); + std::vector Iq(n * nprobe); + + quantizer->search(n, x, nprobe, Dq.data(), Iq.data()); + + int64_t nshard = this->count(); + + std::vector all_distances(nshard * k * n); + std::vector all_labels(nshard * k * n); + std::vector translations(nshard, 0); + + if (successive_ids) { + translations[0] = 0; + for (int s = 0; s + 1 < nshard; s++) { + translations[s + 1] = translations[s] + this->at(s)->ntotal; + } + } + + auto fn = [&](int no, const Index* indexIn) { + if (indexIn->verbose) { + printf("begin query shard %d on %" PRId64 " points\n", no, n); + } + + auto index = dynamic_cast(indexIn); + + FAISS_THROW_IF_NOT_MSG(index->nprobe == nprobe, "inconsistent nprobe"); + + index->search_preassigned( + n, + x, + k, + Iq.data(), + Dq.data(), + all_distances.data() + no * k * n, + all_labels.data() + no * k * n, + false); + + translate_labels( + n * k, all_labels.data() + no * k * n, translations[no]); + + if (indexIn->verbose) { + printf("end query shard %d\n", no); + } + }; + + this->runOnIndex(fn); + + if (this->metric_type == METRIC_L2) { + merge_knn_results>( + n, + k, + nshard, + all_distances.data(), + all_labels.data(), + distances, + labels); + } else { + merge_knn_results>( + n, + k, + nshard, + all_distances.data(), + all_labels.data(), + distances, + labels); + } +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/IndexShardsIVF.h b/thirdparty/faiss/faiss/IndexShardsIVF.h new file mode 100644 index 000000000..8b17e22b8 --- /dev/null +++ b/thirdparty/faiss/faiss/IndexShardsIVF.h @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace faiss { + +/** + * IndexShards with a common coarse quantizer. All the indexes added should be + * IndexIVFInterface indexes so that the search_precomputed can be called. + */ +struct IndexShardsIVF : public IndexShards, Level1Quantizer { + explicit IndexShardsIVF( + Index* quantizer, + size_t nlist, + bool threaded = false, + bool successive_ids = true); + + void addIndex(Index* index) override; + + void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) + override; + + void train(idx_t n, const component_t* x) override; + + void search( + idx_t n, + const component_t* x, + idx_t k, + distance_t* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/MatrixStats.cpp b/thirdparty/faiss/faiss/MatrixStats.cpp index a864127bb..cc87e1797 100644 --- a/thirdparty/faiss/faiss/MatrixStats.cpp +++ b/thirdparty/faiss/faiss/MatrixStats.cpp @@ -9,9 +9,10 @@ #include -#include /* va_list, va_start, va_arg, va_end */ +#include /* va_list, va_start, va_arg, va_end */ #include +#include #include #include @@ -21,18 +22,6 @@ namespace faiss { * MatrixStats *********************************************************************/ -MatrixStats::PerDimStats::PerDimStats() - : n(0), - n_nan(0), - n_inf(0), - n0(0), - min(HUGE_VALF), - max(-HUGE_VALF), - sum(0), - sum2(0), - mean(NAN), - stddev(NAN) {} - void MatrixStats::PerDimStats::add(float x) { n++; if (std::isnan(x)) { @@ -74,19 +63,12 @@ void MatrixStats::do_comment(const char* fmt, ...) { buf += size; } -MatrixStats::MatrixStats(size_t n, size_t d, const float* x) - : n(n), - d(d), - n_collision(0), - n_valid(0), - n0(0), - min_norm2(HUGE_VAL), - max_norm2(0) { +MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) { std::vector comment_buf(10000); buf = comment_buf.data(); nbuf = comment_buf.size(); - do_comment("analyzing %ld vectors of size %ld\n", n, d); + do_comment("analyzing %zd vectors of size %zd\n", n, d); if (d > 1024) { do_comment( @@ -94,6 +76,9 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) "please consider dimensionality reducution (with PCAMatrix)\n"); } + hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x)); + do_comment("hash value 0x%016" PRIx64 "\n", hash_value); + size_t nbytes = sizeof(x[0]) * d; per_dim_stats.resize(d); @@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) if (n_collision > 0) { do_comment( - "%ld collisions in hash table, " + "%zd collisions in hash table, " "counts may be invalid\n", n_collision); } @@ -167,14 +152,14 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) max = it->second; } } - do_comment("vector %ld has %ld copies\n", max.first, max.count); + do_comment("vector %zd has %zd copies\n", max.first, max.count); } { // norm stats min_norm2 = sqrt(min_norm2); max_norm2 = sqrt(max_norm2); do_comment( - "range of L2 norms=[%g, %g] (%ld null vectors)\n", + "range of L2 norms=[%g, %g] (%zd null vectors)\n", min_norm2, max_norm2, n0); @@ -182,7 +167,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) if (max_norm2 < min_norm2 * 1.0001) { do_comment( "vectors are normalized, inner product and " - "L2 search are equivalent\n"); + "L2 search are equivalent\n"); } if (max_norm2 > min_norm2 * 100) { @@ -227,7 +212,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) do_comment("no constant dimensions\n"); } else { do_comment( - "%ld dimensions 
are constant: they can be removed\n", + "%zd dimensions are constant: they can be removed\n", n_0_range); } @@ -235,7 +220,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) do_comment("no dimension has a too large mean\n"); } else { do_comment( - "%ld dimensions are too large " + "%zd dimensions are too large " "wrt. their variance, may loose precision " "in IndexFlatL2 (use CenteringTransform)\n", n_dangerous_range); diff --git a/thirdparty/faiss/faiss/MatrixStats.h b/thirdparty/faiss/faiss/MatrixStats.h index 8d18d1008..45a7c97da 100644 --- a/thirdparty/faiss/faiss/MatrixStats.h +++ b/thirdparty/faiss/faiss/MatrixStats.h @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include #include @@ -26,20 +27,31 @@ struct MatrixStats { std::string comments; // raw statistics - size_t n, d; - size_t n_collision, n_valid, n0; - double min_norm2, max_norm2; + size_t n = 0, d = 0; + size_t n_collision = 0; + size_t n_valid = 0; + size_t n0 = 0; + double min_norm2 = HUGE_VALF; + double max_norm2 = 0; + uint64_t hash_value = 0; struct PerDimStats { - size_t n, n_nan, n_inf, n0; + /// counts of various special entries + size_t n = 0; + size_t n_nan = 0; + size_t n_inf = 0; + size_t n0 = 0; - float min, max; - double sum, sum2; + /// to get min/max and stddev values + float min = HUGE_VALF; + float max = -HUGE_VALF; + double sum = 0; + double sum2 = 0; - size_t n_valid; - double mean, stddev; + size_t n_valid = 0; + double mean = NAN; + double stddev = NAN; - PerDimStats(); void add(float x); void compute_mean_std(); }; diff --git a/thirdparty/faiss/faiss/MetaIndexes.cpp b/thirdparty/faiss/faiss/MetaIndexes.cpp index c12ffbe00..d3dd5b0fc 100644 --- a/thirdparty/faiss/faiss/MetaIndexes.cpp +++ b/thirdparty/faiss/faiss/MetaIndexes.cpp @@ -9,197 +9,21 @@ #include -#include #include +#include #include #include #include #include +#include #include #include +#include +#include namespace faiss { -namespace {} // namespace - -/***************************************************** - * IndexIDMap implementation - *******************************************************/ - -template -IndexIDMapTemplate::IndexIDMapTemplate(IndexT* index) - : index(index), own_fields(false) { - FAISS_THROW_IF_NOT_MSG(index->ntotal == 0, "index must be empty on input"); - this->is_trained = index->is_trained; - this->metric_type = index->metric_type; - this->verbose = index->verbose; - this->d = index->d; -} - -template -void IndexIDMapTemplate::add( - idx_t, - const typename IndexT::component_t*) { - FAISS_THROW_MSG( - "add does not make sense with IndexIDMap, " - "use add_with_ids"); -} - -template -void IndexIDMapTemplate::train( - idx_t n, - const typename IndexT::component_t* x) { - index->train(n, x); - this->is_trained = index->is_trained; -} - -template -void IndexIDMapTemplate::reset() { - index->reset(); - id_map.clear(); - this->ntotal = 0; -} - -template -void IndexIDMapTemplate::add_with_ids( - idx_t n, - const typename IndexT::component_t* x, - const typename IndexT::idx_t* xids) { - index->add(n, x); - for (idx_t i = 0; i < n; i++) - id_map.push_back(xids[i]); - this->ntotal = index->ntotal; -} - -template -void IndexIDMapTemplate::search( - idx_t n, - const typename IndexT::component_t* x, - idx_t k, - typename IndexT::distance_t* distances, - typename IndexT::idx_t* labels, - const BitsetView bitset) const { - index->search(n, x, k, distances, labels, bitset); - idx_t* li = labels; -#pragma omp parallel for - for (idx_t i = 0; i < n * k; i++) { - li[i] = li[i] < 0 ? 
li[i] : id_map[li[i]]; - } -} - -template -void IndexIDMapTemplate::range_search( - typename IndexT::idx_t n, - const typename IndexT::component_t* x, - float radius, - RangeSearchResult* result, - const BitsetView bitset) const { - index->range_search(n, x, radius, result, bitset); -#pragma omp parallel for - for (idx_t i = 0; i < result->lims[result->nq]; i++) { - result->labels[i] = result->labels[i] < 0 ? result->labels[i] - : id_map[result->labels[i]]; - } -} - -namespace { - -struct IDTranslatedSelector : IDSelector { - const std::vector& id_map; - const IDSelector& sel; - IDTranslatedSelector( - const std::vector& id_map, - const IDSelector& sel) - : id_map(id_map), sel(sel) {} - bool is_member(idx_t id) const override { - return sel.is_member(id_map[id]); - } -}; - -} // namespace - -template -size_t IndexIDMapTemplate::remove_ids(const IDSelector& sel) { - // remove in sub-index first - IDTranslatedSelector sel2(id_map, sel); - size_t nremove = index->remove_ids(sel2); - - int64_t j = 0; - for (idx_t i = 0; i < this->ntotal; i++) { - if (sel.is_member(id_map[i])) { - // remove - } else { - id_map[j] = id_map[i]; - j++; - } - } - FAISS_ASSERT(j == index->ntotal); - this->ntotal = j; - id_map.resize(this->ntotal); - return nremove; -} - -template -IndexIDMapTemplate::~IndexIDMapTemplate() { - if (own_fields) - delete index; -} - -/***************************************************** - * IndexIDMap2 implementation - *******************************************************/ - -template -IndexIDMap2Template::IndexIDMap2Template(IndexT* index) - : IndexIDMapTemplate(index) {} - -template -void IndexIDMap2Template::add_with_ids( - idx_t n, - const typename IndexT::component_t* x, - const typename IndexT::idx_t* xids) { - size_t prev_ntotal = this->ntotal; - IndexIDMapTemplate::add_with_ids(n, x, xids); - for (size_t i = prev_ntotal; i < this->ntotal; i++) { - rev_map[this->id_map[i]] = i; - } -} - -template -void IndexIDMap2Template::construct_rev_map() { - rev_map.clear(); - for (size_t i = 0; i < this->ntotal; i++) { - rev_map[this->id_map[i]] = i; - } -} - -template -size_t IndexIDMap2Template::remove_ids(const IDSelector& sel) { - // This is quite inefficient - size_t nremove = IndexIDMapTemplate::remove_ids(sel); - construct_rev_map(); - return nremove; -} - -template -void IndexIDMap2Template::reconstruct( - idx_t key, - typename IndexT::component_t* recons) const { - try { - this->index->reconstruct(rev_map.at(key), recons); - } catch (const std::out_of_range& e) { - FAISS_THROW_FMT("key %" PRId64 " not found", key); - } -} - -// explicit template instantiations - -template struct IndexIDMapTemplate; -template struct IndexIDMapTemplate; -template struct IndexIDMap2Template; -template struct IndexIDMap2Template; - /***************************************************** * IndexSplitVectors implementation *******************************************************/ @@ -238,7 +62,9 @@ void IndexSplitVectors::search( idx_t k, float* distances, idx_t* labels, - const BitsetView bitset) const { + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); FAISS_THROW_IF_NOT_MSG(k == 1, "search implemented only for k=1"); FAISS_THROW_IF_NOT_MSG( sum_d == d, "not enough indexes compared to # dimensions"); @@ -330,4 +156,88 @@ IndexSplitVectors::~IndexSplitVectors() { } } +/******************************************************** + * IndexRandom implementation + */ + +IndexRandom::IndexRandom( + idx_t d, + idx_t ntotal, + int64_t 
seed, + MetricType metric_type) + : Index(d, metric_type), seed(seed) { + this->ntotal = ntotal; + is_trained = true; +} + +void IndexRandom::add(idx_t n, const float*) { + ntotal += n; +} + +void IndexRandom::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + FAISS_THROW_IF_NOT_MSG( + !params, "search params not supported for this index"); + FAISS_THROW_IF_NOT(k <= ntotal); +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + RandomGenerator rng( + seed + ivec_checksum(d, (const int32_t*)(x + i * d))); + idx_t* I = labels + i * k; + float* D = distances + i * k; + // assumes k << ntotal + if (k < 100 * ntotal) { + std::unordered_set map; + for (int j = 0; j < k; j++) { + idx_t ii; + for (;;) { + // yes I know it's not strictly uniform... + ii = rng.rand_int64() % ntotal; + if (map.count(ii) == 0) { + break; + } + } + I[j] = ii; + map.insert(ii); + } + } else { + std::vector perm(ntotal); + for (idx_t j = 0; j < ntotal; j++) { + perm[j] = j; + } + for (int j = 0; j < k; j++) { + std::swap(perm[j], perm[rng.rand_int(ntotal)]); + I[j] = perm[j]; + } + } + float dprev = 0; + for (int j = 0; j < k; j++) { + float step = rng.rand_float(); + if (is_similarity_metric(metric_type)) { + step = -step; + } + dprev += step; + D[j] = dprev; + } + } +} + +void IndexRandom::reconstruct(idx_t key, float* recons) const { + RandomGenerator rng(seed + 123332 + key); + for (size_t i = 0; i < d; i++) { + recons[i] = rng.rand_float(); + } +} + +void IndexRandom::reset() { + ntotal = 0; +} + +IndexRandom::~IndexRandom() = default; + } // namespace faiss diff --git a/thirdparty/faiss/faiss/MetaIndexes.h b/thirdparty/faiss/faiss/MetaIndexes.h index 5c42ca2ea..d94809cd4 100644 --- a/thirdparty/faiss/faiss/MetaIndexes.h +++ b/thirdparty/faiss/faiss/MetaIndexes.h @@ -11,107 +11,55 @@ #define META_INDEXES_H #include +#include #include #include -#include #include namespace faiss { -/** Index that translates search results to ids */ -template -struct IndexIDMapTemplate : IndexT { - using idx_t = typename IndexT::idx_t; - using component_t = typename IndexT::component_t; - using distance_t = typename IndexT::distance_t; - - IndexT* index; ///! the sub-index - bool own_fields; ///! whether pointers are deleted in destructo - std::vector id_map; +/** splits input vectors in segments and assigns each segment to a sub-index + * used to distribute a MultiIndexQuantizer + */ +struct IndexSplitVectors : Index { + bool own_fields; + bool threaded; + std::vector sub_indexes; + idx_t sum_d; /// sum of dimensions seen so far - explicit IndexIDMapTemplate(IndexT* index); + explicit IndexSplitVectors(idx_t d, bool threaded = false); - /// @param xids if non-null, ids to store for the vectors (size n) - void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) - override; + void add_sub_index(Index*); + void sync_with_sub_indexes(); - /// this will fail. 
Use add_with_ids - void add(idx_t n, const component_t* x) override; + void add(idx_t n, const float* x) override; void search( idx_t n, - const component_t* x, + const float* x, idx_t k, - distance_t* distances, + float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; - void train(idx_t n, const component_t* x) override; + void train(idx_t n, const float* x) override; void reset() override; - /// remove ids adapted to IndexFlat - size_t remove_ids(const IDSelector& sel) override; - - void range_search( - idx_t n, - const component_t* x, - float radius, - RangeSearchResult* result, - const BitsetView bitset = nullptr) const override; - - ~IndexIDMapTemplate() override; - IndexIDMapTemplate() { - own_fields = false; - index = nullptr; - } -}; - -using IndexIDMap = IndexIDMapTemplate; -using IndexBinaryIDMap = IndexIDMapTemplate; - -/** same as IndexIDMap but also provides an efficient reconstruction - * implementation via a 2-way index */ -template -struct IndexIDMap2Template : IndexIDMapTemplate { - using idx_t = typename IndexT::idx_t; - using component_t = typename IndexT::component_t; - using distance_t = typename IndexT::distance_t; - - std::unordered_map rev_map; - - explicit IndexIDMap2Template(IndexT* index); - - /// make the rev_map from scratch - void construct_rev_map(); - - void add_with_ids(idx_t n, const component_t* x, const idx_t* xids) - override; - - size_t remove_ids(const IDSelector& sel) override; - - void reconstruct(idx_t key, component_t* recons) const override; - - ~IndexIDMap2Template() override {} - IndexIDMap2Template() {} + ~IndexSplitVectors() override; }; -using IndexIDMap2 = IndexIDMap2Template; -using IndexBinaryIDMap2 = IndexIDMap2Template; - -/** splits input vectors in segments and assigns each segment to a sub-index - * used to distribute a MultiIndexQuantizer +/** index that returns random results. + * used mainly for time benchmarks */ -struct IndexSplitVectors : Index { - bool own_fields; - bool threaded; - std::vector sub_indexes; - idx_t sum_d; /// sum of dimensions seen so far - - explicit IndexSplitVectors(idx_t d, bool threaded = false); +struct IndexRandom : Index { + int64_t seed; - void add_sub_index(Index*); - void sync_with_sub_indexes(); + explicit IndexRandom( + idx_t d, + idx_t ntotal = 0, + int64_t seed = 1234, + MetricType mt = METRIC_L2); void add(idx_t n, const float* x) override; @@ -121,13 +69,13 @@ struct IndexSplitVectors : Index { idx_t k, float* distances, idx_t* labels, - const BitsetView bitset = nullptr) const override; + const SearchParameters* params = nullptr) const override; - void train(idx_t n, const float* x) override; + void reconstruct(idx_t key, float* recons) const override; void reset() override; - ~IndexSplitVectors() override; + ~IndexRandom() override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/MetricType.h b/thirdparty/faiss/faiss/MetricType.h index 1f89afaa3..6904fa203 100644 --- a/thirdparty/faiss/faiss/MetricType.h +++ b/thirdparty/faiss/faiss/MetricType.h @@ -10,6 +10,8 @@ #ifndef FAISS_METRIC_TYPE_H #define FAISS_METRIC_TYPE_H +#include + namespace faiss { /// The metric space for vector comparison for Faiss indices and algorithms. 
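As an aside, a minimal sketch of how the IndexRandom declared in the MetaIndexes.h hunk above can serve as a cheap stand-in for timing experiments. The helper name and all sizes are made up; the constructor and search signatures are the ones shown above.

    #include <faiss/MetaIndexes.h>
    #include <vector>

    // pretends to hold nb vectors; results are pseudo-random but deterministic
    // for a given seed and query, with non-decreasing L2 distances
    void index_random_sketch(const float* xq) {
        const faiss::idx_t d = 32, nb = 100000, nq = 1, k = 5;

        faiss::IndexRandom index(d, nb, /*seed=*/1234);

        std::vector<float> distances(nq * k);
        std::vector<faiss::idx_t> labels(nq * k);
        index.search(nq, xq, k, distances.data(), labels.data());
    }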
@@ -25,7 +27,10 @@ enum MetricType { METRIC_Lp = 4, ///< L_p distance, p is given by a faiss::Index /// metric_arg - METRIC_Jaccard = 5, + // Note: Faiss 1.7.4 defines METRIC_Jaccard=23, + // but Knowhere defines one as 5 + METRIC_Jaccard = 5, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) + ///< where a_i, b_i > 0 METRIC_Hamming = 7, METRIC_Substructure = 8, ///< Tversky case alpha = 0, beta = 1 METRIC_Superstructure = 9, ///< Tversky case alpha = 1, beta = 0 @@ -36,6 +41,16 @@ enum MetricType { METRIC_JensenShannon = 22, }; +/// all vector indices are this type +using idx_t = int64_t; + +/// this function is used to distinguish between min and max indexes since +/// we need to support similarity and dis-similarity metrics in a flexible way +constexpr bool is_similarity_metric(MetricType metric_type) { + return ((metric_type == METRIC_INNER_PRODUCT) || + (metric_type == METRIC_Jaccard)); +} + } // namespace faiss #endif diff --git a/thirdparty/faiss/faiss/VectorTransform.cpp b/thirdparty/faiss/faiss/VectorTransform.cpp index 906fcd346..77712935d 100644 --- a/thirdparty/faiss/faiss/VectorTransform.cpp +++ b/thirdparty/faiss/faiss/VectorTransform.cpp @@ -136,7 +136,7 @@ int dgesvd_( * VectorTransform *********************************************/ -float* VectorTransform::apply(Index::idx_t n, const float* x) const { +float* VectorTransform::apply(idx_t n, const float* x) const { float* xt = new float[n * d_out]; apply_noalloc(n, x, xt); return xt; @@ -150,6 +150,10 @@ void VectorTransform::reverse_transform(idx_t, const float*, float*) const { FAISS_THROW_MSG("reverse transform not implemented"); } +void VectorTransform::check_identical(const VectorTransform& other) const { + FAISS_THROW_IF_NOT(other.d_in == d_in && other.d_in == d_in); +} + /********************************************* * LinearTransform *********************************************/ @@ -163,8 +167,7 @@ LinearTransform::LinearTransform(int d_in, int d_out, bool have_bias) is_trained = false; // will be trained when A and b are initialized } -void LinearTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt) - const { +void LinearTransform::apply_noalloc(idx_t n, const float* x, float* xt) const { FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet"); float c_factor; @@ -309,6 +312,13 @@ void LinearTransform::print_if_verbose( printf("]\n"); } +void LinearTransform::check_identical(const VectorTransform& other_in) const { + VectorTransform::check_identical(other_in); + auto other = dynamic_cast(&other_in); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->A == A && other->b == b); +} + /********************************************* * RandomRotationMatrix *********************************************/ @@ -338,7 +348,7 @@ void RandomRotationMatrix::init(int seed) { is_trained = true; } -void RandomRotationMatrix::train(Index::idx_t /*n*/, const float* /*x*/) { +void RandomRotationMatrix::train(idx_t /*n*/, const float* /*x*/) { // initialize with some arbitrary seed init(12345); } @@ -432,13 +442,10 @@ void eig(size_t d_in, double* cov, double* eigenvalues, int verbose) { } // namespace -void PCAMatrix::train(Index::idx_t n, const float* x) { - const float* x_in = x; - - x = fvecs_maybe_subsample( - d_in, (size_t*)&n, max_points_per_d * d_in, x, verbose); - - ScopeDeleter del_x(x != x_in ? 
x : nullptr); +void PCAMatrix::train(idx_t n, const float* x_in) { + const float* x = fvecs_maybe_subsample( + d_in, (size_t*)&n, max_points_per_d * d_in, x_in, verbose); + TransformedVectors tv(x_in, x); // compute mean mean.clear(); @@ -723,7 +730,7 @@ ITQMatrix::ITQMatrix(int d) : LinearTransform(d, d, false), max_iter(50), seed(123) {} /** translated from fbcode/deeplearning/catalyzer/catalyzer/quantizers.py */ -void ITQMatrix::train(Index::idx_t n, const float* xf) { +void ITQMatrix::train(idx_t n, const float* xf) { size_t d = d_in; std::vector rotation(d * d); @@ -875,14 +882,13 @@ ITQTransform::ITQTransform(int d_in, int d_out, bool do_pca) is_trained = false; } -void ITQTransform::train(idx_t n, const float* x) { +void ITQTransform::train(idx_t n, const float* x_in) { FAISS_THROW_IF_NOT(!is_trained); - const float* x_in = x; size_t max_train_points = std::max(d_in * max_train_per_dim, 32768); - x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x); - - ScopeDeleter del_x(x != x_in ? x : nullptr); + const float* x = + fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x_in); + TransformedVectors tv(x_in, x); std::unique_ptr x_norm(new float[n * d_in]); { // normalize @@ -947,8 +953,7 @@ void ITQTransform::train(idx_t n, const float* x) { is_trained = true; } -void ITQTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt) - const { +void ITQTransform::apply_noalloc(idx_t n, const float* x, float* xt) const { FAISS_THROW_IF_NOT_MSG(is_trained, "Transformation not trained yet"); std::unique_ptr x_norm(new float[n * d_in]); @@ -967,30 +972,29 @@ void ITQTransform::apply_noalloc(Index::idx_t n, const float* x, float* xt) pca_then_itq.apply_noalloc(n, x_norm.get(), xt); } +void ITQTransform::check_identical(const VectorTransform& other_in) const { + VectorTransform::check_identical(other_in); + auto other = dynamic_cast(&other_in); + FAISS_THROW_IF_NOT(other); + pca_then_itq.check_identical(other->pca_then_itq); + FAISS_THROW_IF_NOT(other->mean == mean); +} + /********************************************* * OPQMatrix *********************************************/ OPQMatrix::OPQMatrix(int d, int M, int d2) - : LinearTransform(d, d2 == -1 ? d : d2, false), - M(M), - niter(50), - niter_pq(4), - niter_pq_0(40), - verbose(false), - pq(nullptr) { + : LinearTransform(d, d2 == -1 ? d : d2, false), M(M) { is_trained = false; // OPQ is quite expensive to train, so set this right. max_train_points = 256 * 256; - pq = nullptr; } -void OPQMatrix::train(Index::idx_t n, const float* x) { - const float* x_in = x; - - x = fvecs_maybe_subsample(d_in, (size_t*)&n, max_train_points, x, verbose); - - ScopeDeleter del_x(x != x_in ? x : nullptr); +void OPQMatrix::train(idx_t n, const float* x_in) { + const float* x = fvecs_maybe_subsample( + d_in, (size_t*)&n, max_train_points, x_in, verbose); + TransformedVectors tv(x_in, x); // To support d_out > d_in, we pad input vectors with 0s to d_out size_t d = d_out <= d_in ? 
d_in : d_out; @@ -1227,6 +1231,14 @@ void NormalizationTransform::reverse_transform( memcpy(x, xt, sizeof(xt[0]) * n * d_in); } +void NormalizationTransform::check_identical( + const VectorTransform& other_in) const { + VectorTransform::check_identical(other_in); + auto other = dynamic_cast(&other_in); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->norm == norm); +} + /********************************************* * CenteringTransform *********************************************/ @@ -1235,7 +1247,7 @@ CenteringTransform::CenteringTransform(int d) : VectorTransform(d, d) { is_trained = false; } -void CenteringTransform::train(Index::idx_t n, const float* x) { +void CenteringTransform::train(idx_t n, const float* x) { FAISS_THROW_IF_NOT_MSG(n > 0, "need at least one training vector"); mean.resize(d_in, 0); for (idx_t i = 0; i < n; i++) { @@ -1272,6 +1284,14 @@ void CenteringTransform::reverse_transform(idx_t n, const float* xt, float* x) } } +void CenteringTransform::check_identical( + const VectorTransform& other_in) const { + VectorTransform::check_identical(other_in); + auto other = dynamic_cast(&other_in); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->mean == mean); +} + /********************************************* * RemapDimensionsTransform *********************************************/ @@ -1336,3 +1356,11 @@ void RemapDimensionsTransform::reverse_transform( xt += d_out; } } + +void RemapDimensionsTransform::check_identical( + const VectorTransform& other_in) const { + VectorTransform::check_identical(other_in); + auto other = dynamic_cast(&other_in); + FAISS_THROW_IF_NOT(other); + FAISS_THROW_IF_NOT(other->map == map); +} diff --git a/thirdparty/faiss/faiss/VectorTransform.h b/thirdparty/faiss/faiss/VectorTransform.h index de61d329a..55e46e81d 100644 --- a/thirdparty/faiss/faiss/VectorTransform.h +++ b/thirdparty/faiss/faiss/VectorTransform.h @@ -23,8 +23,6 @@ namespace faiss { /** Any transformation applied on a set of vectors */ struct VectorTransform { - typedef Index::idx_t idx_t; - int d_in; ///! input dimension int d_out; ///! output dimension @@ -43,19 +41,27 @@ struct VectorTransform { */ virtual void train(idx_t n, const float* x); - /** apply the random rotation, return new allocated matrix - * @param x size n * d_in - * @return size n * d_out + /** apply the transformation and return the result in an allocated pointer + * @param n number of vectors to transform + * @param x input vectors, size n * d_in + * @return output vectors, size n * d_out */ float* apply(idx_t n, const float* x) const; - /// same as apply, but result is pre-allocated + /** apply the transformation and return the result in a provided matrix + * @param n number of vectors to transform + * @param x input vectors, size n * d_in + * @param xt output vectors, size n * d_out + */ virtual void apply_noalloc(idx_t n, const float* x, float* xt) const = 0; /// reverse transformation. 
May not be implemented or may return /// approximate result virtual void reverse_transform(idx_t n, const float* xt, float* x) const; + // check that the two transforms are identical (to merge indexes) + virtual void check_identical(const VectorTransform& other) const = 0; + virtual ~VectorTransform() {} }; @@ -100,6 +106,8 @@ struct LinearTransform : VectorTransform { int n, int d) const; + void check_identical(const VectorTransform& other) const override; + ~LinearTransform() override {} }; @@ -112,7 +120,7 @@ struct RandomRotationMatrix : LinearTransform { /// must be called before the transform is used void init(int seed); - // intializes with an arbitrary seed + // initializes with an arbitrary seed void train(idx_t n, const float* x) override; RandomRotationMatrix() {} @@ -207,6 +215,8 @@ struct ITQTransform : VectorTransform { void train(idx_t n, const float* x) override; void apply_noalloc(idx_t n, const float* x, float* xt) const override; + + void check_identical(const VectorTransform& other) const override; }; struct ProductQuantizer; @@ -220,18 +230,18 @@ struct ProductQuantizer; * */ struct OPQMatrix : LinearTransform { - int M; ///< nb of subquantizers - int niter; ///< Number of outer training iterations - int niter_pq; ///< Number of training iterations for the PQ - int niter_pq_0; ///< same, for the first outer iteration + int M; ///< nb of subquantizers + int niter = 50; ///< Number of outer training iterations + int niter_pq = 4; ///< Number of training iterations for the PQ + int niter_pq_0 = 40; ///< same, for the first outer iteration /// if there are too many training points, resample - size_t max_train_points; - bool verbose; + size_t max_train_points = 256 * 256; + bool verbose = false; /// if non-NULL, use this product quantizer for training /// should be constructed with (d_out, M, _) - ProductQuantizer* pq; + ProductQuantizer* pq = nullptr; /// if d2 != -1, output vectors of this dimension explicit OPQMatrix(int d = 0, int M = 1, int d2 = -1); @@ -260,6 +270,8 @@ struct RemapDimensionsTransform : VectorTransform { void reverse_transform(idx_t n, const float* xt, float* x) const override; RemapDimensionsTransform() {} + + void check_identical(const VectorTransform& other) const override; }; /** per-vector normalization */ @@ -273,6 +285,8 @@ struct NormalizationTransform : VectorTransform { /// Identity transform since norm is not revertible void reverse_transform(idx_t n, const float* xt, float* x) const override; + + void check_identical(const VectorTransform& other) const override; }; /** Subtract the mean of each component from the vectors. 
*/ @@ -290,6 +304,8 @@ struct CenteringTransform : VectorTransform { /// add the mean void reverse_transform(idx_t n, const float* xt, float* x) const override; + + void check_identical(const VectorTransform& other) const override; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/clone_index.cpp b/thirdparty/faiss/faiss/clone_index.cpp index d7bf24b80..44ab1f7cc 100644 --- a/thirdparty/faiss/faiss/clone_index.cpp +++ b/thirdparty/faiss/faiss/clone_index.cpp @@ -16,22 +16,39 @@ #include #include +#include +#include +#include #include #include #include +#include #include #include +#include #include #include #include #include #include #include +#include #include +#include +#include #include + #include #include +#include +#include +#include +#include +#include + +#include + namespace faiss { /************************************************************* @@ -66,39 +83,220 @@ VectorTransform* Cloner::clone_VectorTransform(const VectorTransform* vt) { IndexIVF* Cloner::clone_IndexIVF(const IndexIVF* ivf) { TRYCLONE(IndexIVFPQR, ivf) TRYCLONE(IndexIVFPQ, ivf) + + TRYCLONE(IndexIVFLocalSearchQuantizer, ivf) + TRYCLONE(IndexIVFProductLocalSearchQuantizer, ivf) + TRYCLONE(IndexIVFProductResidualQuantizer, ivf) + TRYCLONE(IndexIVFResidualQuantizer, ivf) + + TRYCLONE(IndexIVFLocalSearchQuantizerFastScan, ivf) + TRYCLONE(IndexIVFProductLocalSearchQuantizerFastScan, ivf) + TRYCLONE(IndexIVFProductResidualQuantizerFastScan, ivf) + TRYCLONE(IndexIVFResidualQuantizerFastScan, ivf) + TRYCLONE(IndexIVFPQFastScan, ivf) + + TRYCLONE(IndexIVFFlatDedup, ivf) TRYCLONE(IndexIVFFlat, ivf) + + TRYCLONE(IndexIVFSpectralHash, ivf) + TRYCLONE(IndexIVFScalarQuantizer, ivf) { FAISS_THROW_MSG("clone not supported for this type of IndexIVF"); } return nullptr; } +IndexRefine* clone_IndexRefine(const IndexRefine* ir) { + TRYCLONE(IndexRefineFlat, ir) + TRYCLONE(IndexRefine, ir) { + FAISS_THROW_MSG("clone not supported for this type of IndexRefine"); + } +} + +IndexIDMap* clone_IndexIDMap(const IndexIDMap* im) { + TRYCLONE(IndexIDMap2, im) + TRYCLONE(IndexIDMap, im) { + FAISS_THROW_MSG("clone not supported for this type of IndexIDMap"); + } +} + +IndexHNSW* clone_IndexHNSW(const IndexHNSW* ihnsw) { + TRYCLONE(IndexHNSW2Level, ihnsw) + TRYCLONE(IndexHNSWFlat, ihnsw) + TRYCLONE(IndexHNSWPQ, ihnsw) + TRYCLONE(IndexHNSWSQ, ihnsw) + TRYCLONE(IndexHNSW, ihnsw) { + FAISS_THROW_MSG("clone not supported for this type of IndexHNSW"); + } +} + +IndexNNDescent* clone_IndexNNDescent(const IndexNNDescent* innd) { + TRYCLONE(IndexNNDescentFlat, innd) + TRYCLONE(IndexNNDescent, innd) { + FAISS_THROW_MSG("clone not supported for this type of IndexNNDescent"); + } +} + +IndexNSG* clone_IndexNSG(const IndexNSG* insg) { + TRYCLONE(IndexNSGFlat, insg) + TRYCLONE(IndexNSGPQ, insg) + TRYCLONE(IndexNSGSQ, insg) + TRYCLONE(IndexNSG, insg) { + FAISS_THROW_MSG("clone not supported for this type of IndexNNDescent"); + } +} + +IndexRowwiseMinMaxBase* clone_IndexRowwiseMinMax( + const IndexRowwiseMinMaxBase* irmmb) { + TRYCLONE(IndexRowwiseMinMaxFP16, irmmb) + TRYCLONE(IndexRowwiseMinMax, irmmb) { + FAISS_THROW_MSG( + "clone not supported for this type of IndexRowwiseMinMax"); + } +} + +#define TRYCAST(classname) classname* res = dynamic_cast(index) + +void reset_AdditiveQuantizerIndex(Index* index) { + auto clone_ProductQuantizers = + [](std::vector& quantizers) { + for (auto& q : quantizers) { + q = dynamic_cast(clone_Quantizer(q)); + } + }; + if (TRYCAST(IndexIVFLocalSearchQuantizerFastScan)) { + res->aq = &res->lsq; + } else if 
(TRYCAST(IndexIVFResidualQuantizerFastScan)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexIVFProductLocalSearchQuantizerFastScan)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexIVFProductResidualQuantizerFastScan)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(IndexIVFLocalSearchQuantizer)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexIVFResidualQuantizer)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexIVFProductLocalSearchQuantizer)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexIVFProductResidualQuantizer)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(IndexLocalSearchQuantizerFastScan)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexResidualQuantizerFastScan)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexProductLocalSearchQuantizerFastScan)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexProductResidualQuantizerFastScan)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(IndexLocalSearchQuantizer)) { + res->aq = &res->lsq; + } else if (TRYCAST(IndexResidualQuantizer)) { + res->aq = &res->rq; + } else if (TRYCAST(IndexProductLocalSearchQuantizer)) { + res->aq = &res->plsq; + clone_ProductQuantizers(res->plsq.quantizers); + } else if (TRYCAST(IndexProductResidualQuantizer)) { + res->aq = &res->prq; + clone_ProductQuantizers(res->prq.quantizers); + } else if (TRYCAST(LocalSearchCoarseQuantizer)) { + res->aq = &res->lsq; + } else if (TRYCAST(ResidualCoarseQuantizer)) { + res->aq = &res->rq; + } else { + FAISS_THROW_MSG( + "clone not supported for this type of additive quantizer index"); + } +} + +Index* clone_AdditiveQuantizerIndex(const Index* index) { + // IndexAdditiveQuantizer + TRYCLONE(IndexResidualQuantizer, index) + TRYCLONE(IndexProductResidualQuantizer, index) + TRYCLONE(IndexLocalSearchQuantizer, index) + TRYCLONE(IndexProductLocalSearchQuantizer, index) + + // IndexFastScan + TRYCLONE(IndexResidualQuantizerFastScan, index) + TRYCLONE(IndexLocalSearchQuantizerFastScan, index) + TRYCLONE(IndexProductResidualQuantizerFastScan, index) + TRYCLONE(IndexProductLocalSearchQuantizerFastScan, index) + + // AdditiveCoarseQuantizer + TRYCLONE(ResidualCoarseQuantizer, index) + TRYCLONE(LocalSearchCoarseQuantizer, index) { + FAISS_THROW_MSG( + "clone not supported for this type of additive quantizer index"); + } +} + +namespace { + +IndexHNSW* clone_HNSW(const IndexHNSW* ihnsw) { + TRYCLONE(IndexHNSWFlat, ihnsw) + TRYCLONE(IndexHNSWPQ, ihnsw) + TRYCLONE(IndexHNSWSQ, ihnsw) + return new IndexHNSW(*ihnsw); +} + +InvertedLists* clone_InvertedLists(const InvertedLists* invlists) { + if (auto* ails = dynamic_cast(invlists)) { + return new ArrayInvertedLists(*ails); + } + if (auto* bils = dynamic_cast(invlists)) { + auto* bils2 = new BlockInvertedLists(*bils); + if (bils->packer) { + auto* packerPQ4 = dynamic_cast(bils->packer); + FAISS_THROW_IF_NOT(packerPQ4); + bils2->packer = new CodePackerPQ4(*packerPQ4); + } + return bils2; + } + FAISS_THROW_FMT( + "clone not supported for this type of inverted lists %s", + typeid(*invlists).name()); +} + +} // anonymous namespace + Index* Cloner::clone_Index(const Index* index) { TRYCLONE(IndexPQ, index) TRYCLONE(IndexLSH, index) + + // IndexFlat + TRYCLONE(IndexFlat1D, index) TRYCLONE(IndexFlatL2, index) TRYCLONE(IndexFlatIP, index) 
TRYCLONE(IndexFlat, index) + TRYCLONE(IndexLattice, index) - TRYCLONE(IndexResidualQuantizer, index) + TRYCLONE(IndexRandom, index) + TRYCLONE(IndexPQFastScan, index) + TRYCLONE(IndexScalarQuantizer, index) TRYCLONE(MultiIndexQuantizer, index) - TRYCLONE(ResidualCoarseQuantizer, index) + if (const IndexIVF* ivf = dynamic_cast(index)) { IndexIVF* res = clone_IndexIVF(ivf); if (ivf->invlists == nullptr) { res->invlists = nullptr; - } else if ( - auto* ails = dynamic_cast( - ivf->invlists)) { - res->invlists = new ArrayInvertedLists(*ails); - res->own_invlists = true; } else { - FAISS_THROW_MSG( - "clone not supported for this type of inverted lists"); + res->invlists = clone_InvertedLists(ivf->invlists); + res->own_invlists = true; } + res->own_fields = true; res->quantizer = clone_Index(ivf->quantizer); + + if (dynamic_cast(res) || + dynamic_cast(res)) { + reset_AdditiveQuantizerIndex(res); + } return res; } else if ( const IndexPreTransform* ipt = @@ -117,17 +315,18 @@ Index* Cloner::clone_Index(const Index* index) { return res; } else if ( const IndexIDMap* idmap = dynamic_cast(index)) { - IndexIDMap* res = new IndexIDMap(*idmap); + IndexIDMap* res = clone_IndexIDMap(idmap); res->own_fields = true; res->index = clone_Index(idmap->index); return res; } else if (const IndexHNSW* ihnsw = dynamic_cast(index)) { - IndexHNSW* res = new IndexHNSW(*ihnsw); + IndexHNSW* res = clone_IndexHNSW(ihnsw); res->own_fields = true; - res->storage = clone_Index(ihnsw->storage); + // make sure we don't get a GPU index here + res->storage = Cloner::clone_Index(ihnsw->storage); return res; } else if (const IndexNSG* insg = dynamic_cast(index)) { - IndexNSG* res = new IndexNSG(*insg); + IndexNSG* res = clone_IndexNSG(insg); // copy the dynamic allocated graph auto& new_graph = res->nsg.final_graph; @@ -137,16 +336,64 @@ Index* Cloner::clone_Index(const Index* index) { res->own_fields = true; res->storage = clone_Index(insg->storage); return res; + } else if ( + const IndexNNDescent* innd = + dynamic_cast(index)) { + IndexNNDescent* res = clone_IndexNNDescent(innd); + res->own_fields = true; + res->storage = clone_Index(innd->storage); + return res; } else if ( const Index2Layer* i2l = dynamic_cast(index)) { Index2Layer* res = new Index2Layer(*i2l); res->q1.own_fields = true; res->q1.quantizer = clone_Index(i2l->q1.quantizer); return res; + } else if ( + const IndexRefine* ir = dynamic_cast(index)) { + IndexRefine* res = clone_IndexRefine(ir); + res->own_fields = true; + res->base_index = clone_Index(ir->base_index); + if (ir->refine_index != nullptr) { + res->own_refine_index = true; + res->refine_index = clone_Index(ir->refine_index); + } + return res; + } else if ( + const IndexRowwiseMinMaxBase* irmmb = + dynamic_cast(index)) { + IndexRowwiseMinMaxBase* res = clone_IndexRowwiseMinMax(irmmb); + res->own_fields = true; + res->index = clone_Index(irmmb->index); + } else if ( + dynamic_cast(index) || + dynamic_cast(index) || + dynamic_cast(index)) { + Index* res = clone_AdditiveQuantizerIndex(index); + reset_AdditiveQuantizerIndex(res); + return res; } else { - FAISS_THROW_MSG("clone not supported for this type of Index"); + FAISS_THROW_FMT( + "clone not supported for this Index type %s", + typeid(*index).name()); } return nullptr; +} // namespace + +Quantizer* clone_Quantizer(const Quantizer* quant) { + TRYCLONE(ResidualQuantizer, quant) + TRYCLONE(LocalSearchQuantizer, quant) + TRYCLONE(ProductQuantizer, quant) + TRYCLONE(ScalarQuantizer, quant) + FAISS_THROW_MSG("Did not recognize quantizer to clone"); +} + 
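A minimal usage sketch (not part of the patch) of the extended cloning path added above: clone_index() forwards to Cloner::clone_Index(), which with this change deep-copies the inverted lists via clone_InvertedLists() and, for additive-quantizer indexes, re-points the internal aq field via reset_AdditiveQuantizerIndex(). The helper name, factory string, and data sizes below are illustrative assumptions only.

#include <memory>

#include <faiss/Index.h>
#include <faiss/clone_index.h>
#include <faiss/index_factory.h>

// Hypothetical helper: builds a small IVF index, deep-copies it, and lets
// both objects be searched and destroyed independently.
void clone_roundtrip_sketch(const float* xb, faiss::idx_t nb, int d) {
    std::unique_ptr<faiss::Index> original(
            faiss::index_factory(d, "IVF256,Flat"));
    original->train(nb, xb);
    original->add(nb, xb);

    // The clone owns its own coarse quantizer and inverted lists
    // (own_fields / own_invlists are set by clone_Index).
    std::unique_ptr<faiss::Index> copy(faiss::clone_index(original.get()));
}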
+IndexBinary* clone_binary_index(const IndexBinary* index) { + if (auto ii = dynamic_cast(index)) { + return new IndexBinaryFlat(*ii); + } else { + FAISS_THROW_MSG("cannot clone this type of index"); + } } } // namespace faiss diff --git a/thirdparty/faiss/faiss/clone_index.h b/thirdparty/faiss/faiss/clone_index.h index 982cbbb55..a251b847d 100644 --- a/thirdparty/faiss/faiss/clone_index.h +++ b/thirdparty/faiss/faiss/clone_index.h @@ -13,9 +13,15 @@ namespace faiss { +// todo aguzhva: get rid of this file by adding Index* Index::clone() function. +// same for quantizers. + struct Index; struct IndexIVF; struct VectorTransform; +struct Quantizer; +struct IndexBinary; + namespace gpu { struct GpuIndexFlat; @@ -34,4 +40,8 @@ struct Cloner { virtual ~Cloner() {} }; +Quantizer* clone_Quantizer(const Quantizer* quant); + +IndexBinary* clone_binary_index(const IndexBinary* index); + } // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/SaDecodeKernels.h b/thirdparty/faiss/faiss/cppcontrib/SaDecodeKernels.h new file mode 100644 index 000000000..ae333e1f1 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/SaDecodeKernels.h @@ -0,0 +1,322 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// This file contains a custom fast implementation of faiss::Index::sa_decode() +// function for the following index families: +// * IVF256,PQ[1]x8np +// * Residual[1]x8,PQ[2]x8 +// * IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// * Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8) +// * PQ[1]x8 +// Additionally, AVX2 and ARM versions support +// * Residual[1]x8,PQ[2]x10 +// * Residual[1]x8,PQ[2]x12 +// * Residual[1]x8,PQ[2]x16 +// * Residual[1]x10,PQ[2]x10 +// * Residual[1]x10,PQ[2]x12 +// * Residual[1]x10,PQ[2]x16 +// * Residual[1]x12,PQ[2]x10 +// * Residual[1]x12,PQ[2]x12 +// * Residual[1]x12,PQ[2]x16 +// * Residual[1]x16,PQ[2]x10 +// * Residual[1]x16,PQ[2]x12 +// * Residual[1]x16,PQ[2]x16 +// * Residual1x[9-16 bit],PQ[1]x10 (such as Residual1x9,PQ16x10) +// * * (use with COARSE_BITS=16) +// * Residual1x[9-16 bit],PQ[1]x12 (such as Residual1x9,PQ16x12) +// * * (use with COARSE_BITS=16) +// * Residual1x[9-16 bit],PQ[1]x16 (such as Residual1x9,PQ16x16) +// * * (use with COARSE_BITS=16) +// * PQ[1]x10 +// * PQ[1]x12 +// * PQ[1]x16 +// * IVF256,PQ[1]x10 (such as IVF256,PQ16x10np) +// * IVF256,PQ[1]x12 (such as IVF256,PQ16x12np) +// * IVF256,PQ[1]x16 (such as IVF256,PQ16x16np) +// * IVF[2^9-2^16 bit],PQ[1]x10 (such as IVF1024,PQ16x10np) +// * IVF[2^9-2^16 bit],PQ[1]x12 (such as IVF1024,PQ16x12np) +// * IVF[2^9-2^16 bit],PQ[1]x16 (such as IVF1024,PQ16x16np) +// +// The goal was to achieve the maximum performance, so the template version it +// is. The provided index families share the same code for sa_decode. +// +// The front-end code provides two high-level structures. +// +// First one: +// { +// template < +// intptr_t DIM, +// intptr_t COARSE_SIZE, +// intptr_t FINE_SIZE, +// intptr_t COARSE_BITS = 8 +// intptr_t FINE_BITS = 8> +// struct Index2LevelDecoder { /*...*/ }; +// } +// * DIM is the dimensionality of data +// * COARSE_SIZE is the dimensionality of the coarse quantizer (IVF, Residual) +// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq +// * COARSE_BITS is the number of bits that are needed to represent a coarse +// quantizer code. 
+// * FINE_BITS is the number of bits that are needed to represent a fine +// quantizer code. +// For example, "IVF256,PQ8np" for 160-dim data translates into +// Index2LevelDecoder<160,160,20,8> +// For example, "Residual4x8,PQ16" for 256-dim data translates into +// Index2LevelDecoder<256,64,1,8> +// For example, "IVF1024,PQ16np" for 256-dim data translates into +// Index2LevelDecoder<256,256,16,10>. But as there are only 1 coarse code +// element, Index2LevelDecoder<256,256,16,16> can be used as a faster +// decoder. +// For example, "Residual4x10,PQ16x10np" for 256-dim data translates into +// Index2LevelDecoder<256,64,16,10,10> +// For example, "IVF1024,PQ16x10np" for 256-dim data translates into +// Index2LevelDecoder<256,256,16,10,10>. But as there are only 1 coarse code +// element, Index2LevelDecoder<256,256,16,16,10> can be used as a faster +// decoder. +// +// Additional supported values for COARSE_BITS and FINE_BITS may be added later. +// +// Second one: +// { +// template < +// intptr_t DIM, +// intptr_t FINE_SIZE, +// intptr_t FINE_BITS = 8> +// struct IndexPQDecoder { /*...*/ }; +// } +// * DIM is the dimensionality of data +// * FINE_SIZE is the dimensionality of the ProductQuantizer dsq +// * FINE_BITS is the number of bits that are needed to represent a fine +// quantizer code. +// For example, "PQ8np" for 160-dim data translates into +// IndexPQDecoder<160,20> +// +// Unlike the general purpose version in faiss::Index::sa_decode(), +// this version provides the following functions (please note that +// pqCoarseCentroids params are not available for IndexPQDecoder, +// but the functionality is the same as for Index2LevelDecoder): +// +// * ::store(), which is similar to sa_decode(1, input, output), +// The method signature is the following: +// { +// void store( +// const float* const __restrict pqCoarseCentroids, +// const float* const __restrict pqFineCentroids, +// const uint8_t* const __restrict code, +// float* const __restrict outputStore); +// } +// +// * ::accum(), which is used to create a linear combination +// of decoded vectors: +// { +// const faiss::Index* const index; +// const uint8_t* const input; +// float weight; +// +// std::vector buffer(d, 0); +// +// index->sa_decode(1, input, buffer.data()); +// for (size_t iDim = 0; iDim < d; iDim++) +// output[iDim] += weight * buffer[iDim]; +// } +// The method signature is the following: +// { +// static void accum( +// const float* const __restrict pqCoarseCentroids, +// const float* const __restrict pqFineCentroids, +// const uint8_t* const __restrict code, +// const float weight, +// float* const __restrict outputAccum); +// } +// +// * There is an additional overload for ::accum() that decodes two vectors +// per call. 
This provides an additional speedup because of a CPU +// superscalar architecture: +// { +// const faiss::Index* const index; +// const uint8_t* const input0; +// float weight0; +// const uint8_t* const input1; +// float weight1; +// +// std::vector buffer(d, 0); +// +// index->sa_decode(1, input0, buffer.data()); +// for (size_t iDim = 0; iDim < d; iDim++) +// output[iDim] += weight0 * buffer[iDim]; +// +// index->sa_decode(1, input1, buffer.data()); +// for (size_t iDim = 0; iDim < d; iDim++) +// output[iDim] += weight1 * buffer[iDim]; +// } +// If each code uses its own coarse quantizer centroids table and its own fine +// quantizer centroids table, then the following overload can be used: +// { +// static void accum( +// const float* const __restrict pqCoarseCentroids0, +// const float* const __restrict pqFineCentroids0, +// const uint8_t* const __restrict code0, +// const float weight0, +// const float* const __restrict pqCoarseCentroids1, +// const float* const __restrict pqFineCentroids1, +// const uint8_t* const __restrict code1, +// const float weight1, +// float* const __restrict outputAccum); +// } +// If codes share the coarse quantizer centroids table and also share +// the fine quantizer centroids table, then the following overload can be +// used: +// { +// static void accum( +// const float* const __restrict pqCoarseCentroids, +// const float* const __restrict pqFineCentroids, +// const uint8_t* const __restrict code0, +// const float weight0, +// const uint8_t* const __restrict code1, +// const float weight1, +// float* const __restrict outputAccum); +// } +// +// * And one more overload for ::accum() that decodes and accumulates +// three vectors per call. +// { +// const faiss::Index* const index; +// const uint8_t* const input0; +// float weight0; +// const uint8_t* const input1; +// float weight1; +// const uint8_t* const input2; +// float weight2; +// +// std::vector buffer(d, 0); +// +// index->sa_decode(1, input0, buffer.data()); +// for (size_t iDim = 0; iDim < d; iDim++) +// output[iDim] += weight0 * buffer[iDim]; +// +// index->sa_decode(1, input1, buffer.data()); +// for (size_t iDim = 0; iDim < d; iDim++) +// output[iDim] += weight1 * buffer[iDim]; +// +// index->sa_decode(1, input2, buffer.data()); +// for (size_t iDim = 0; iDim < d; iDim++) +// output[iDim] += weight2 * buffer[iDim]; +// } +// +// If each code uses its own coarse quantizer centroids table and its own fine +// quantizer centroids table, then the following overload can be used: +// { +// static void accum( +// const float* const __restrict pqCoarseCentroids0, +// const float* const __restrict pqFineCentroids0, +// const uint8_t* const __restrict code0, +// const float weight0, +// const float* const __restrict pqCoarseCentroids1, +// const float* const __restrict pqFineCentroids1, +// const uint8_t* const __restrict code1, +// const float weight1, +// const float* const __restrict pqCoarseCentroids2, +// const float* const __restrict pqFineCentroids2, +// const uint8_t* const __restrict code2, +// const float weight2, +// float* const __restrict outputAccum); +// } +// If codes share the coarse quantizer centroids table and also share +// the fine quantizer centroids table, then the following overload can be +// used: +// { +// static void accum( +// const float* const __restrict pqCoarseCentroids, +// const float* const __restrict pqFineCentroids, +// const uint8_t* const __restrict code0, +// const float weight0, +// const uint8_t* const __restrict code1, +// const float weight1, +// const 
uint8_t* const __restrict code2, +// const float weight2, +// float* const __restrict outputAccum); +// } +// +// The provided version is not multithreaded. +// +// Currently, an AVX2+FMA implementation is available. AVX512 version is also +// doable, but it was found to be slower than AVX2 for real world applications +// that I needed. +// +//////////////////////////////////////////////////////////////////////////////////// +// +// It is possible to use an additional index wrapper on top of IVFPQ / +// Residual+PQ, known as IndexRowwiseMinMax / IndexRowwiseMinMaxFP16. Index +// wrapper that performs rowwise normalization to [0,1], preserving the +// coefficients. This is a vector codec index only. +// For more details please refer to the description in +// faiss/IndexRowwiseMinMax.h file. +// +// If such a wrapper is used, then the quantizer will look like, say, +// MinMaxFP16,IVF256,PQ32np +// or +// MinMax,PQ16np +// In this case, please use the following contruction for the decoding, +// basically, wrapping a kernel in a kernel: +// { +// using SubT = faiss::cppcontrib::Index2LevelDecoder<128, 128, 2>; +// using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; +// // do T::store(...) or T::accum(...) +// } +// +// T::accum(...) contains an additional function variable which is +// used for accumulating scaling. Thus, the code pattern is the following: +// { +// const float* const __restrict pqCoarseCentroidsQ; +// const float* const __restrict pqFineCentroidsQ; +// const uint8_t* const __restrict input; +// const float* const __restrict weights; +// float* const __restrict output; +// float outputAccumMin = 0; +// +// for (size_t i = 0; i < n; i++) { +// T::accum( +// pqCoarseCentroidsQ, +// pqFineCentroidsQ, +// input + i * code_size, +// weights[i], +// output, +// outputAccumMin); +// } +// for (size_t j = 0; j < d; j++) +// output[j] += outputAccumMin; +// } +// This is similar to the following regular pseudo-code: +// { +// const faiss::Index* const index; +// const uint8_t* const __restrict input; +// const float* const __restrict weights; +// float* const __restrict output; +// +// for (size_t i = 0; i < n; i++) { +// std::vector buffer(d, 0); +// +// index->sa_decode(1, input + i * code_size, buffer.data()); +// for (size_t j = 0; j < d; j++) +// output[j] += weights[i] * buffer[j]; +// } + +#include +#include + +#ifdef __AVX2__ +#include +#include +#elif defined(__ARM_NEON) +#include +#include +#else +#include +#include +#endif diff --git a/thirdparty/faiss/faiss/cppcontrib/detail/CoarseBitType.h b/thirdparty/faiss/faiss/cppcontrib/detail/CoarseBitType.h new file mode 100644 index 000000000..7b438feda --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/detail/CoarseBitType.h @@ -0,0 +1,31 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace faiss {
+namespace cppcontrib {
+namespace detail {
+
+template <intptr_t COARSE_BITS>
+struct CoarseBitType {};
+
+template <>
+struct CoarseBitType<8> {
+    using bit_type = uint8_t;
+};
+
+template <>
+struct CoarseBitType<16> {
+    using bit_type = uint16_t;
+};
+
+} // namespace detail
+} // namespace cppcontrib
+} // namespace faiss
diff --git a/thirdparty/faiss/faiss/cppcontrib/detail/UintReader.h b/thirdparty/faiss/faiss/cppcontrib/detail/UintReader.h
new file mode 100644
index 000000000..81e600f41
--- /dev/null
+++ b/thirdparty/faiss/faiss/cppcontrib/detail/UintReader.h
@@ -0,0 +1,273 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace faiss {
+namespace cppcontrib {
+namespace detail {
+
+namespace {
+
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint8Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+
+    static intptr_t get(const uint8_t* const __restrict codes) {
+        // Read using 4-bytes, if possible.
+        // Reading using 8-byte takes too many registers for some reason.
+
+        constexpr intptr_t ELEMENT_TO_READ = CPOS / 4;
+        constexpr intptr_t SUB_ELEMENT = CPOS % 4;
+
+        switch (SUB_ELEMENT) {
+            case 0: {
+                if (N_ELEMENTS > CPOS + 3) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x000000FF);
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 1: {
+                if (N_ELEMENTS > CPOS + 2) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x0000FF00) >> 8;
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 2: {
+                if (N_ELEMENTS > CPOS + 1) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32 & 0x00FF0000) >> 16;
+                } else {
+                    return codes[CPOS];
+                }
+            }
+            case 3: {
+                if (N_ELEMENTS > CPOS) {
+                    const uint32_t code32 = *reinterpret_cast<const uint32_t*>(
+                            codes + ELEMENT_TO_READ * 4);
+                    return (code32) >> 24;
+                } else {
+                    return codes[CPOS];
+                }
+            }
+        }
+    }
+};
+
+// reduces the number of read operations from RAM
+///////////////////////////////////////////////
+// 76543210 76543210 76543210 76543210 76543210
+// 00000000       00
+//          111111        1111
+//                   2222        222222
+//                            33        33333333
+template <intptr_t N_ELEMENTS, intptr_t CPOS>
+struct Uint10Reader {
+    static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS");
+
+    static intptr_t get(const uint8_t* const __restrict codes) {
+        // Read using 4-bytes or 2-bytes.
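// [Editorial sketch, not part of the patch; assumes a little-endian layout.]
// Conceptually this reader returns the CPOS-th 10-bit code: four codes are
// packed into each 5-byte group, so the code starts at bit (CPOS % 4) * 10
// of group CPOS / 4. A scalar equivalent would be:
// {
//     const uint8_t* group = codes + (CPOS / 4) * 5;
//     uint64_t v = 0;
//     memcpy(&v, group, 5);
//     return (v >> ((CPOS % 4) * 10)) & 0x3FF; // keep 10 bits
// }
// The switch below merely picks 32-bit or 16-bit loads (and offsets) so that
// no read runs past the end of the codes buffer.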
+ + constexpr intptr_t ELEMENT_TO_READ = CPOS / 4; + constexpr intptr_t SUB_ELEMENT = CPOS % 4; + + switch (SUB_ELEMENT) { + case 0: { + if (N_ELEMENTS > CPOS + 2) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5); + return (code32 & 0b0000001111111111); + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5 + 0); + return (code16 & 0b0000001111111111); + } + } + case 1: { + if (N_ELEMENTS > CPOS + 1) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5); + return (code32 & 0b000011111111110000000000) >> 10; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5 + 1); + return (code16 & 0b0000111111111100) >> 2; + } + } + case 2: { + if (N_ELEMENTS > CPOS) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5); + return (code32 & 0b00111111111100000000000000000000) >> 20; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5 + 2); + return (code16 & 0b0011111111110000) >> 4; + } + } + case 3: { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 5 + 3); + return (code16 & 0b1111111111000000) >> 6; + } + } + } +}; + +// reduces the number of read operations from RAM +/////////////////////////////////////////////// +// 76543210 76543210 76543210 76543210 76543210 76543210 +// 00000000 0000 +// 1111 11111111 +// 22222222 2222 +// 3333 33333333 +template +struct Uint12Reader { + static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS"); + + static intptr_t get(const uint8_t* const __restrict codes) { + // Read using 4-bytes or 2-bytes. + + constexpr intptr_t ELEMENT_TO_READ = CPOS / 4; + constexpr intptr_t SUB_ELEMENT = CPOS % 4; + + switch (SUB_ELEMENT) { + case 0: { + if (N_ELEMENTS > CPOS + 2) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6); + return (code32 & 0b0000111111111111); + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 0); + return (code16 & 0b0000111111111111); + } + } + case 1: { + if (N_ELEMENTS > CPOS + 1) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6); + return (code32 & 0b111111111111000000000000) >> 12; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 1); + return (code16 & 0b1111111111110000) >> 4; + } + } + case 2: { + if (N_ELEMENTS > CPOS + 1) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 2); + return (code32 & 0b000011111111111100000000) >> 8; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 3); + return (code16 & 0b0000111111111111); + } + } + case 3: { + if (N_ELEMENTS > CPOS) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 2); + return (code32 & 0b11111111111100000000000000000000) >> 20; + } else { + const uint16_t code16 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 6 + 4); + return (code16 & 0b1111111111110000) >> 4; + } + } + } + } +}; + +// reduces the number of read operations from RAM +template +struct Uint16Reader { + static_assert(CPOS < N_ELEMENTS, "CPOS should be less than N_ELEMENTS"); + + static intptr_t get(const uint8_t* const __restrict codes) { + // Read using 4-bytes or 2-bytes. + // Reading using 8-byte takes too many registers somewhy. 
+ + constexpr intptr_t ELEMENT_TO_READ = CPOS / 2; + constexpr intptr_t SUB_ELEMENT = CPOS % 2; + + switch (SUB_ELEMENT) { + case 0: { + if (N_ELEMENTS > CPOS + 1) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 4); + return (code32 & 0x0000FFFF); + } else { + const uint16_t* const __restrict codesFp16 = + reinterpret_cast(codes); + return codesFp16[CPOS]; + } + } + case 1: { + if (N_ELEMENTS > CPOS) { + const uint32_t code32 = *reinterpret_cast( + codes + ELEMENT_TO_READ * 4); + return code32 >> 16; + } else { + const uint16_t* const __restrict codesFp16 = + reinterpret_cast(codes); + return codesFp16[CPOS]; + } + } + } + } +}; + +// +template +struct UintReaderImplType {}; + +template +struct UintReaderImplType { + using reader_type = Uint8Reader; +}; + +template +struct UintReaderImplType { + using reader_type = Uint10Reader; +}; + +template +struct UintReaderImplType { + using reader_type = Uint12Reader; +}; + +template +struct UintReaderImplType { + using reader_type = Uint16Reader; +}; + +} // namespace + +// reduces the number of read operations from RAM +template +using UintReader = + typename UintReaderImplType:: + reader_type; + +template +using UintReaderRaw = + typename UintReaderImplType::reader_type; + +} // namespace detail +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h new file mode 100644 index 000000000..75ca7b8e4 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-avx2-inl.h @@ -0,0 +1,2072 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef LEVEL2_AVX2_INL_H +#define LEVEL2_AVX2_INL_H + +#include + +#include +#include + +#include + +namespace faiss { +namespace cppcontrib { + +//////////////////////////////////////////////////////////////////////////////////// +/// Index2LevelDecoder +//////////////////////////////////////////////////////////////////////////////////// + +namespace { + +// Processes 8 float values. +// Returns { +// [0..1] = *coarse[0..1] + *fine0[0..1]; +// [2..3] = *coarse[2..3] + *fine1[0..1]; +// [4..5] = *coarse[4..5] + *fine2[0..1]; +// [6..7] = *coarse[6..7] + *fine3[0..1]; +// } +inline __m256 elementaryBlock2x4b( + const float* const __restrict coarse, + const float* const __restrict fine0, + const float* const __restrict fine1, + const float* const __restrict fine2, + const float* const __restrict fine3) { + // load fine + const __m256 fineValue = _mm256_castpd_ps(_mm256_setr_pd( + *reinterpret_cast(fine0), + *reinterpret_cast(fine1), + *reinterpret_cast(fine2), + *reinterpret_cast(fine3))); + // load coarse + const __m256 coarseValue = _mm256_loadu_ps(coarse); + + // add coarse and fine + return _mm256_add_ps(fineValue, coarseValue); +} + +// Processes 8 float values. 
+// Returns { +// [0..1] = existingValue[0..1] + weight * (*coarse[0..1] + *fine0[0..1]); +// [2..3] = existingValue[0..1] + weight * (*coarse[2..3] + *fine1[0..1]); +// [4..5] = existingValue[0..1] + weight * (*coarse[4..5] + *fine2[0..1]); +// [6..7] = existingValue[0..1] + weight * (*coarse[6..7] + *fine3[0..1]); +// } +inline __m256 elementaryBlock2x4bAccum( + const float* const __restrict coarse, + const float* const __restrict fine0, + const float* const __restrict fine1, + const float* const __restrict fine2, + const float* const __restrict fine3, + const float weight, + const __m256 existingValue) { + // add coarse and fine + const __m256 combinedValue = + elementaryBlock2x4b(coarse, fine0, fine1, fine2, fine3); + + // this operation is expected to be optimized by a compiler + const __m256 weightAvx2 = _mm256_set1_ps(weight); + // do fma + return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue); +} + +// Processes 4 float values. +// Returns { +// [0..3] = *coarse[0..3] + *fine[0..3]; +// } +inline __m128 elementaryBlock4x1b( + const float* const __restrict coarse, + const float* const __restrict fine) { + // load fine + const __m128 fineValue = _mm_loadu_ps(fine); + // load coarse + const __m128 coarseValue = _mm_loadu_ps(coarse); + + // add coarse and fine + return _mm_add_ps(fineValue, coarseValue); +} + +// Processes 4 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine[0..3]); +// } +inline __m128 elementaryBlock4x1bAccum( + const float* const __restrict coarse, + const float* const __restrict fine, + const float weight, + const __m128 existingValue) { + // add coarse and fine + const __m128 combinedValue = elementaryBlock4x1b(coarse, fine); + + // this operation is expected to be optimized by a compiler + const __m128 weightAvx = _mm_set1_ps(weight); + // do fma + return _mm_fmadd_ps(combinedValue, weightAvx, existingValue); +} + +// Processes 8 float values. +// Returns { +// [0..3] = *coarse[0..3] + *fine0[0..3]; +// [4..7] = *coarse[4..7] + *fine1[0..3]; +// } +inline __m256 elementaryBlock4x2b( + const float* const __restrict coarse, + const float* const __restrict fine0, + const float* const __restrict fine1) { + // load fine + const __m128 fineValue0 = _mm_loadu_ps(fine0); + const __m128 fineValue1 = _mm_loadu_ps(fine1); + // load coarse + const __m256 coarseValue = _mm256_loadu_ps(coarse); + + // combine two 4b into a single 8b + const __m256 combinedFineValue = _mm256_set_m128(fineValue1, fineValue0); + // add coarse and fine + return _mm256_add_ps(combinedFineValue, coarseValue); +} + +// Processes 8 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine0[0..3]); +// [4..7] = existingValue[4..7] + weight * (*coarse[4..7] + *fine1[0..3]); +// } +inline __m256 elementaryBlock4x2bAccum( + const float* const __restrict coarse, + const float* const __restrict fine0, + const float* const __restrict fine1, + const float weight, + const __m256 existingValue) { + // add coarse and fine + const __m256 combinedValue = elementaryBlock4x2b(coarse, fine0, fine1); + + // this operation is expected to be optimized by a compiler + const __m256 weightAvx2 = _mm256_set1_ps(weight); + // do fma + return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue); +} + +// Processes 8 float values. 
+// Returns { +// [0..7] = *coarse[0..7] + *fine[0..7]; +// } +inline __m256 elementaryBlock8x1b( + const float* const __restrict coarse, + const float* const __restrict fine) { + // load fine + const __m256 fineValue = _mm256_loadu_ps(fine); + // load coarse + const __m256 coarseValue = _mm256_loadu_ps(coarse); + + // add coarse and fine + return _mm256_add_ps(fineValue, coarseValue); +} + +// Processes 8 float values. +// Returns { +// [0..7] = existingValue[0..7] + weight * (*coarse[0..7] + *fine[0..7]); +// } +inline __m256 elementaryBlock8x1bAccum( + const float* const __restrict coarse, + const float* const __restrict fine, + const float weight, + const __m256 existingValue) { + // add coarse and fine + const __m256 combinedValue = elementaryBlock8x1b(coarse, fine); + + // this operation is expected to be optimized by a compiler + const __m256 weightAvx2 = _mm256_set1_ps(weight); + // do fma + return _mm256_fmadd_ps(combinedValue, weightAvx2, existingValue); +} + +// The following code uses template-based for-loop unrolling, +// because the compiler does not do that on its own as needed. +// The idea is the following: +// template +// struct Foo { +// static void bar() { +// doSomething(I); +// Foo::bar(); +// } +// }; +// +// template +// struct Foo { +// static void bar() {} +// }; +// +// Initiate the loop: +// Foo<0, MAX>::bar(); + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS, + bool FINE_SIZE_EQ_2 = FINE_SIZE == 2, + bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, + bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), + bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), + bool DIM_EQ_CPOS = DIM == CPOS> +struct Index2LevelDecoderImpl; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + 2, + COARSE_BITS, + FINE_BITS, + CPOS, + true, + false, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + false> { + static constexpr intptr_t FINE_SIZE = 2; + + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 2 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const 
intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + + const __m256 storeValue = elementaryBlock2x4b( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset); + + _mm256_storeu_ps(outputStore + CPOS, storeValue); + + // next + Index2LevelDecoderImpl::store( + pqCoarseCentroids0, pqFineCentroids0, code0, + outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 2 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, weight0, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 2 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + const intptr_t fineCode2c = detail::UintReader::get(fine2); + const intptr_t fineCode2d = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 0) * 
FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + pqCoarseCentroids2, pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + const intptr_t fineCode2c = detail::UintReader::get(fine2); + const intptr_t fineCode2d = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * 
FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + 4, + COARSE_BITS, + FINE_BITS, + CPOS, + false, + true, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + false> { + static constexpr intptr_t FINE_SIZE = 4; + + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + + const __m256 storeValue = elementaryBlock4x2b( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * 
FINE_SIZE + fineCentroidOffset); + + _mm256_storeu_ps(outputStore + CPOS, storeValue); + + // next + Index2LevelDecoderImpl::store( + pqCoarseCentroids0, pqFineCentroids0, code0, + outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 0) * 
FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + pqCoarseCentroids2, pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS, + false, + false, + true, + true, + false> { + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t 
QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + const __m256 storeValue = elementaryBlock8x1b( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset); + + _mm256_storeu_ps(outputStore + CPOS, storeValue); + + // next + Index2LevelDecoderImpl::store( + pqCoarseCentroids0, pqFineCentroids0, code0, + outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + pqCoarseCentroids2, pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS, + false, + false, + false, + true, + false> { + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // 
process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + const __m128 storeValue = elementaryBlock4x1b( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset); + + _mm_storeu_ps(outputStore + CPOS, storeValue); + + // next + Index2LevelDecoderImpl::store( + pqCoarseCentroids0, pqFineCentroids0, code0, + outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids1 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids2 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids0, pqFineCentroids0, code0, weight0, + pqCoarseCentroids1, pqFineCentroids1, code1, weight1, + pqCoarseCentroids2, pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // clang-format off + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail::UintReader::get(coarse0); + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t coarseCode1 = detail::UintReader::get(coarse1); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t coarseCode2 = detail::UintReader::get(coarse2); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * COARSE_SIZE + coarseCentroidOffset, + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl::accum( + pqCoarseCentroids, pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +// This partial specialization is expected to do nothing. +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + bool FINE_SIZE_EQ_2, + bool FINE_SIZE_EQ_4, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + DIM, + FINE_SIZE_EQ_2, + FINE_SIZE_EQ_4, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + true> { + // clang-format off + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) {} + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids table. + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} + + // clang-format on +}; +} // namespace + +// Suitable for IVF256,PQ[1]x8 +// Subtable for IVF256,PQ[1]x10 (such as IVF256,PQ16x10np) +// Subtable for IVF256,PQ[1]x12 (such as IVF256,PQ16x12np) +// Suitable for IVF256,PQ[1]x16 (such as IVF256,PQ16x16np) +// Suitable for Residual[1]x8,PQ[2]x8 +// Suitable for IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x10 (such as IVF1024,PQ16x10np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x12 (such as IVF1024,PQ16x12np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x16 (such as IVF1024,PQ16x16np) +// Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8) +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS = 8, + intptr_t FINE_BITS = 8> +struct Index2LevelDecoder { + static_assert( + COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 12 || + COARSE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for COARSE_BITS"); + static_assert( + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); + + static constexpr intptr_t dim = DIM; + static constexpr intptr_t coarseSize = COARSE_SIZE; + static constexpr intptr_t fineSize = FINE_SIZE; + static constexpr intptr_t coarseBits = COARSE_BITS; + static constexpr intptr_t fineBits = FINE_BITS; + + // Process 1 sample. 
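+    // Editorial usage sketch (not part of this header; the names `coarseTab`,
+    // `fineTab`, `code` and `out` below are assumptions, not API). For a
+    // 128-dim IVF256,PQ16np index this decoder would typically be instantiated
+    // with COARSE_SIZE = 128 and FINE_SIZE = 128 / 16 = 8:
+    //
+    //   using Decoder = faiss::cppcontrib::Index2LevelDecoder<128, 128, 8>;
+    //   Decoder::store(coarseTab, fineTab, code, out);        // out  = decoded(code)
+    //   Decoder::accum(coarseTab, fineTab, code, 0.5f, out);  // out += 0.5 * decoded(code)
+    //
+    // where `coarseTab` and `fineTab` point to the flattened coarse and fine
+    // centroid tables and `code` is one SA-encoded vector.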
+ static void store( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + store(pqCoarseCentroids, pqFineCentroids, code, outputStore); + } + + // Process 1 sample. + // Performs outputAccum += weight * decoded(code) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code, + weight, + outputAccum); + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1). + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + outputAccum); + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + pqCoarseCentroids2, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. 
+ // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +} // namespace cppcontrib +} // namespace faiss +#endif // LEVEL2_AVX2_INL_H diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h new file mode 100644 index 000000000..36355af00 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-inl.h @@ -0,0 +1,414 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef LEVEL2_INL_H +#define LEVEL2_INL_H + +#include +#include + +#include + +namespace faiss { +namespace cppcontrib { + +//////////////////////////////////////////////////////////////////////////////////// +/// Index2LevelDecoder +//////////////////////////////////////////////////////////////////////////////////// + +// Suitable for IVF256,PQ[1]x8 +// Suitable for Residual[1]x8,PQ[2]x8 +// Suitable for IVF[9-16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// Suitable for Residual1x[9-16 bit],PQ[1]x8 (such as Residual1x9,PQ8) +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS = 8, + intptr_t FINE_BITS = 8> +struct Index2LevelDecoder { + static_assert( + COARSE_BITS == 8 || COARSE_BITS == 16, + "Only 8 or 16 bits are currently supported for COARSE_BITS"); + static_assert( + FINE_BITS == 8, + "Only 8 bits is currently supported for FINE_BITS"); + + static constexpr intptr_t dim = DIM; + static constexpr intptr_t coarseSize = COARSE_SIZE; + static constexpr intptr_t fineSize = FINE_SIZE; + static constexpr intptr_t coarseBits = COARSE_BITS; + static constexpr intptr_t fineBits = FINE_BITS; + + // coarse quantizer storage + using coarse_storage_type = + typename detail::CoarseBitType::bit_type; + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // Process 1 sample. 
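+    // Editorial note on the code layout, derived from the index arithmetic
+    // below (the concrete numbers are only an illustration): a code stores
+    // DIM / COARSE_SIZE coarse indices (8- or 16-bit each) followed by
+    // DIM / FINE_SIZE fine indices (8-bit each). With DIM = 8, COARSE_SIZE = 8,
+    // FINE_SIZE = 4 and 8-bit tables, a code is 1 coarse byte plus 2 fine
+    // bytes, and element i of the decoded vector is
+    //   pqCoarseCentroids[(i/8 * 256 + coarse[i/8]) * 8 + i%8]
+    //     + pqFineCentroids[(i/4 * 256 + fine[i/4]) * 4 + i%4].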
+ // Performs outputStore = decoded(code) + static void store( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + // coarse quantizer + const coarse_storage_type* const __restrict coarse = + reinterpret_cast(code); + + // fine quantizer + const uint8_t* const __restrict fine = + code + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t coarseCentroidIdx = i / COARSE_SIZE; + const intptr_t coarseCentroidOffset = i % COARSE_SIZE; + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t coarseCode = coarse[coarseCentroidIdx]; + const intptr_t fineCode = fine[fineCentroidIdx]; + + const float* const __restrict coarsePtr = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode) * + FINE_SIZE + + fineCentroidOffset; + + outputStore[i] = *coarsePtr + *finePtr; + } + } + + // Process 1 sample. + // Performs outputAccum += weight * decoded(code) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum) { + // coarse quantizer + const coarse_storage_type* const __restrict coarse = + reinterpret_cast(code); + + // fine quantizer + const uint8_t* const __restrict fine = + code + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t coarseCentroidIdx = i / COARSE_SIZE; + const intptr_t coarseCentroidOffset = i % COARSE_SIZE; + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t coarseCode = coarse[coarseCentroidIdx]; + const intptr_t fineCode = fine[fineCentroidIdx]; + + const float* const __restrict coarsePtr = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight * (*coarsePtr + *finePtr); + } + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1). 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const coarse_storage_type* const __restrict coarse0 = + reinterpret_cast(code0); + const coarse_storage_type* const __restrict coarse1 = + reinterpret_cast(code1); + + // fine quantizer + const uint8_t* const __restrict fine0 = + code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + const uint8_t* const __restrict fine1 = + code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t coarseCentroidIdx = i / COARSE_SIZE; + const intptr_t coarseCentroidOffset = i % COARSE_SIZE; + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + + const float* const __restrict coarsePtr0 = pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr0 = pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict coarsePtr1 = pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) + + weight1 * (*coarsePtr1 + *finePtr1); + } + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
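+    // (Editorial note) This shared-table overload covers the common case where
+    // both codes come from the same index, e.g. (names are illustrative):
+    //
+    //   Decoder::accum(coarseTab, fineTab, codeA, 0.7f, codeB, 0.3f, out);
+    //
+    // which performs out += 0.7 * decoded(codeA) + 0.3 * decoded(codeB) using a
+    // single pair of centroid-table pointers.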
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const coarse_storage_type* const __restrict coarse0 = + reinterpret_cast(code0); + const coarse_storage_type* const __restrict coarse1 = + reinterpret_cast(code1); + + // fine quantizer + const uint8_t* const __restrict fine0 = + code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + const uint8_t* const __restrict fine1 = + code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t coarseCentroidIdx = i / COARSE_SIZE; + const intptr_t coarseCentroidOffset = i % COARSE_SIZE; + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + + const float* const __restrict coarsePtr0 = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr0 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict coarsePtr1 = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) + + weight1 * (*coarsePtr1 + *finePtr1); + } + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const coarse_storage_type* const __restrict coarse0 = + reinterpret_cast(code0); + const coarse_storage_type* const __restrict coarse1 = + reinterpret_cast(code1); + const coarse_storage_type* const __restrict coarse2 = + reinterpret_cast(code2); + + // fine quantizer + const uint8_t* const __restrict fine0 = + code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + const uint8_t* const __restrict fine1 = + code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + const uint8_t* const __restrict fine2 = + code2 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t coarseCentroidIdx = i / COARSE_SIZE; + const intptr_t coarseCentroidOffset = i % COARSE_SIZE; + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + const intptr_t coarseCode2 = coarse2[coarseCentroidIdx]; + const intptr_t fineCode2 = fine2[fineCentroidIdx]; + + const float* const __restrict coarsePtr0 = pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr0 = pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict coarsePtr1 = pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict coarsePtr2 = pqCoarseCentroids2 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr2 = pqFineCentroids2 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) + + weight1 * (*coarsePtr1 + *finePtr1) + + weight2 * (*coarsePtr2 + *finePtr2); + } + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const coarse_storage_type* const __restrict coarse0 = + reinterpret_cast(code0); + const coarse_storage_type* const __restrict coarse1 = + reinterpret_cast(code1); + const coarse_storage_type* const __restrict coarse2 = + reinterpret_cast(code2); + + // fine quantizer + const uint8_t* const __restrict fine0 = + code0 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + const uint8_t* const __restrict fine1 = + code1 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + const uint8_t* const __restrict fine2 = + code2 + (DIM / COARSE_SIZE) * sizeof(coarse_storage_type); + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t coarseCentroidIdx = i / COARSE_SIZE; + const intptr_t coarseCentroidOffset = i % COARSE_SIZE; + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + const intptr_t coarseCode2 = coarse2[coarseCentroidIdx]; + const intptr_t fineCode2 = fine2[fineCentroidIdx]; + + const float* const __restrict coarsePtr0 = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr0 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict coarsePtr1 = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict coarsePtr2 = pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset; + const float* const __restrict finePtr2 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*coarsePtr0 + *finePtr0) + + weight1 * (*coarsePtr1 + *finePtr1) + + weight2 * (*coarsePtr2 + *finePtr2); + } + } +}; + +} // namespace cppcontrib +} // namespace faiss +#endif // LEVEL2_INL_H diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h new file mode 100644 index 000000000..20e815a01 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/Level2-neon-inl.h @@ -0,0 +1,2161 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef LEVEL2_NEON_INL_H +#define LEVEL2_NEON_INL_H + +#include + +#include +#include + +#include + +namespace faiss { +namespace cppcontrib { + +namespace { + +// Processes 4 float values. 
+// Returns { +// [0..3] = *coarse[0..3] + *fine[0..3]; +// } +inline float32x4_t elementaryBlock4x1b( + const float* const __restrict coarse, + const float* const __restrict fine) { + // load fine + const auto fineValue = vld1q_f32(fine); + // load coarse + const auto coarseValue = vld1q_f32(coarse); + + // add coarse and fine + return vaddq_f32(fineValue, coarseValue); +} + +// Processes 4 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine[0..3]); +// } +inline float32x4_t elementaryBlock4x1bAccum( + const float* const __restrict coarse, + const float* const __restrict fine, + const float weight, + const float32x4_t existingValue) { + // add coarse and fine + const auto combinedValue = elementaryBlock4x1b(coarse, fine); + + // this operation is expected to be optimized by a compiler + const auto weightNeon = vdupq_n_f32(weight); + // do fma + return vfmaq_f32(existingValue, weightNeon, combinedValue); +} + +// Processes 8 float values. +// Returns { +// [0..3] = *coarse[0..3] + *fine0[0..3]; +// [4..7] = *coarse[4..7] + *fine1[0..3]; +// } +inline float32x4x2_t elementaryBlock4x2b( + const float* const __restrict coarse, + const float* const __restrict fine0, + const float* const __restrict fine1) { + // load fine + const auto fineValue0 = vld1q_f32(fine0); + const auto fineValue1 = vld1q_f32(fine1); + // load coarse + const auto coarseValue0 = vld1q_f32(coarse); + const auto coarseValue1 = vld1q_f32(coarse + 4); + + // add coarse and fine + const auto result0 = vaddq_f32(fineValue0, coarseValue0); + const auto result1 = vaddq_f32(fineValue1, coarseValue1); + + return {result0, result1}; +} + +// Processes 8 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*coarse[0..3] + *fine0[0..3]); +// [4..7] = existingValue[4..7] + weight * (*coarse[4..7] + *fine1[0..3]); +// } +inline float32x4x2_t elementaryBlock4x2bAccum( + const float* const __restrict coarse, + const float* const __restrict fine0, + const float* const __restrict fine1, + const float weight, + const float32x4x2_t existingValue) { + // add coarse and fine + const auto combinedValue = elementaryBlock4x2b(coarse, fine0, fine1); + + // this operation is expected to be optimized by a compiler + const auto weightNeon = vdupq_n_f32(weight); + // do fma + const auto result0 = + vfmaq_f32(existingValue.val[0], weightNeon, combinedValue.val[0]); + const auto result1 = + vfmaq_f32(existingValue.val[1], weightNeon, combinedValue.val[1]); + return {result0, result1}; +} + +// Processes 8 float values. +// Returns { +// [0..7] = *coarse[0..7] + *fine[0..7]; +// } +inline float32x4x2_t elementaryBlock8x1b( + const float* const __restrict coarse, + const float* const __restrict fine) { + // load fine + const auto fineValue0 = vld1q_f32(fine); + const auto fineValue1 = vld1q_f32(fine + 4); + // load coarse + const auto coarseValue0 = vld1q_f32(coarse); + const auto coarseValue1 = vld1q_f32(coarse + 4); + + // add coarse and fine + return {vaddq_f32(fineValue0, coarseValue0), + vaddq_f32(fineValue1, coarseValue1)}; +} + +// Processes 8 float values. 
+// Returns { +// [0..7] = existingValue[0..7] + weight * (*coarse[0..7] + *fine[0..7]); +// } +inline float32x4x2_t elementaryBlock8x1bAccum( + const float* const __restrict coarse, + const float* const __restrict fine, + const float weight, + const float32x4x2_t existingValue) { + // add coarse and fine + const auto combinedValue = elementaryBlock8x1b(coarse, fine); + + // this operation is expected to be optimized by a compiler + const auto weightNeon = vdupq_n_f32(weight); + // do fma + const auto result0 = + vfmaq_f32(existingValue.val[0], weightNeon, combinedValue.val[0]); + const auto result1 = + vfmaq_f32(existingValue.val[1], weightNeon, combinedValue.val[1]); + return {result0, result1}; +} + +// The following code uses template-based for-loop unrolling, +// because the compiler does not do that on its own as needed. +// The idea is the following: +// template +// struct Foo { +// static void bar() { +// doSomething(I); +// Foo::bar(); +// } +// }; +// +// template +// struct Foo { +// static void bar() {} +// }; +// +// Initiate the loop: +// Foo<0, MAX>::bar(); + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS, + bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, + bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), + bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), + bool DIM_EQ_CPOS = DIM == CPOS> +struct Index2LevelDecoderImpl; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + 4, + COARSE_BITS, + FINE_BITS, + CPOS, + true, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + false> { + static constexpr intptr_t FINE_SIZE = 4; + + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + + const auto storeValue = elementaryBlock4x2b( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx 
+ 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset); + + vst1q_f32(outputStore + CPOS, storeValue.val[0]); + vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore); + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + outputAccum); + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
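+    // As a worked example of the "4 float, but 8 floats per loop" scheme
+    // used throughout this specialization (illustration only, sizes made up):
+    // with DIM = 16 and FINE_SIZE = 4 there are four fine sub-quantizers;
+    // the recursion visits CPOS = 0 and CPOS = 8, and the two steps decode
+    // fine sub-quantizers {0, 1} and {2, 3} respectively, each writing eight
+    // contiguous output floats.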
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids1 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids1 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + outputAccum); + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + const intptr_t coarseCode2 = detail:: + UintReader:: + get(coarse2); + const intptr_t fineCode2a = detail:: + UintReader::get( + fine2); + const intptr_t fineCode2b = detail:: + UintReader::get( + fine2); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids1 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids1 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids2 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids2 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode2a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids2 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode2b) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + pqCoarseCentroids2, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + 
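+    // For reference, a scalar sketch of what one elementaryBlock4x2bAccum()
+    // call above contributes for a single code (illustration only; coarse,
+    // fine0 and fine1 denote the three centroid rows selected for that code):
+    //
+    //   for (int j = 0; j < 4; j++) {
+    //       outputAccum[CPOS + j]     += weight * (coarse[j]     + fine0[j]);
+    //       outputAccum[CPOS + 4 + j] += weight * (coarse[4 + j] + fine1[j]);
+    //   }
+    //
+    // after which the recursion continues at CPOS + 8.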
// Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + const intptr_t coarseCode2 = detail:: + UintReader:: + get(coarse2); + const intptr_t fineCode2a = detail:: + UintReader::get( + fine2); + const intptr_t fineCode2b = detail:: + UintReader::get( + fine2); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode2a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode2b) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS> +struct Index2LevelDecoderImpl< + DIM, + 
COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS, + false, + true, + true, + false> { + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + const auto storeValue = elementaryBlock8x1b( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset); + + vst1q_f32(outputStore + CPOS, storeValue.val[0]); + vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore); + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + const auto existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + outputAccum); + } + + // Process 2 samples. 
+ // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
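+    // A possible use of a shared-table overload such as the one below
+    // (a sketch only; n, codeSize, codes and w are assumptions of this
+    // example): accumulating a weighted sum over an even number n of codes
+    // that all come from the same index into outputAccum, two codes per call:
+    //
+    //   for (size_t k = 0; k < n; k += 2)
+    //       accum(coarseTab, fineTab,
+    //             codes + k * codeSize, w[k],
+    //             codes + (k + 1) * codeSize, w[k + 1],
+    //             outputAccum);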
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + outputAccum); + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t coarseCode2 = detail:: + UintReader:: + get(coarse2); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids2 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids2 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + pqCoarseCentroids2, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 8 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t coarseCode2 = detail:: + UintReader:: + get(coarse2); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 8>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + intptr_t CPOS> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS, + false, + false, + true, + false> { + static constexpr intptr_t coarseCentroidIdx = CPOS / COARSE_SIZE; + static constexpr intptr_t coarseCentroidOffset = CPOS % COARSE_SIZE; + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + // coarse quantizer storage + static constexpr intptr_t COARSE_TABLE_BYTES = (1 << COARSE_BITS); + + // coarse quantizer bytes start from 0 + // fine quantizer bytes start from N_COARSE_ELEMENTS_BYTES + static constexpr intptr_t N_COARSE_ELEMENTS = DIM / COARSE_SIZE; + static 
constexpr intptr_t N_COARSE_ELEMENTS_BITS = + N_COARSE_ELEMENTS * COARSE_BITS; + static constexpr intptr_t N_COARSE_ELEMENTS_BYTES = + (N_COARSE_ELEMENTS_BITS + 7) / 8; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + const auto storeValue = elementaryBlock4x1b( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset); + + vst1q_f32(outputStore + CPOS, storeValue); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 4>:: + store(pqCoarseCentroids0, pqFineCentroids0, code0, outputStore); + } + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 4>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + outputAccum); + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 4>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 4>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + outputAccum); + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t coarseCode2 = detail:: + UintReader:: + get(coarse2); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids0 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids1 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids2 + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids2 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 4>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + pqCoarseCentroids2, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. 
+ static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // coarse quantizer + const uint8_t* const __restrict coarse0 = code0; + const uint8_t* const __restrict coarse1 = code1; + const uint8_t* const __restrict coarse2 = code2; + + // fine quantizer + const uint8_t* const __restrict fine0 = code0 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine1 = code1 + N_COARSE_ELEMENTS_BYTES; + const uint8_t* const __restrict fine2 = code2 + N_COARSE_ELEMENTS_BYTES; + + // process chunks, 4 float + + const intptr_t coarseCode0 = detail:: + UintReader:: + get(coarse0); + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t coarseCode1 = detail:: + UintReader:: + get(coarse1); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t coarseCode2 = detail:: + UintReader:: + get(coarse2); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode1) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqCoarseCentroids + + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode2) * + COARSE_SIZE + + coarseCentroidOffset, + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + CPOS + 4>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +// This partial specialization is expected to do nothing. +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS, + intptr_t FINE_BITS, + bool FINE_SIZE_EQ_4, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + DIM, + FINE_SIZE_EQ_4, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + true> { + // process 1 sample + static void store( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) {} + + // process 1 sample + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
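+    // These empty bodies terminate the compile-time recursion once CPOS
+    // reaches DIM, in the spirit of the unrolling sketch near the top of
+    // this file:
+    //
+    //   template <int MAX>
+    //   struct Foo<MAX, MAX> {
+    //       static void bar() {}
+    //   };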
+ static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} +}; +} // namespace + +// Suitable for IVF256,PQ[1]x8 +// Subtable for IVF256,PQ[1]x10 (such as IVF256,PQ16x10np) +// Subtable for IVF256,PQ[1]x12 (such as IVF256,PQ16x12np) +// Suitable for IVF256,PQ[1]x16 (such as IVF256,PQ16x16np) +// Suitable for Residual[1]x8,PQ[2]x8 +// Suitable for IVF[2^9-2^16 bit],PQ[1]x8 (such as IVF1024,PQ16np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x10 (such as IVF1024,PQ16x10np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x12 (such as IVF1024,PQ16x12np) +// Suitable for IVF[2^9-2^16 bit],PQ[1]x16 (such as IVF1024,PQ16x16np) +// Suitable for Residual[1]x[9-16 bit],PQ[2]x[3] (such as Residual2x9,PQ8) +template < + intptr_t DIM, + intptr_t COARSE_SIZE, + intptr_t FINE_SIZE, + intptr_t COARSE_BITS = 8, + intptr_t FINE_BITS = 8> +struct Index2LevelDecoder { + static_assert( + COARSE_BITS == 8 || COARSE_BITS == 10 || COARSE_BITS == 12 || + COARSE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for COARSE_BITS"); + static_assert( + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); + + static constexpr intptr_t dim = DIM; + static constexpr intptr_t coarseSize = COARSE_SIZE; + static constexpr intptr_t fineSize = FINE_SIZE; + static constexpr intptr_t coarseBits = COARSE_BITS; + static constexpr intptr_t fineBits = FINE_BITS; + + // Process 1 sample. 
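The partial specialization above is instantiated once CPOS reaches DIM; its empty bodies terminate the compile-time recursion, so the whole decode loop is unrolled at compile time. A minimal stand-alone sketch of this unrolling pattern (the same idea the PQ decoder below spells out in a comment; names are hypothetical):

#include <cstdio>

// A recursive template processes one position and advances POS; an empty
// partial specialization for POS == MAX stops the recursion.
template <int POS, int MAX, bool DONE = (POS == MAX)>
struct UnrolledLoop {
    static void run() {
        std::printf("processing position %d\n", POS);
        UnrolledLoop<POS + 1, MAX>::run();
    }
};

template <int POS, int MAX>
struct UnrolledLoop<POS, MAX, true> {
    static void run() {} // terminal specialization: do nothing
};

int main() {
    UnrolledLoop<0, 4>::run(); // unrolls into four calls at compile time
}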
+ static void store( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + store(pqCoarseCentroids, pqFineCentroids, code, outputStore); + } + + // Process 1 sample. + // Performs outputAccum += weight * decoded(code) + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code, + weight, + outputAccum); + } + + // Process 2 samples. + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1). + // + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + // + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + outputAccum); + } + + // Process 3 samples. + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + // + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids0, + pqFineCentroids0, + code0, + weight0, + pqCoarseCentroids1, + pqFineCentroids1, + code1, + weight1, + pqCoarseCentroids2, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. 
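The public store()/accum() entry points above simply start the unrolled recursion at CPOS = 0. As a plain-C++ reference of what store() computes in the default 8-bit case (coarse code bytes first, then fine code bytes, with COARSE_TABLE_BYTES = FINE_TABLE_BYTES = 256), under the table layouts implied by the indexing arithmetic in the patch; this sketch is for illustration only:

#include <cstdint>

// Scalar reference of Index2LevelDecoder<DIM, COARSE_SIZE, FINE_SIZE>::store
// for 8-bit coarse and fine codes (one code byte per group).
template <int DIM, int COARSE_SIZE, int FINE_SIZE>
void store_reference(
        const float* pqCoarseCentroids, // [DIM/COARSE_SIZE][256][COARSE_SIZE]
        const float* pqFineCentroids,   // [DIM/FINE_SIZE][256][FINE_SIZE]
        const uint8_t* code,            // DIM/COARSE_SIZE + DIM/FINE_SIZE bytes
        float* output) {                // DIM floats
    constexpr int N_COARSE = DIM / COARSE_SIZE;
    for (int i = 0; i < DIM; ++i) {
        const int coarseIdx = i / COARSE_SIZE;
        const int fineIdx = i / FINE_SIZE;
        const int coarseCode = code[coarseIdx];
        const int fineCode = code[N_COARSE + fineIdx];
        output[i] =
                pqCoarseCentroids[(coarseIdx * 256 + coarseCode) * COARSE_SIZE +
                                  i % COARSE_SIZE] +
                pqFineCentroids[(fineIdx * 256 + fineCode) * FINE_SIZE +
                                i % FINE_SIZE];
    }
}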
+ // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + // + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + Index2LevelDecoderImpl< + DIM, + COARSE_SIZE, + FINE_SIZE, + COARSE_BITS, + FINE_BITS, + 0>:: + accum(pqCoarseCentroids, + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +} // namespace cppcontrib +} // namespace faiss +#endif // LEVEL2_NEON_INL_H diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h new file mode 100644 index 000000000..a310bebfc --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMax-inl.h @@ -0,0 +1,467 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { +namespace cppcontrib { + +template +struct IndexMinMaxDecoder { + static constexpr intptr_t dim = SubIndexT::dim; + + // Process 1 sample. + // Performs outputStore = scaler * decoded(code) + minv + static void store( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + const float* const __restrict codeFloat = + reinterpret_cast(code); + const float scaler = codeFloat[0]; + const float minv = codeFloat[1]; + + SubIndexT::store( + pqCoarseCentroids, + pqFineCentroids, + code + 2 * sizeof(float), + outputStore); + for (intptr_t i = 0; i < SubIndexT::dim; i++) { + outputStore[i] = outputStore[i] * scaler + minv; + } + } + + // Process 1 sample. + // Performs outputStore = scaler * decoded(code) + minv + static void store( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + const float* const __restrict codeFloat = + reinterpret_cast(code); + const float scaler = codeFloat[0]; + const float minv = codeFloat[1]; + + SubIndexT::store( + pqFineCentroids, code + 2 * sizeof(float), outputStore); + for (intptr_t i = 0; i < SubIndexT::dim; i++) { + outputStore[i] = outputStore[i] * scaler + minv; + } + } + + // Process 1 sample. + // Performs + // * outputAccum += weight * scaler * decoded(code) + // * minvAccum += weight * minv + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict codeFloat = + reinterpret_cast(code); + const float scaler = codeFloat[0] * weight; + const float minv = codeFloat[1] * weight; + + SubIndexT::accum( + pqCoarseCentroids, + pqFineCentroids, + code + 2 * sizeof(float), + scaler, + outputAccum); + + minvAccum += minv; + } + + // Process 1 sample. 
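In the MinMax wrapper above, the first two floats of every code are a per-vector (scaler, minv) pair; the wrapped SubIndexT decodes the remainder of the code at offset 2 * sizeof(float), and store() then applies output = decoded * scaler + minv. A scalar sketch of that store() logic with a stand-in for the sub-decoder (names are hypothetical):

#include <cstring>

// Scalar reference of the MinMax wrapper's store(). 'decode_sub' stands in
// for SubIndexT::store and fills DIM floats from code + 2 * sizeof(float).
template <int DIM, typename DecodeSub>
void minmax_store_reference(
        const unsigned char* code,
        DecodeSub decode_sub,
        float* output) {
    float scaler, minv;
    std::memcpy(&scaler, code, sizeof(float));
    std::memcpy(&minv, code + sizeof(float), sizeof(float));

    decode_sub(code + 2 * sizeof(float), output);
    for (int i = 0; i < DIM; ++i) {
        output[i] = output[i] * scaler + minv;
    }
}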
+ // Performs + // * outputAccum += weight * scaler * decoded(code) + // * minvAccum += weight * minv + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict codeFloat = + reinterpret_cast(code); + const float scaler = codeFloat[0] * weight; + const float minv = codeFloat[1] * weight; + + SubIndexT::accum( + pqFineCentroids, code + 2 * sizeof(float), scaler, outputAccum); + + minvAccum += minv; + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + SubIndexT::accum( + pqCoarseCentroids0, + pqFineCentroids0, + code0 + 2 * sizeof(float), + scaler0, + pqCoarseCentroids1, + pqFineCentroids1, + code1 + 2 * sizeof(float), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + SubIndexT::accum( + pqCoarseCentroids, + pqFineCentroids, + code0 + 2 * sizeof(float), + scaler0, + code1 + 2 * sizeof(float), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. 
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + SubIndexT::accum( + pqFineCentroids0, + code0 + 2 * sizeof(float), + scaler0, + pqFineCentroids1, + code1 + 2 * sizeof(float), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + SubIndexT::accum( + pqFineCentroids, + code0 + 2 * sizeof(float), + scaler0, + code1 + 2 * sizeof(float), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + const float* const __restrict code2Float = + reinterpret_cast(code2); + const float scaler2 = code2Float[0] * weight2; + const float minv2 = code2Float[1] * weight2; + + SubIndexT::accum( + pqCoarseCentroids0, + pqFineCentroids0, + code0 + 2 * sizeof(float), + scaler0, + pqCoarseCentroids1, + pqFineCentroids1, + code1 + 2 * sizeof(float), + scaler1, + pqCoarseCentroids2, + pqFineCentroids2, + code2 + 2 * sizeof(float), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + const float* const __restrict code2Float = + reinterpret_cast(code2); + const float scaler2 = code2Float[0] * weight2; + const float minv2 = code2Float[1] * weight2; + + SubIndexT::accum( + pqCoarseCentroids, + pqFineCentroids, + code0 + 2 * sizeof(float), + scaler0, + code1 + 2 * sizeof(float), + scaler1, + code2 + 2 * sizeof(float), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
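The accum() overloads exploit weight * (scaler * x + minv) = (weight * scaler) * x + weight * minv: the scaler is folded into the weight passed to SubIndexT::accum, while weight * minv goes into the separate minvAccum scalar. Presumably the caller adds minvAccum to every output component once all codes have been accumulated; a caller-side sketch under that assumption (names are hypothetical):

// Accumulate a weighted sum of MinMax-encoded codes, then fold in the
// accumulated constant term. 'decoder_accum' stands in for one call to
// IndexMinMaxDecoder<SubIndexT>::accum with the shared centroid tables.
template <int DIM, typename AccumOne>
void weighted_sum_reference(
        const unsigned char* codes, // n codes, codeSize bytes each
        const float* weights,
        int n,
        int codeSize,
        AccumOne decoder_accum,     // (code, weight, outputAccum, minvAccum)
        float* outputAccum) {       // DIM floats, assumed zero-initialized
    float minvAccum = 0.0f;
    for (int i = 0; i < n; ++i) {
        decoder_accum(codes + i * codeSize, weights[i], outputAccum, minvAccum);
    }
    // each code contributed weight * minv to every dimension
    for (int i = 0; i < DIM; ++i) {
        outputAccum[i] += minvAccum;
    }
}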
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + const float* const __restrict code2Float = + reinterpret_cast(code2); + const float scaler2 = code2Float[0] * weight2; + const float minv2 = code2Float[1] * weight2; + + SubIndexT::accum( + pqFineCentroids0, + code0 + 2 * sizeof(float), + scaler0, + pqFineCentroids1, + code1 + 2 * sizeof(float), + scaler1, + pqFineCentroids2, + code2 + 2 * sizeof(float), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const float* const __restrict code0Float = + reinterpret_cast(code0); + const float scaler0 = code0Float[0] * weight0; + const float minv0 = code0Float[1] * weight0; + + const float* const __restrict code1Float = + reinterpret_cast(code1); + const float scaler1 = code1Float[0] * weight1; + const float minv1 = code1Float[1] * weight1; + + const float* const __restrict code2Float = + reinterpret_cast(code2); + const float scaler2 = code2Float[0] * weight2; + const float minv2 = code2Float[1] * weight2; + + SubIndexT::accum( + pqFineCentroids, + code0 + 2 * sizeof(float), + scaler0, + code1 + 2 * sizeof(float), + scaler1, + code2 + 2 * sizeof(float), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } +}; + +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h new file mode 100644 index 000000000..b7375fb9c --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/MinMaxFP16-inl.h @@ -0,0 +1,472 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +namespace faiss { +namespace cppcontrib { + +template +struct IndexMinMaxFP16Decoder { + static constexpr intptr_t dim = SubIndexT::dim; + + // Process 1 sample. 
+ // Performs outputStore = scaler * decoded(code) + minv + static void store( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + const uint16_t* const __restrict codeFP16 = + reinterpret_cast(code); + const float scaler = faiss::decode_fp16(codeFP16[0]); + const float minv = faiss::decode_fp16(codeFP16[1]); + + SubIndexT::store( + pqCoarseCentroids, + pqFineCentroids, + code + 2 * sizeof(uint16_t), + outputStore); + for (intptr_t i = 0; i < SubIndexT::dim; i++) { + outputStore[i] = outputStore[i] * scaler + minv; + } + } + + // Process 1 sample. + // Performs outputStore = scaler * decoded(code) + minv + static void store( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + const uint16_t* const __restrict codeFP16 = + reinterpret_cast(code); + const float scaler = faiss::decode_fp16(codeFP16[0]); + const float minv = faiss::decode_fp16(codeFP16[1]); + + SubIndexT::store( + pqFineCentroids, code + 2 * sizeof(uint16_t), outputStore); + for (intptr_t i = 0; i < SubIndexT::dim; i++) { + outputStore[i] = outputStore[i] * scaler + minv; + } + } + + // Process 1 sample. + // Performs + // * outputAccum += weight * scaler * decoded(code) + // * minvAccum += weight * minv + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict codeFP16 = + reinterpret_cast(code); + const float scaler = faiss::decode_fp16(codeFP16[0]) * weight; + const float minv = faiss::decode_fp16(codeFP16[1]) * weight; + + SubIndexT::accum( + pqCoarseCentroids, + pqFineCentroids, + code + 2 * sizeof(uint16_t), + scaler, + outputAccum); + + minvAccum += minv; + } + + // Process 1 sample. + // Performs + // * outputAccum += weight * scaler * decoded(code) + // * minvAccum += weight * minv + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict codeFP16 = + reinterpret_cast(code); + const float scaler = faiss::decode_fp16(codeFP16[0]) * weight; + const float minv = faiss::decode_fp16(codeFP16[1]) * weight; + + SubIndexT::accum( + pqFineCentroids, + code + 2 * sizeof(uint16_t), + scaler, + outputAccum); + + minvAccum += minv; + } + + // Process 2 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
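The FP16 variant stores the per-vector (scaler, minv) pair as two 16-bit half-precision values (a 4-byte header instead of 8 bytes) and converts them with faiss::decode_fp16 before use. For illustration only, a scalar conversion of an IEEE 754 binary16 value to float, assuming decode_fp16 follows the standard format:

#include <cstdint>
#include <cmath>

// Reference binary16 -> float conversion, for illustration; the patch itself
// calls faiss::decode_fp16().
inline float fp16_to_float(uint16_t h) {
    const uint32_t sign = (h >> 15) & 0x1;
    const uint32_t exp  = (h >> 10) & 0x1f;
    const uint32_t frac = h & 0x3ff;

    float value;
    if (exp == 0) {
        // subnormal or zero: frac / 1024 * 2^-14 == frac * 2^-24
        value = std::ldexp(static_cast<float>(frac), -24);
    } else if (exp == 0x1f) {
        value = (frac == 0) ? INFINITY : NAN;
    } else {
        // normal: (1 + frac/1024) * 2^(exp - 15)
        value = std::ldexp(1.0f + frac / 1024.0f, static_cast<int>(exp) - 15);
    }
    return sign ? -value : value;
}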
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + SubIndexT::accum( + pqCoarseCentroids0, + pqFineCentroids0, + code0 + 2 * sizeof(uint16_t), + scaler0, + pqCoarseCentroids1, + pqFineCentroids1, + code1 + 2 * sizeof(uint16_t), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 2 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + SubIndexT::accum( + pqCoarseCentroids, + pqFineCentroids, + code0 + 2 * sizeof(uint16_t), + scaler0, + code1 + 2 * sizeof(uint16_t), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. 
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + SubIndexT::accum( + pqFineCentroids0, + code0 + 2 * sizeof(uint16_t), + scaler0, + pqFineCentroids1, + code1 + 2 * sizeof(uint16_t), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + SubIndexT::accum( + pqFineCentroids, + code0 + 2 * sizeof(uint16_t), + scaler0, + code1 + 2 * sizeof(uint16_t), + scaler1, + outputAccum); + + minvAccum += minv0 + minv1; + } + + // Process 3 samples. + // Each code uses its own coarse pq centroids table and fine pq centroids + // table. 
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqCoarseCentroids0, + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqCoarseCentroids1, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqCoarseCentroids2, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + const uint16_t* const __restrict code2FP16 = + reinterpret_cast(code2); + const float scaler2 = faiss::decode_fp16(code2FP16[0]) * weight2; + const float minv2 = faiss::decode_fp16(code2FP16[1]) * weight2; + + SubIndexT::accum( + pqCoarseCentroids0, + pqFineCentroids0, + code0 + 2 * sizeof(uint16_t), + scaler0, + pqCoarseCentroids1, + pqFineCentroids1, + code1 + 2 * sizeof(uint16_t), + scaler1, + pqCoarseCentroids2, + pqFineCentroids2, + code2 + 2 * sizeof(uint16_t), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } + + // Process 3 samples. + // Coarse pq centroids table and fine pq centroids table are shared among + // codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqCoarseCentroids, + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + const uint16_t* const __restrict code2FP16 = + reinterpret_cast(code2); + const float scaler2 = faiss::decode_fp16(code2FP16[0]) * weight2; + const float minv2 = faiss::decode_fp16(code2FP16[1]) * weight2; + + SubIndexT::accum( + pqCoarseCentroids, + pqFineCentroids, + code0 + 2 * sizeof(uint16_t), + scaler0, + code1 + 2 * sizeof(uint16_t), + scaler1, + code2 + 2 * sizeof(uint16_t), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
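Apart from the narrower header, the FP16 wrapper behaves exactly like IndexMinMaxDecoder: the sub-code starts at offset 2 * sizeof(uint16_t) and the scaler/minv algebra is unchanged. A small sketch of the per-vector code sizes this implies, assuming the default 8-bit PQ layout (one byte per fine code); the numbers are derived from the offsets above, not measured from the patch:

#include <cstdio>

int main() {
    constexpr int dim = 128;
    constexpr int fine_size = 2;
    constexpr int pq_bytes = dim / fine_size;           // plain PQ code
    constexpr int minmax_bytes = pq_bytes + 2 * 4;      // + two float (scaler, minv)
    constexpr int minmax_fp16_bytes = pq_bytes + 2 * 2; // + two fp16 (scaler, minv)
    std::printf("PQ: %d, MinMax header: %d, MinMaxFP16 header: %d bytes\n",
                pq_bytes, minmax_bytes, minmax_fp16_bytes);
}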
+ // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + const uint16_t* const __restrict code2FP16 = + reinterpret_cast(code2); + const float scaler2 = faiss::decode_fp16(code2FP16[0]) * weight2; + const float minv2 = faiss::decode_fp16(code2FP16[1]) * weight2; + + SubIndexT::accum( + pqFineCentroids0, + code0 + 2 * sizeof(uint16_t), + scaler0, + pqFineCentroids1, + code1 + 2 * sizeof(uint16_t), + scaler1, + pqFineCentroids2, + code2 + 2 * sizeof(uint16_t), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + // + // Performs + // * outputAccum += weight0 * scaler0 * decoded(code0) + // + weight1 * scaler1 * decoded(code1) + // + weight2 * scaler2 * decoded(code2) + // * minvAccum += weight0 * minv0 + weight1 * minv1 + weight2 * minv2 + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum, + float& minvAccum) { + const uint16_t* const __restrict code0FP16 = + reinterpret_cast(code0); + const float scaler0 = faiss::decode_fp16(code0FP16[0]) * weight0; + const float minv0 = faiss::decode_fp16(code0FP16[1]) * weight0; + + const uint16_t* const __restrict code1FP16 = + reinterpret_cast(code1); + const float scaler1 = faiss::decode_fp16(code1FP16[0]) * weight1; + const float minv1 = faiss::decode_fp16(code1FP16[1]) * weight1; + + const uint16_t* const __restrict code2FP16 = + reinterpret_cast(code2); + const float scaler2 = faiss::decode_fp16(code2FP16[0]) * weight2; + const float minv2 = faiss::decode_fp16(code2FP16[1]) * weight2; + + SubIndexT::accum( + pqFineCentroids, + code0 + 2 * sizeof(uint16_t), + scaler0, + code1 + 2 * sizeof(uint16_t), + scaler1, + code2 + 2 * sizeof(uint16_t), + scaler2, + outputAccum); + + minvAccum += minv0 + minv1 + minv2; + } +}; + +} // namespace cppcontrib +} // namespace faiss diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h new file mode 100644 index 000000000..d63f52afe --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-avx2-inl.h @@ -0,0 +1,1625 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef PQ_AVX2_INL_H +#define PQ_AVX2_INL_H + +#include + +#include +#include + +#include + +namespace faiss { +namespace cppcontrib { + +//////////////////////////////////////////////////////////////////////////////////// +/// IndexPQDecoder +//////////////////////////////////////////////////////////////////////////////////// + +namespace { + +// Despite the following functions are somewhat redundant, I'd like to keep the +// overall basic blocks similar to ones from Index2LevelDecoder. +// A compiler will optimize away the redundant code. + +// Processes 8 float values. +// Returns { +// [0..1] = *fine0[0..1]; +// [2..3] = *fine1[0..1]; +// [4..5] = *fine2[0..1]; +// [6..7] = *fine3[0..1]; +// } +inline __m256 elementaryBlock2x4b( + const float* const __restrict fine0, + const float* const __restrict fine1, + const float* const __restrict fine2, + const float* const __restrict fine3) { + // load fine + const __m256 fineValue = _mm256_castpd_ps(_mm256_setr_pd( + *reinterpret_cast(fine0), + *reinterpret_cast(fine1), + *reinterpret_cast(fine2), + *reinterpret_cast(fine3))); + + // add coarse and fine + return fineValue; +} + +// Processes 8 float values. +// Returns { +// [0..1] = existingValue[0..1] + weight * (*fine0[0..1]); +// [2..3] = existingValue[0..1] + weight * (*fine1[0..1]); +// [4..5] = existingValue[0..1] + weight * (*fine2[0..1]); +// [6..7] = existingValue[0..1] + weight * (*fine3[0..1]); +// } +inline __m256 elementaryBlock2x4bAccum( + const float* const __restrict fine0, + const float* const __restrict fine1, + const float* const __restrict fine2, + const float* const __restrict fine3, + const float weight, + const __m256 existingValue) { + // add coarse and fine + const __m256 fineValue = elementaryBlock2x4b(fine0, fine1, fine2, fine3); + + // this operation is expected to be optimized by a compiler + const __m256 weightAvx2 = _mm256_set1_ps(weight); + // do fma + return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue); +} + +// Processes 4 float values. +// Returns { +// [0..3] = *fine[0..3]; +// } +inline __m128 elementaryBlock4x1b(const float* const __restrict fine) { + // load fine + const __m128 fineValue = _mm_loadu_ps(fine); + return fineValue; +} + +// Processes 4 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*fine[0..3]); +// } +inline __m128 elementaryBlock4x1bAccum( + const float* const __restrict fine, + const float weight, + const __m128 existingValue) { + const __m128 fineValue = elementaryBlock4x1b(fine); + + // this operation is expected to be optimized by a compiler + const __m128 weightAvx = _mm_set1_ps(weight); + // do fma + return _mm_fmadd_ps(fineValue, weightAvx, existingValue); +} + +// Processes 8 float values. +// Returns { +// [0..3] = *fine0[0..3]; +// [4..7] = *fine1[0..3]; +// } +inline __m256 elementaryBlock4x2b( + const float* const __restrict fine0, + const float* const __restrict fine1) { + // load fine + const __m128 fineValue0 = _mm_loadu_ps(fine0); + const __m128 fineValue1 = _mm_loadu_ps(fine1); + + // combine two 4b into a single 8b + const __m256 combinedFineValue = _mm256_set_m128(fineValue1, fineValue0); + return combinedFineValue; +} + +// Processes 8 float values. 
+// Returns { +// [0..3] = existingValue[0..3] + weight * (*fine0[0..3]); +// [4..7] = existingValue[4..7] + weight * (*fine1[0..3]); +// } +inline __m256 elementaryBlock4x2bAccum( + const float* const __restrict fine0, + const float* const __restrict fine1, + const float weight, + const __m256 existingValue) { + const __m256 fineValue = elementaryBlock4x2b(fine0, fine1); + + // this operation is expected to be optimized by a compiler + const __m256 weightAvx2 = _mm256_set1_ps(weight); + // do fma + return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue); +} + +// Processes 8 float values. +// Returns { +// [0..7] = *fine[0..7]; +// } +inline __m256 elementaryBlock8x1b(const float* const __restrict fine) { + // load fine + const __m256 fineValue = _mm256_loadu_ps(fine); + return fineValue; +} + +// Processes 8 float values. +// Returns { +// [0..7] = existingValue[0..7] + weight * (*fine[0..7]); +// } +inline __m256 elementaryBlock8x1bAccum( + const float* const __restrict fine, + const float weight, + const __m256 existingValue) { + const __m256 fineValue = elementaryBlock8x1b(fine); + + // this operation is expected to be optimized by a compiler + const __m256 weightAvx2 = _mm256_set1_ps(weight); + // do fma + return _mm256_fmadd_ps(fineValue, weightAvx2, existingValue); +} + +// The following code uses template-based for-loop unrolling, +// because the compiler does not do that on its own as needed. +// The idea is the following: +// template +// struct Foo { +// static void bar() { +// doSomething(I); +// Foo::bar(); +// } +// }; +// +// template +// struct Foo { +// static void bar() {} +// }; +// +// Initiate the loop: +// Foo<0, MAX>::bar(); + +template < + intptr_t DIM, + intptr_t FINE_SIZE, + intptr_t FINE_BITS, + intptr_t CPOS, + bool FINE_SIZE_EQ_2 = FINE_SIZE == 2, + bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, + bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), + bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), + bool DIM_EQ_CPOS = DIM == CPOS> +struct IndexPQDecoderImpl; + +template < + intptr_t DIM, + intptr_t FINE_BITS, + intptr_t CPOS, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct IndexPQDecoderImpl< + DIM, + 2, + FINE_BITS, + CPOS, + true, + false, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + false> { + static constexpr intptr_t FINE_SIZE = 2; + + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + + const __m256 storeValue = elementaryBlock2x4b( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + 
fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset); + + _mm256_storeu_ps(outputStore + CPOS, storeValue); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + 
fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
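For FINE_SIZE == 2, one 8-float output chunk spans four fine sub-quantizers, so elementaryBlock2x4b gathers four 2-float centroid slices, loading each pair as a double and packing them with _mm256_setr_pd; the accum variant then applies a single _mm256_fmadd_ps. A scalar reference of that gather (names are hypothetical; not the patch's code path):

// Scalar equivalent of elementaryBlock2x4b: pack four 2-float fine centroid
// slices into one 8-float chunk.
void gather_2x4_scalar(
        const float* fine0,
        const float* fine1,
        const float* fine2,
        const float* fine3,
        float* out) { // 8 floats
    for (int j = 0; j < 2; ++j) {
        out[0 + j] = fine0[j];
        out[2 + j] = fine1[j];
        out[4 + j] = fine2[j];
        out[6 + j] = fine3[j];
    }
}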
+ static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + const intptr_t fineCode2c = detail::UintReader::get(fine2); + const intptr_t fineCode2d = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
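The 2- and 3-sample overloads let a caller amortize the load and store of each output chunk across several codes. A hypothetical caller-side sketch that drains a list of codes in groups of three, falling back to the 1-sample overload for the remainder (accum1/accum3 stand in for the shared-table overloads above):

// Hypothetical caller loop over n codes with per-code weights.
template <typename Accum1, typename Accum3>
void accum_all(
        const unsigned char* codes,
        const float* weights,
        int n,
        int codeSize,
        Accum1 accum1,  // (code, weight)
        Accum3 accum3) { // (code0, w0, code1, w1, code2, w2)
    int i = 0;
    for (; i + 3 <= n; i += 3) {
        accum3(codes + (i + 0) * codeSize, weights[i + 0],
               codes + (i + 1) * codeSize, weights[i + 1],
               codes + (i + 2) * codeSize, weights[i + 2]);
    }
    for (; i < n; ++i) {
        accum1(codes + i * codeSize, weights[i]);
    }
}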
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode0c = detail::UintReader::get(fine0); + const intptr_t fineCode0d = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode1c = detail::UintReader::get(fine1); + const intptr_t fineCode1d = detail::UintReader::get(fine1); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + const intptr_t fineCode2c = detail::UintReader::get(fine2); + const intptr_t fineCode2d = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode0c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode0d) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode1c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode1d) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock2x4bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 2) * FINE_TABLE_BYTES + fineCode2c) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 3) * FINE_TABLE_BYTES + fineCode2d) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +template < + intptr_t DIM, + intptr_t FINE_BITS, + intptr_t CPOS, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct IndexPQDecoderImpl< + DIM, + 4, + FINE_BITS, + CPOS, + false, + true, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + false> { + static constexpr intptr_t FINE_SIZE = 4; + + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static 
constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + + const __m256 storeValue = elementaryBlock4x2b( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset); + + _mm256_storeu_ps(outputStore + CPOS, storeValue); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. 
+ static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
+ static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids0 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids0 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids1 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids1 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids2 + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids2 + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail::UintReader::get(fine0); + const intptr_t fineCode0b = detail::UintReader::get(fine0); + const intptr_t fineCode1a = detail::UintReader::get(fine1); + const intptr_t fineCode1b = detail::UintReader::get(fine1); + const intptr_t fineCode2a = detail::UintReader::get(fine2); + const intptr_t fineCode2b = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode0a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode0b) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode1a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode1b) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + fineCode2a) * FINE_SIZE + fineCentroidOffset, + pqFineCentroids + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + fineCode2b) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +template +struct IndexPQDecoderImpl< + DIM, + FINE_SIZE, + FINE_BITS, + CPOS, + false, + false, + true, + true, + false> { + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 8 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + const __m256 storeValue = elementaryBlock8x1b( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset); + + _mm256_storeu_ps(outputStore + CPOS, storeValue); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 8 float + + const intptr_t fineCode0 = 
detail::UintReader::get(fine0); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 8 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 8 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
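// For illustration, a rough scalar equivalent of one 8-float step performed by
// the specialization above (it reuses the names from the surrounding code; the
// explicit loop stands in for the single __m256 load / fma / store):
//
//   // accumulate one 8-float chunk of the decoded vector at position CPOS
//   for (int j = 0; j < 8; ++j) {
//       const float centroidValue = pqFineCentroids0[
//           (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE +
//           fineCentroidOffset + j];
//       outputAccum[CPOS + j] += weight0 * centroidValue;
//   }
//
// The trailing IndexPQDecoderImpl call then handles the next chunk, until all
// DIM output floats have been accumulated.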
+ static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 8 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 8 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m256 existingValue = _mm256_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm256_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +template +struct IndexPQDecoderImpl< + DIM, + FINE_SIZE, + FINE_BITS, + CPOS, + false, + false, + false, + true, + false> { + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t 
QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 4 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + const __m128 storeValue = elementaryBlock4x1b( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset); + + _mm_storeu_ps(outputStore + CPOS, storeValue); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + + // clang-format on + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // clang-format off + + // process chunks, 4 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 4 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. 
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // clang-format off + + // process chunks, 4 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, weight0, + code1, weight1, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 4 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids0 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids1 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids2 + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, + pqFineCentroids1, code1, weight1, + pqFineCentroids2, code2, weight2, + outputAccum); + + // clang-format on + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // clang-format off + + // process chunks, 4 float + + const intptr_t fineCode0 = detail::UintReader::get(fine0); + const intptr_t fineCode1 = detail::UintReader::get(fine1); + const intptr_t fineCode2 = detail::UintReader::get(fine2); + + __m128 existingValue = _mm_loadu_ps(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * FINE_SIZE + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * FINE_SIZE + fineCentroidOffset, + weight2, + existingValue); + + _mm_storeu_ps(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, code0, weight0, + code1, weight1, + code2, weight2, + outputAccum); + + // clang-format on + } +}; + +// This partial specialization is expected to do nothing. +template < + intptr_t DIM, + intptr_t FINE_SIZE, + intptr_t FINE_BITS, + bool FINE_SIZE_EQ_2, + bool FINE_SIZE_EQ_4, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct IndexPQDecoderImpl< + DIM, + FINE_SIZE, + FINE_BITS, + DIM, + FINE_SIZE_EQ_2, + FINE_SIZE_EQ_4, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + true> { + // clang-format off + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) {} + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} + + // clang-format on +}; + +} // namespace + +// Suitable for PQ[1]x8 +// Suitable for PQ[1]x10 +// Suitable for PQ[1]x12 +// Suitable for PQ[1]x16 +template +struct IndexPQDecoder { + static_assert( + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); + + static constexpr intptr_t dim = DIM; + static constexpr intptr_t fineSize = FINE_SIZE; + static constexpr intptr_t fineBits = FINE_BITS; + + // Process 1 sample. + static void store( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + IndexPQDecoderImpl::store( + pqFineCentroids, code, outputStore); + } + + // Process 1 sample. + // Performs outputAccum += weight * decoded(code) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids, code, weight, outputAccum); + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids, code0, weight0, code1, weight1, outputAccum); + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
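// A minimal usage sketch of this decoder (the concrete sizes below are only an
// illustration and assume the usual <vector>/<cstdint> includes; the centroid
// table size follows from the addressing used by the implementation):
//
//   // d = 16, 8 sub-quantizers of FINE_SIZE = 2 dims each, 8-bit fine codes
//   using Decoder = faiss::cppcontrib::IndexPQDecoder<16, 2, 8>;
//
//   // fine centroid table: (DIM / FINE_SIZE) * (1 << FINE_BITS) * FINE_SIZE floats
//   std::vector<float> centroids((16 / 2) * 256 * 2);
//   std::vector<uint8_t> code(16 / 2);   // one PQ code, 1 byte per sub-quantizer
//   std::vector<float> out(16, 0.0f);
//
//   Decoder::store(centroids.data(), code.data(), out.data());        // out = decode(code)
//   Decoder::accum(centroids.data(), code.data(), 0.5f, out.data());  // out += 0.5 * decode(code)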
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +} // namespace cppcontrib +} // namespace faiss +#endif // PQ_AVX2_INL_H diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h new file mode 100644 index 000000000..de6622de4 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-inl.h @@ -0,0 +1,257 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef PQ_INL_H +#define PQ_INL_H + +#include +#include + +namespace faiss { +namespace cppcontrib { + +//////////////////////////////////////////////////////////////////////////////////// +/// IndexPQDecoder +//////////////////////////////////////////////////////////////////////////////////// + +// Suitable for PQ[1]x8 +template +struct IndexPQDecoder { + static_assert( + FINE_BITS == 8, + "Only 8 bits is currently supported for FINE_BITS"); + + static constexpr intptr_t dim = DIM; + static constexpr intptr_t fineSize = FINE_SIZE; + static constexpr intptr_t fineBits = FINE_BITS; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // Process 1 sample. + // Performs outputStore = decoded(code) + static void store( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine = code; + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t fineCode = fine[fineCentroidIdx]; + + const float* const __restrict finePtr = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode) * + FINE_SIZE + + fineCentroidOffset; + + outputStore[i] = *finePtr; + } + } + + // Process 1 sample. + // Performs outputAccum += weight * decoded(code) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine = code; + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t fineCode = fine[fineCentroidIdx]; + + const float* const __restrict finePtr = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight * (*finePtr); + } + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. 
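// To make the centroid-table layout concrete, a small worked example of the
// addressing above (the numbers are illustrative only): with DIM = 16,
// FINE_SIZE = 2 and FINE_BITS = 8 (so FINE_TABLE_BYTES = 256), output
// dimension i = 5 gives
//
//   fineCentroidIdx    = 5 / 2 = 2      // third sub-quantizer
//   fineCentroidOffset = 5 % 2 = 1      // second component of its centroid
//   fineCode           = code[2]        // say, 7
//   finePtr            = pqFineCentroids + (2 * 256 + 7) * 2 + 1
//                      = pqFineCentroids + 1039
//
// i.e. the table stores, for each sub-quantizer in turn, its 2^FINE_BITS
// centroids of FINE_SIZE contiguous floats each.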
+ // + // Performs + // outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1) + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + + const float* const __restrict finePtr0 = pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1); + } + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + // + // Performs + // outputAccum += weight0 * decoded(code0) + weight1 * decoded(code1) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + + const float* const __restrict finePtr0 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1); + } + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + const intptr_t fineCode2 = fine2[fineCentroidIdx]; + + const float* const __restrict finePtr0 = pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict finePtr2 = pqFineCentroids2 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1) + + weight2 * (*finePtr2); + } + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + +#pragma unroll + for (intptr_t i = 0; i < DIM; i++) { + const intptr_t fineCentroidIdx = i / FINE_SIZE; + const intptr_t fineCentroidOffset = i % FINE_SIZE; + + const intptr_t fineCode0 = fine0[fineCentroidIdx]; + const intptr_t fineCode1 = fine1[fineCentroidIdx]; + const intptr_t fineCode2 = fine2[fineCentroidIdx]; + + const float* const __restrict finePtr0 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict finePtr1 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset; + const float* const __restrict finePtr2 = pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset; + + outputAccum[i] += weight0 * (*finePtr0) + weight1 * (*finePtr1) + + weight2 * (*finePtr2); + } + } +}; + +} // namespace cppcontrib +} // namespace faiss +#endif // PQ_INL_H diff --git a/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h b/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h new file mode 100644 index 000000000..a84014d26 --- /dev/null +++ b/thirdparty/faiss/faiss/cppcontrib/sa_decode/PQ-neon-inl.h @@ -0,0 +1,1460 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef PQ_NEON_INL_H +#define PQ_NEON_INL_H + +#include + +#include +#include + +#include + +namespace faiss { +namespace cppcontrib { + +//////////////////////////////////////////////////////////////////////////////////// +/// IndexPQDecoder +//////////////////////////////////////////////////////////////////////////////////// + +namespace { + +// Despite the following functions are somewhat redundant, I'd like to keep the +// overall basic blocks similar to ones from Index2LevelDecoder. +// A compiler will optimize away the redundant code. + +// Processes 4 float values. +// Returns { +// [0..3] = *fine[0..3]; +// } +inline float32x4_t elementaryBlock4x1b(const float* const __restrict fine) { + // load fine + const auto fineValue = vld1q_f32(fine); + return fineValue; +} + +// Processes 4 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*fine[0..3]); +// } +inline float32x4_t elementaryBlock4x1bAccum( + const float* const __restrict fine, + const float weight, + const float32x4_t existingValue) { + const auto fineValue = elementaryBlock4x1b(fine); + + // this operation is expected to be optimized by a compiler + const auto weightNeon = vdupq_n_f32(weight); + // do fma + return vfmaq_f32(existingValue, weightNeon, fineValue); +} + +// Processes 8 float values. +// Returns { +// [0..3] = *fine0[0..3]; +// [4..7] = *fine1[0..3]; +// } +inline float32x4x2_t elementaryBlock4x2b( + const float* const __restrict fine0, + const float* const __restrict fine1) { + // load fine + const auto fineValue0 = vld1q_f32(fine0); + const auto fineValue1 = vld1q_f32(fine1); + + return {fineValue0, fineValue1}; +} + +// Processes 8 float values. +// Returns { +// [0..3] = existingValue[0..3] + weight * (*fine0[0..3]); +// [4..7] = existingValue[4..7] + weight * (*fine1[0..3]); +// } +inline float32x4x2_t elementaryBlock4x2bAccum( + const float* const __restrict fine0, + const float* const __restrict fine1, + const float weight, + const float32x4x2_t existingValue) { + const auto fineValue = elementaryBlock4x2b(fine0, fine1); + + // this operation is expected to be optimized by a compiler + const auto weightNeon = vdupq_n_f32(weight); + // do fma + const auto result0 = + vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]); + const auto result1 = + vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]); + return {result0, result1}; +} + +// Processes 8 float values. +// Returns { +// [0..7] = *fine[0..7]; +// } +inline float32x4x2_t elementaryBlock8x1b(const float* const __restrict fine) { + // load fine + const auto fineValue0 = vld1q_f32(fine); + const auto fineValue1 = vld1q_f32(fine + 4); + return {fineValue0, fineValue1}; +} + +// Processes 8 float values. 
+// Returns { +// [0..7] = existingValue[0..7] + weight * (*fine[0..7]); +// } +inline float32x4x2_t elementaryBlock8x1bAccum( + const float* const __restrict fine, + const float weight, + const float32x4x2_t existingValue) { + const auto fineValue = elementaryBlock8x1b(fine); + + // this operation is expected to be optimized by a compiler + const auto weightNeon = vdupq_n_f32(weight); + // do fma + const auto result0 = + vfmaq_f32(existingValue.val[0], weightNeon, fineValue.val[0]); + const auto result1 = + vfmaq_f32(existingValue.val[1], weightNeon, fineValue.val[1]); + return {result0, result1}; +} + +// The following code uses template-based for-loop unrolling, +// because the compiler does not do that on its own as needed. +// The idea is the following: +// template +// struct Foo { +// static void bar() { +// doSomething(I); +// Foo::bar(); +// } +// }; +// +// template +// struct Foo { +// static void bar() {} +// }; +// +// Initiate the loop: +// Foo<0, MAX>::bar(); + +template < + intptr_t DIM, + intptr_t FINE_SIZE, + intptr_t FINE_BITS, + intptr_t CPOS, + bool FINE_SIZE_EQ_4 = FINE_SIZE == 4, + bool QPOS_LEFT_GE_8 = (FINE_SIZE - CPOS % FINE_SIZE >= 8), + bool QPOS_LEFT_GE_4 = (FINE_SIZE - CPOS % FINE_SIZE >= 4), + bool DIM_EQ_CPOS = DIM == CPOS> +struct IndexPQDecoderImpl; + +template < + intptr_t DIM, + intptr_t CPOS, + intptr_t FINE_BITS, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct IndexPQDecoderImpl< + DIM, + 4, + FINE_BITS, + CPOS, + true, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + false> { + static constexpr intptr_t FINE_SIZE = 4; + + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + + const auto storeValue = elementaryBlock4x2b( + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset); + + vst1q_f32(outputStore + CPOS, storeValue.val[0]); + vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + 
((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids1 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids1 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. 
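// Spelled out with the template parameter lists, the unrolling pattern
// described in the comment above looks as follows (a minimal, self-contained
// sketch; Foo, doSomething and the bound 16 are illustrative names/values):
//
//   void doSomething(intptr_t i);       // whatever the loop body does
//
//   template <intptr_t I, intptr_t MAX>
//   struct Foo {
//       static void bar() {
//           doSomething(I);
//           Foo<I + 1, MAX>::bar();     // "next iteration", resolved at compile time
//       }
//   };
//
//   template <intptr_t MAX>
//   struct Foo<MAX, MAX> {              // I == MAX: terminate the recursion
//       static void bar() {}
//   };
//
//   Foo<0, 16>::bar();                  // expands to doSomething(0) ... doSomething(15)
//
// IndexPQDecoderImpl plays the role of Foo here: CPOS is the loop counter,
// each specialization advances it by the number of floats it has processed,
// and the DIM == CPOS ("expected to do nothing") specialization terminates
// the recursion.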
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, code0, weight0, code1, weight1, outputAccum); + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + const intptr_t fineCode2a = detail:: + UintReader::get( + fine2); + const intptr_t fineCode2b = detail:: + UintReader::get( + fine2); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqFineCentroids0 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids0 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids1 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids1 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + 
existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids2 + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode2a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids2 + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode2b) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // process chunks, 4 float + // but 8 floats per loop + + const intptr_t fineCode0a = detail:: + UintReader::get( + fine0); + const intptr_t fineCode0b = detail:: + UintReader::get( + fine0); + const intptr_t fineCode1a = detail:: + UintReader::get( + fine1); + const intptr_t fineCode1b = detail:: + UintReader::get( + fine1); + const intptr_t fineCode2a = detail:: + UintReader::get( + fine2); + const intptr_t fineCode2b = detail:: + UintReader::get( + fine2); + + auto existingValue0 = vld1q_f32(outputAccum + CPOS); + auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode0a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode0b) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode1a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode1b) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x2bAccum( + pqFineCentroids + + ((fineCentroidIdx + 0) * FINE_TABLE_BYTES + + fineCode2a) * + FINE_SIZE + + fineCentroidOffset, + pqFineCentroids + + ((fineCentroidIdx + 1) * FINE_TABLE_BYTES + + fineCode2b) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +template +struct IndexPQDecoderImpl< + DIM, + FINE_SIZE, + FINE_BITS, + CPOS, + false, + true, + true, + false> { + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine 
quantizer + const uint8_t* const __restrict fine0 = code0; + + // process chunks, 8 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + const auto storeValue = elementaryBlock8x1b( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + + fineCentroidOffset); + + vst1q_f32(outputStore + CPOS, storeValue.val[0]); + vst1q_f32(outputStore + CPOS + 4, storeValue.val[1]); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // process chunks, 8 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + const auto existingValue = elementaryBlock8x1bAccum( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // process chunks, 8 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. 
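// The accumulation idiom used by the elementaryBlock*Accum helpers, shown in
// isolation (a minimal sketch; accum4, acc, src and w are illustrative names):
//
//   #include <arm_neon.h>
//
//   // acc[0..3] += w * src[0..3], one fused multiply-add per lane
//   inline void accum4(float* acc, const float* src, float w) {
//       float32x4_t a = vld1q_f32(acc);
//       float32x4_t v = vld1q_f32(src);
//       a = vfmaq_f32(a, vdupq_n_f32(w), v);   // a + (broadcast w) * v
//       vst1q_f32(acc, a);
//   }
//
// The 8-float variants apply the same operation to a pair of float32x4_t
// registers (float32x4x2_t), mirroring the single __m256 register used in the
// AVX2 version of this decoder.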
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // process chunks, 8 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, code0, weight0, code1, weight1, outputAccum); + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // process chunks, 8 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids2 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
+ static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // process chunks, 8 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + const auto existingValue0 = vld1q_f32(outputAccum + CPOS); + const auto existingValue1 = vld1q_f32(outputAccum + CPOS + 4); + + auto existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + {existingValue0, existingValue1}); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock8x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue.val[0]); + vst1q_f32(outputAccum + CPOS + 4, existingValue.val[1]); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +template +struct IndexPQDecoderImpl< + DIM, + FINE_SIZE, + FINE_BITS, + CPOS, + false, + false, + true, + false> { + static constexpr intptr_t fineCentroidIdx = CPOS / FINE_SIZE; + static constexpr intptr_t fineCentroidOffset = CPOS % FINE_SIZE; + + static constexpr intptr_t QPOS_LEFT = FINE_SIZE - fineCentroidOffset; + + static constexpr intptr_t FINE_TABLE_BYTES = (1 << FINE_BITS); + + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // process chunks, 4 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + const auto storeValue = elementaryBlock4x1b( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * FINE_SIZE + + fineCentroidOffset); + + vst1q_f32(outputStore + CPOS, storeValue); + + // next + IndexPQDecoderImpl::store( + pqFineCentroids0, code0, outputStore); + } + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + + // process chunks, 4 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, code0, weight0, outputAccum); + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. 
+ static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // process chunks, 4 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + + // process chunks, 4 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, code0, weight0, code1, weight1, outputAccum); + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. 
+ static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // process chunks, 4 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids0 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids1 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids2 + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + // fine quantizer + const uint8_t* const __restrict fine0 = code0; + const uint8_t* const __restrict fine1 = code1; + const uint8_t* const __restrict fine2 = code2; + + // process chunks, 4 float + + const intptr_t fineCode0 = + detail::UintReader:: + get(fine0); + const intptr_t fineCode1 = + detail::UintReader:: + get(fine1); + const intptr_t fineCode2 = + detail::UintReader:: + get(fine2); + + auto existingValue = vld1q_f32(outputAccum + CPOS); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode0) * + FINE_SIZE + + fineCentroidOffset, + weight0, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode1) * + FINE_SIZE + + fineCentroidOffset, + weight1, + existingValue); + + existingValue = elementaryBlock4x1bAccum( + pqFineCentroids + + (fineCentroidIdx * FINE_TABLE_BYTES + fineCode2) * + FINE_SIZE + + fineCentroidOffset, + weight2, + existingValue); + + vst1q_f32(outputAccum + CPOS, existingValue); + + // next + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +// This partial specialization is expected to do nothing. 
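+// It terminates the compile-time recursion of IndexPQDecoderImpl: it is
+// selected once the processed position CPOS reaches DIM, so every store()
+// and accum() overload below is an intentional no-op.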
+template < + intptr_t DIM, + intptr_t FINE_SIZE, + intptr_t FINE_BITS, + bool FINE_SIZE_EQ_4, + bool QPOS_LEFT_GE_8, + bool QPOS_LEFT_GE_4> +struct IndexPQDecoderImpl< + DIM, + FINE_SIZE, + FINE_BITS, + DIM, + FINE_SIZE_EQ_4, + QPOS_LEFT_GE_8, + QPOS_LEFT_GE_4, + true> { + // process 1 sample + static void store( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + float* const __restrict outputStore) {} + + // process 1 sample + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} + + // Process 3 samples. + // Fine pq centroids table is shared among codes. + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) {} +}; +} // namespace + +// Suitable for PQ[1]x8 +// Suitable for PQ[1]x10 +// Suitable for PQ[1]x12 +// Suitable for PQ[1]x16 +template +struct IndexPQDecoder { + static_assert( + FINE_BITS == 8 || FINE_BITS == 10 || FINE_BITS == 12 || + FINE_BITS == 16, + "Only 8, 10, 12 or 16 bits are currently supported for FINE_BITS"); + + static constexpr intptr_t dim = DIM; + static constexpr intptr_t fineSize = FINE_SIZE; + static constexpr intptr_t fineBits = FINE_BITS; + + // Process 1 sample. + static void store( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + float* const __restrict outputStore) { + IndexPQDecoderImpl::store( + pqFineCentroids, code, outputStore); + } + + // Process 1 sample. + // Performs outputAccum += weight * decoded(code) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code, + const float weight, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids, code, weight, outputAccum); + } + + // Process 2 samples. + // Each code uses its own fine pq centroids table. 
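+    // For instance, with hypothetical template parameters DIM = 256,
+    // FINE_SIZE = 8 and FINE_BITS = 8, a call such as
+    //   IndexPQDecoder<256, 8, 8>::accum(tab0, code0, 0.5f, tab1, code1, 0.25f, out);
+    // adds 0.5f * decoded(code0) + 0.25f * decoded(code1) to out[0..255].
+    // More formally: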
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + outputAccum); + } + + // Process 2 samples. + // Fine pq centroids table is shared among codes. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids, code0, weight0, code1, weight1, outputAccum); + } + + // Process 3 samples. + // Each code uses its own fine pq centroids table. + // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqFineCentroids0, + const uint8_t* const __restrict code0, + const float weight0, + const float* const __restrict pqFineCentroids1, + const uint8_t* const __restrict code1, + const float weight1, + const float* const __restrict pqFineCentroids2, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids0, + code0, + weight0, + pqFineCentroids1, + code1, + weight1, + pqFineCentroids2, + code2, + weight2, + outputAccum); + } + + // Process 3 samples. + // Fine pq centroids table is shared among codes. 
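+    // (Equivalent to the three-table overload above called with the same
+    // pqFineCentroids pointer for every code.)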
+ // + // Performs outputAccum += weight0 * decoded(code0) + weight1 * + // decoded(code1) + weight2 * decoded(code2) + static void accum( + const float* const __restrict pqFineCentroids, + const uint8_t* const __restrict code0, + const float weight0, + const uint8_t* const __restrict code1, + const float weight1, + const uint8_t* const __restrict code2, + const float weight2, + float* const __restrict outputAccum) { + IndexPQDecoderImpl::accum( + pqFineCentroids, + code0, + weight0, + code1, + weight1, + code2, + weight2, + outputAccum); + } +}; + +} // namespace cppcontrib +} // namespace faiss +#endif // PQ_NEON_INL_H diff --git a/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp b/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp index 82fea4fb9..e72af5431 100644 --- a/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/AdditiveQuantizer.cpp @@ -17,8 +17,11 @@ #include +#include #include #include +#include +#include #include #include #include @@ -49,17 +52,10 @@ AdditiveQuantizer::AdditiveQuantizer( size_t d, const std::vector& nbits, Search_type_t search_type) - : d(d), + : Quantizer(d), M(nbits.size()), nbits(nbits), - verbose(false), - is_trained(false), search_type(search_type) { - norm_max = norm_min = NAN; - code_size = 0; - tot_bits = 0; - total_codebook_size = 0; - only_8bit = false; set_derived_values(); } @@ -81,27 +77,82 @@ void AdditiveQuantizer::set_derived_values() { } total_codebook_size = codebook_offsets[M]; switch (search_type) { - case ST_decompress: - case ST_LUT_nonorm: - case ST_norm_from_LUT: - break; // nothing to add case ST_norm_float: - tot_bits += 32; + norm_bits = 32; break; case ST_norm_qint8: case ST_norm_cqint8: - tot_bits += 8; + case ST_norm_lsq2x4: + case ST_norm_rq2x4: + norm_bits = 8; break; case ST_norm_qint4: case ST_norm_cqint4: - tot_bits += 4; + norm_bits = 4; + break; + case ST_decompress: + case ST_LUT_nonorm: + case ST_norm_from_LUT: + default: + norm_bits = 0; break; } + tot_bits += norm_bits; // convert bits to bytes code_size = (tot_bits + 7) / 8; } +void AdditiveQuantizer::train_norm(size_t n, const float* norms) { + norm_min = HUGE_VALF; + norm_max = -HUGE_VALF; + for (idx_t i = 0; i < n; i++) { + if (norms[i] < norm_min) { + norm_min = norms[i]; + } + if (norms[i] > norm_max) { + norm_max = norms[i]; + } + } + + if (search_type == ST_norm_cqint8 || search_type == ST_norm_cqint4) { + size_t k = (1 << 8); + if (search_type == ST_norm_cqint4) { + k = (1 << 4); + } + Clustering1D clus(k); + clus.train_exact(n, norms); + qnorm.add(clus.k, clus.centroids.data()); + } else if (search_type == ST_norm_lsq2x4 || search_type == ST_norm_rq2x4) { + std::unique_ptr aq; + if (search_type == ST_norm_lsq2x4) { + aq.reset(new LocalSearchQuantizer(1, 2, 4)); + } else { + aq.reset(new ResidualQuantizer(1, 2, 4)); + } + + aq->train(n, norms); + // flatten aq codebooks + std::vector flat_codebooks(1 << 8); + FAISS_THROW_IF_NOT(aq->codebooks.size() == 32); + + // save norm tables for 4-bit fastscan search + norm_tabs = aq->codebooks; + + // assume big endian + const float* c = norm_tabs.data(); + for (size_t i = 0; i < 16; i++) { + for (size_t j = 0; j < 16; j++) { + flat_codebooks[i * 16 + j] = c[j] + c[16 + i]; + } + } + + qnorm.reset(); + qnorm.add(1 << 8, flat_codebooks.data()); + FAISS_THROW_IF_NOT(qnorm.ntotal == (1 << 8)); + } +} + namespace { // TODO @@ -133,8 +184,7 @@ float decode_qint4(uint8_t i, float amin, float amax) { uint32_t AdditiveQuantizer::encode_qcint(float x) const { idx_t id; - // qnorm.assign(idx_t(1), 
&x, &id, idx_t(1)); - qnorm.assign(idx_t(1), &x, &id, nullptr); + qnorm.assign(1, &x, &id, 1); return uint32_t(id); } @@ -142,23 +192,54 @@ float AdditiveQuantizer::decode_qcint(uint32_t c) const { return qnorm.get_xb()[c]; } +uint64_t AdditiveQuantizer::encode_norm(float norm) const { + switch (search_type) { + case ST_norm_float: + uint32_t inorm; + memcpy(&inorm, &norm, 4); + return inorm; + case ST_norm_qint8: + return encode_qint8(norm, norm_min, norm_max); + case ST_norm_qint4: + return encode_qint4(norm, norm_min, norm_max); + case ST_norm_lsq2x4: + case ST_norm_rq2x4: + case ST_norm_cqint8: + return encode_qcint(norm); + case ST_norm_cqint4: + return encode_qcint(norm); + case ST_decompress: + case ST_LUT_nonorm: + case ST_norm_from_LUT: + default: + return 0; + } +} + void AdditiveQuantizer::pack_codes( size_t n, const int32_t* codes, uint8_t* packed_codes, int64_t ld_codes, - const float* norms) const { + const float* norms, + const float* centroids) const { if (ld_codes == -1) { ld_codes = M; } std::vector norm_buf; if (search_type == ST_norm_float || search_type == ST_norm_qint4 || search_type == ST_norm_qint8 || search_type == ST_norm_cqint8 || - search_type == ST_norm_cqint4) { - if (!norms) { + search_type == ST_norm_cqint4 || search_type == ST_norm_lsq2x4 || + search_type == ST_norm_rq2x4) { + if (centroids != nullptr || !norms) { norm_buf.resize(n); std::vector x_recons(n * d); decode_unpacked(codes, x_recons.data(), n, ld_codes); + + if (centroids != nullptr) { + // x = x + c + fvec_add(n * d, x_recons.data(), centroids, x_recons.data()); + } fvec_norms_L2sqr(norm_buf.data(), x_recons.data(), d, n); norms = norm_buf.data(); } @@ -170,34 +251,8 @@ void AdditiveQuantizer::pack_codes( for (int m = 0; m < M; m++) { bsw.write(codes1[m], nbits[m]); } - switch (search_type) { - case ST_decompress: - case ST_LUT_nonorm: - case ST_norm_from_LUT: - break; - case ST_norm_float: - bsw.write(*(uint32_t*)&norms[i], 32); - break; - case ST_norm_qint8: { - uint8_t b = encode_qint8(norms[i], norm_min, norm_max); - bsw.write(b, 8); - break; - } - case ST_norm_qint4: { - uint8_t b = encode_qint4(norms[i], norm_min, norm_max); - bsw.write(b, 4); - break; - } - case ST_norm_cqint8: { - uint32_t b = encode_qcint(norms[i]); - bsw.write(b, 8); - break; - } - case ST_norm_cqint4: { - uint32_t b = encode_qcint(norms[i]); - bsw.write(b, 4); - break; - } + if (norm_bits != 0) { + bsw.write(encode_norm(norms[i]), norm_bits); } } } @@ -285,32 +340,39 @@ void AdditiveQuantizer::decode_64bit(idx_t bits, float* xi) const { } } -void AdditiveQuantizer::compute_LUT(size_t n, const float* xq, float* LUT) - const { +void AdditiveQuantizer::compute_LUT( + size_t n, + const float* xq, + float* LUT, + float alpha, + long ld_lut) const { // in all cases, it is large matrix multiplication FINTEGER ncenti = total_codebook_size; FINTEGER di = d; FINTEGER nqi = n; - float one = 1, zero = 0; + FINTEGER ldc = ld_lut > 0 ? 
ld_lut : ncenti; + float zero = 0; sgemm_("Transposed", "Not transposed", &ncenti, &nqi, &di, - &one, + &alpha, codebooks.data(), &di, xq, &di, &zero, LUT, - &ncenti); + &ldc); } namespace { +/* compute inner products of one query with all centroids, given a look-up + * table of all inner producst with codebook entries */ void compute_inner_prod_with_LUT( const AdditiveQuantizer& aq, const float* LUT, @@ -450,7 +512,8 @@ float AdditiveQuantizer:: BitstringReader bs(codes, code_size); float accu = accumulate_IPs(*this, bs, codes, LUT); uint32_t norm_i = bs.read(32); - float norm2 = *(float*)&norm_i; + float norm2; + memcpy(&norm2, &norm_i, 4); return norm2 - 2 * accu; } diff --git a/thirdparty/faiss/faiss/impl/AdditiveQuantizer.h b/thirdparty/faiss/faiss/impl/AdditiveQuantizer.h index 4bcc1b9ce..9a4f4fa37 100644 --- a/thirdparty/faiss/faiss/impl/AdditiveQuantizer.h +++ b/thirdparty/faiss/faiss/impl/AdditiveQuantizer.h @@ -7,11 +7,13 @@ #pragma once +#include #include #include #include #include +#include namespace faiss { @@ -21,29 +23,37 @@ namespace faiss { * concatenation of M sub-vectors, additive quantizers sum M sub-vectors * to get the decoded vector. */ -struct AdditiveQuantizer { - size_t d; ///< size of the input vectors +struct AdditiveQuantizer : Quantizer { size_t M; ///< number of codebooks std::vector nbits; ///< bits for each step std::vector codebooks; ///< codebooks // derived values std::vector codebook_offsets; - size_t code_size; ///< code size in bytes - size_t tot_bits; ///< total number of bits - size_t total_codebook_size; ///< size of the codebook in vectors - bool only_8bit; ///< are all nbits = 8 (use faster decoder) + size_t tot_bits = 0; ///< total number of bits (indexes + norms) + size_t norm_bits = 0; ///< bits allocated for the norms + size_t total_codebook_size = 0; ///< size of the codebook in vectors + bool only_8bit = false; ///< are all nbits = 8 (use faster decoder) - bool verbose; ///< verbose during training? - bool is_trained; ///< is trained or not + bool verbose = false; ///< verbose during training? 
+ bool is_trained = false; ///< is trained or not - IndexFlat1D qnorm; ///< store and search norms + IndexFlat1D qnorm; ///< store and search norms + std::vector norm_tabs; ///< store norms of codebook entries for 4-bit + ///< fastscan search - uint32_t encode_qcint( - float x) const; ///< encode norm by non-uniform scalar quantization + /// norms and distance matrixes with beam search can get large, so use this + /// to control for the amount of memory that can be allocated + size_t max_mem_distances = 5 * (size_t(1) << 30); - float decode_qcint(uint32_t c) - const; ///< decode norm by non-uniform scalar quantization + /// encode a norm into norm_bits bits + uint64_t encode_norm(float norm) const; + + /// encode norm by non-uniform scalar quantization + uint32_t encode_qcint(float x) const; + + /// decode norm by non-uniform scalar quantization + float decode_qcint(uint32_t c) const; /// Encodes how search is performed and how vectors are encoded enum Search_type_t { @@ -57,6 +67,10 @@ struct AdditiveQuantizer { ST_norm_qint4, ST_norm_cqint8, ///< use a LUT, and store non-uniform quantized norm ST_norm_cqint4, + + ST_norm_lsq2x4, ///< use a 2x4 bits lsq as norm quantizer (for fast + ///< scan) + ST_norm_rq2x4, ///< use a 2x4 bits rq as norm quantizer (for fast scan) }; AdditiveQuantizer( @@ -69,16 +83,25 @@ struct AdditiveQuantizer { ///< compute derived values when d, M and nbits have been set void set_derived_values(); - ///< Train the additive quantizer - virtual void train(size_t n, const float* x) = 0; + ///< Train the norm quantizer + void train_norm(size_t n, const float* norms); + + void compute_codes(const float* x, uint8_t* codes, size_t n) + const override { + compute_codes_add_centroids(x, codes, n); + } /** Encode a set of vectors * * @param x vectors to encode, size n * d * @param codes output codes, size n * code_size + * @param centroids centroids to be added to x, size n * d */ - virtual void compute_codes(const float* x, uint8_t* codes, size_t n) - const = 0; + virtual void compute_codes_add_centroids( + const float* x, + uint8_t* codes, + size_t n, + const float* centroids = nullptr) const = 0; /** pack a series of code to bit-compact format * @@ -87,27 +110,29 @@ struct AdditiveQuantizer { * @param ld_codes leading dimension of codes * @param norms norms of the vectors (size n). Will be computed if * needed but not provided + * @param centroids centroids to be added to x, size n * d */ void pack_codes( size_t n, const int32_t* codes, uint8_t* packed_codes, int64_t ld_codes = -1, - const float* norms = nullptr) const; + const float* norms = nullptr, + const float* centroids = nullptr) const; /** Decode a set of vectors * * @param codes codes to decode, size n * code_size * @param x output vectors, size n * d */ - void decode(const uint8_t* codes, float* x, size_t n) const; + void decode(const uint8_t* codes, float* x, size_t n) const override; /** Decode a set of vectors in non-packed format * * @param codes codes to decode, size n * ld_codes * @param x output vectors, size n * d */ - void decode_unpacked( + virtual void decode_unpacked( const int32_t* codes, float* x, size_t n, @@ -121,7 +146,7 @@ struct AdditiveQuantizer { Search_type_t search_type; /// min/max for quantization of norms - float norm_min, norm_max; + float norm_min = NAN, norm_max = NAN; template float compute_1_distance_LUT(const uint8_t* codes, const float* LUT) const; @@ -133,7 +158,6 @@ struct AdditiveQuantizer { * Support for exhaustive distance computations with all the centroids. 
* Hence, the number of these centroids should not be too large. ****************************************************************************/ - using idx_t = Index::idx_t; /// decoding function for a code in a 64-bit word void decode_64bit(idx_t n, float* x) const; @@ -143,8 +167,15 @@ struct AdditiveQuantizer { * * @param xq query vector, size (n, d) * @param LUT look-up table, size (n, total_codebook_size) + * @param alpha compute alpha * inner-product + * @param ld_lut leading dimension of LUT */ - void compute_LUT(size_t n, const float* xq, float* LUT) const; + virtual void compute_LUT( + size_t n, + const float* xq, + float* LUT, + float alpha = 1.0f, + long ld_lut = -1) const; /// exact IP search void knn_centroids_inner_product( diff --git a/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp b/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp index 4342ca20f..cebe8a1e2 100644 --- a/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp +++ b/thirdparty/faiss/faiss/impl/AuxIndexStructures.cpp @@ -20,7 +20,7 @@ namespace faiss { * RangeSearchResult ***********************************************************************/ -RangeSearchResult::RangeSearchResult(idx_t nq, bool alloc_lims) : nq(nq) { +RangeSearchResult::RangeSearchResult(size_t nq, bool alloc_lims) : nq(nq) { if (alloc_lims) { lims = new size_t[nq + 1]; memset(lims, 0, sizeof(*lims) * (nq + 1)); @@ -50,15 +50,9 @@ void RangeSearchResult::do_allocation() { } RangeSearchResult::~RangeSearchResult() { - if (labels) { - delete[] labels; - } - if (distances) { - delete[] distances; - } - if (lims) { - delete[] lims; - } + delete[] labels; + delete[] distances; + delete[] lims; } /*********************************************************************** @@ -71,12 +65,8 @@ BufferList::BufferList(size_t buffer_size) : buffer_size(buffer_size) { BufferList::~BufferList() { for (int i = 0; i < buffers.size(); i++) { - if (buffers[i].ids) { - delete[] buffers[i].ids; - } - if (buffers[i].dis) { - delete[] buffers[i].dis; - } + delete[] buffers[i].ids; + delete[] buffers[i].dis; } } @@ -209,60 +199,6 @@ void RangeSearchPartialResult::merge( result->lims[0] = 0; } -/*********************************************************************** - * IDSelectorRange - ***********************************************************************/ - -IDSelectorRange::IDSelectorRange(idx_t imin, idx_t imax) - : imin(imin), imax(imax) {} - -bool IDSelectorRange::is_member(idx_t id) const { - return id >= imin && id < imax; -} - -/*********************************************************************** - * IDSelectorArray - ***********************************************************************/ - -IDSelectorArray::IDSelectorArray(size_t n, const idx_t* ids) : n(n), ids(ids) {} - -bool IDSelectorArray::is_member(idx_t id) const { - for (idx_t i = 0; i < n; i++) { - if (ids[i] == id) - return true; - } - return false; -} - -/*********************************************************************** - * IDSelectorBatch - ***********************************************************************/ - -IDSelectorBatch::IDSelectorBatch(size_t n, const idx_t* indices) { - nbits = 0; - while (n > (1L << nbits)) - nbits++; - nbits += 5; - // for n = 1M, nbits = 25 is optimal, see P56659518 - - mask = (1L << nbits) - 1; - bloom.resize(1UL << (nbits - 3), 0); - for (long i = 0; i < n; i++) { - Index::idx_t id = indices[i]; - set.insert(id); - id &= mask; - bloom[id >> 3] |= 1 << (id & 7); - } -} - -bool IDSelectorBatch::is_member(idx_t i) const { - long im = i & mask; - if 
(!(bloom[im >> 3] & (1 << (im & 7)))) { - return 0; - } - return set.count(i); -} - /*********************************************************** * Interrupt callback ***********************************************************/ @@ -294,7 +230,7 @@ bool InterruptCallback::is_interrupted() { size_t InterruptCallback::get_period_hint(size_t flops) { if (!instance.get()) { - return 1L << 30; // never check + return (size_t)1 << 30; // never check } // for 10M flops, it is reasonable to check once every 10 iterations return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1); diff --git a/thirdparty/faiss/faiss/impl/AuxIndexStructures.h b/thirdparty/faiss/faiss/impl/AuxIndexStructures.h index b7721f418..344a708b7 100644 --- a/thirdparty/faiss/faiss/impl/AuxIndexStructures.h +++ b/thirdparty/faiss/faiss/impl/AuxIndexStructures.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - // Auxiliary index structures, that are used in indexes but that can // be forward-declared @@ -18,10 +16,9 @@ #include #include #include -#include #include -#include +#include #include namespace faiss { @@ -34,15 +31,13 @@ struct RangeSearchResult { size_t nq; ///< nb of queries size_t* lims; ///< size (nq + 1) - typedef Index::idx_t idx_t; - idx_t* labels; ///< result for query i is labels[lims[i]:lims[i+1]] float* distances; ///< corresponding distances (not sorted) size_t buffer_size; ///< size of the result buffers used /// lims must be allocated on input to range_search. - explicit RangeSearchResult(idx_t nq, bool alloc_lims = true); + explicit RangeSearchResult(size_t nq, bool alloc_lims = true); /// called when lims contains the nb of elements result entries /// for each query @@ -52,55 +47,6 @@ struct RangeSearchResult { virtual ~RangeSearchResult(); }; -/** Encapsulates a set of ids to remove. */ -struct IDSelector { - typedef Index::idx_t idx_t; - virtual bool is_member(idx_t id) const = 0; - virtual ~IDSelector() {} -}; - -/** remove ids between [imni, imax) */ -struct IDSelectorRange : IDSelector { - idx_t imin, imax; - - IDSelectorRange(idx_t imin, idx_t imax); - bool is_member(idx_t id) const override; - ~IDSelectorRange() override {} -}; - -/** simple list of elements to remove - * - * this is inefficient in most cases, except for IndexIVF with - * maintain_direct_map - */ -struct IDSelectorArray : IDSelector { - size_t n; - const idx_t* ids; - - IDSelectorArray(size_t n, const idx_t* ids); - bool is_member(idx_t id) const override; - ~IDSelectorArray() override {} -}; - -/** Remove ids from a set. Repetitions of ids in the indices set - * passed to the constructor does not hurt performance. The hash - * function used for the bloom filter and GCC's implementation of - * unordered_set are just the least significant bits of the id. This - * works fine for random ids or ids in sequences but will produce many - * hash collisions if lsb's are always the same */ -struct IDSelectorBatch : IDSelector { - std::unordered_set set; - - typedef unsigned char uint8_t; - std::vector bloom; // assumes low bits of id are a good hash value - int nbits; - idx_t mask; - - IDSelectorBatch(size_t n, const idx_t* indices); - bool is_member(idx_t id) const override; - ~IDSelectorBatch() override {} -}; - /**************************************************************** * Result structures for range search. * @@ -114,8 +60,6 @@ struct IDSelectorBatch : IDSelector { /** List of temporary buffers used to store results before they are * copied to the RangeSearchResult object. 
*/ struct BufferList { - typedef Index::idx_t idx_t; - // buffer sizes in # entries size_t buffer_size; @@ -146,7 +90,6 @@ struct RangeSearchPartialResult; /// result structure for a single query struct RangeQueryResult { - using idx_t = Index::idx_t; idx_t qno; //< id of the query size_t nres; //< nb of results for this query RangeSearchPartialResult* pres; @@ -186,30 +129,6 @@ struct RangeSearchPartialResult : BufferList { bool do_delete = true); }; -/*********************************************************** - * The distance computer maintains a current query and computes - * distances to elements in an index that supports random access. - * - * The DistanceComputer is not intended to be thread-safe (eg. because - * it maintains counters) so the distance functions are not const, - * instantiate one from each thread if needed. - ***********************************************************/ -struct DistanceComputer { - using idx_t = Index::idx_t; - - /// called before computing distances. Pointer x should remain valid - /// while operator () is called - virtual void set_query(const float* x) = 0; - - /// compute distance of vector i to current query - virtual float operator()(idx_t i) = 0; - - /// compute distance between two stored vectors - virtual float symmetric_dis(idx_t i, idx_t j) = 0; - - virtual ~DistanceComputer() {} -}; - /*********************************************************** * Interrupt callback ***********************************************************/ @@ -246,7 +165,7 @@ struct FAISS_API InterruptCallback { /// set implementation optimized for fast access. struct VisitedTable { std::vector visited; - int visno; + uint8_t visno; explicit VisitedTable(int size) : visited(size), visno(1) {} diff --git a/thirdparty/faiss/faiss/impl/CodePacker.cpp b/thirdparty/faiss/faiss/impl/CodePacker.cpp new file mode 100644 index 000000000..36dbf526b --- /dev/null +++ b/thirdparty/faiss/faiss/impl/CodePacker.cpp @@ -0,0 +1,67 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +namespace faiss { + +/********************************************* + * CodePacker + * default of pack_all / unpack_all loops over the _1 versions + */ + +void CodePacker::pack_all(const uint8_t* flat_codes, uint8_t* block) const { + for (size_t i = 0; i < nvec; i++) { + pack_1(flat_codes + code_size * i, i, block); + } +} + +void CodePacker::unpack_all(const uint8_t* block, uint8_t* flat_codes) const { + for (size_t i = 0; i < nvec; i++) { + unpack_1(block, i, flat_codes + code_size * i); + } +} + +/********************************************* + * CodePackerFlat + */ + +CodePackerFlat::CodePackerFlat(size_t code_size) { + this->code_size = code_size; + nvec = 1; + block_size = code_size; +} + +void CodePackerFlat::pack_all(const uint8_t* flat_codes, uint8_t* block) const { + memcpy(block, flat_codes, code_size); +} + +void CodePackerFlat::unpack_all(const uint8_t* block, uint8_t* flat_codes) + const { + memcpy(flat_codes, block, code_size); +} + +void CodePackerFlat::pack_1( + const uint8_t* flat_code, + size_t offset, + uint8_t* block) const { + assert(offset == 0); + pack_all(flat_code, block); +} + +void CodePackerFlat::unpack_1( + const uint8_t* block, + size_t offset, + uint8_t* flat_code) const { + assert(offset == 0); + unpack_all(block, flat_code); +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/CodePacker.h b/thirdparty/faiss/faiss/impl/CodePacker.h new file mode 100644 index 000000000..84c323ed6 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/CodePacker.h @@ -0,0 +1,71 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { + +/** + * Packing consists in combining a fixed number of codes of constant size + * (code_size) into a block of data where they may (or may not) be interleaved + * for efficient consumption by distance computation kernels. This exists for + * the "fast_scan" indexes on CPU and for some GPU kernels. 
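+ * For example, the 4-bit fast-scan layouts typically pack a group of 32
+ * vectors per block and interleave their 4-bit codes so that a single SIMD
+ * lookup covers the same sub-quantizer across many vectors; the exact layout
+ * is defined by the concrete CodePacker subclass.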
+ */ +struct CodePacker { + size_t code_size; // input code size in bytes + size_t nvec; // number of vectors per block + size_t block_size; // size of one block in bytes (>= code_size * nvec) + + // pack a single code to a block + virtual void pack_1( + const uint8_t* + flat_code, // code to write to the block, size code_size + size_t offset, // offset in the block (0 <= offset < nvec) + uint8_t* block // block to write to (size block_size) + ) const = 0; + + // unpack a single code from a block + virtual void unpack_1( + const uint8_t* block, // block to read from (size block_size) + size_t offset, // offset in the block (0 <= offset < nvec) + uint8_t* flat_code // where to write the resulting code, size + // code_size + ) const = 0; + + // pack all code in a block + virtual void pack_all( + const uint8_t* flat_codes, // codes to write to the block, size + // (nvec * code_size) + uint8_t* block // block to write to (size block_size) + ) const; + + // unpack all code in a block + virtual void unpack_all( + const uint8_t* block, // block to read from (size block_size) + uint8_t* flat_codes // where to write the resulting codes size (nvec + // * code_size) + ) const; + + virtual ~CodePacker() {} +}; + +/** Trivial code packer where codes are stored one by one */ +struct CodePackerFlat : CodePacker { + explicit CodePackerFlat(size_t code_size); + + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; + + void pack_all(const uint8_t* flat_codes, uint8_t* block) const final; + void unpack_all(const uint8_t* block, uint8_t* flat_codes) const final; +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/DistanceComputer.h b/thirdparty/faiss/faiss/impl/DistanceComputer.h new file mode 100644 index 000000000..dc46d113f --- /dev/null +++ b/thirdparty/faiss/faiss/impl/DistanceComputer.h @@ -0,0 +1,85 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { + +/*********************************************************** + * The distance computer maintains a current query and computes + * distances to elements in an index that supports random access. + * + * The DistanceComputer is not intended to be thread-safe (eg. because + * it maintains counters) so the distance functions are not const, + * instantiate one from each thread if needed. + * + * Note that the equivalent for IVF indexes is the InvertedListScanner, + * that has additional methods to handle the inverted list context. + ***********************************************************/ +struct DistanceComputer { + /// called before computing distances. Pointer x should remain valid + /// while operator () is called + virtual void set_query(const float* x) = 0; + + /// compute distance of vector i to current query + virtual float operator()(idx_t i) = 0; + + /// compute distances of current query to 4 stored vectors. + /// certain DistanceComputer implementations may benefit + /// heavily from this. 
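+    /// The default implementation below simply evaluates operator() four
+    /// times; storage-aware subclasses may override it to share memory
+    /// accesses across the four computations. The HNSW search loop in this
+    /// patch feeds batches of four prefetched neighbors through this hook.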
+ virtual void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) { + // compute first, assign next + const float d0 = this->operator()(idx0); + const float d1 = this->operator()(idx1); + const float d2 = this->operator()(idx2); + const float d3 = this->operator()(idx3); + dis0 = d0; + dis1 = d1; + dis2 = d2; + dis3 = d3; + } + + /// compute distance between two stored vectors + virtual float symmetric_dis(idx_t i, idx_t j) = 0; + + virtual ~DistanceComputer() {} +}; + +/************************************************************* + * Specialized version of the DistanceComputer when we know that codes are + * laid out in a flat index. + */ +struct FlatCodesDistanceComputer : DistanceComputer { + const uint8_t* codes; + size_t code_size; + + FlatCodesDistanceComputer(const uint8_t* codes, size_t code_size) + : codes(codes), code_size(code_size) {} + + FlatCodesDistanceComputer() : codes(nullptr), code_size(0) {} + + float operator()(idx_t i) override { + return distance_to_code(codes + i * code_size); + } + + /// compute distance of current query to an encoded vector + virtual float distance_to_code(const uint8_t* code) = 0; + + virtual ~FlatCodesDistanceComputer() {} +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/FaissException.h b/thirdparty/faiss/faiss/impl/FaissException.h index bc8bb9aca..5e5bcf1a3 100644 --- a/thirdparty/faiss/faiss/impl/FaissException.h +++ b/thirdparty/faiss/faiss/impl/FaissException.h @@ -1,3 +1,4 @@ + /** * Copyright (c) Facebook, Inc. and its affiliates. * @@ -79,6 +80,23 @@ struct ScopeDeleter1 { } }; +/** RAII object for a set of possibly transformed vectors (deallocated only if + * they are indeed transformed) + */ +struct TransformedVectors { + const float* x; + bool own_x; + TransformedVectors(const float* x_orig, const float* x) : x(x) { + own_x = x_orig != x; + } + + ~TransformedVectors() { + if (own_x) { + delete[] x; + } + } +}; + /// make typeids more readable std::string demangle_cpp_symbol(const char* name); diff --git a/thirdparty/faiss/faiss/impl/HNSW.cpp b/thirdparty/faiss/faiss/impl/HNSW.cpp index d5678e372..9fc201ea3 100644 --- a/thirdparty/faiss/faiss/impl/HNSW.cpp +++ b/thirdparty/faiss/faiss/impl/HNSW.cpp @@ -12,6 +12,18 @@ #include #include +#include +#include +#include + +#include + +#ifdef __AVX2__ +#include + +#include +#include +#endif namespace faiss { @@ -45,11 +57,6 @@ void HNSW::neighbor_range(idx_t no, int layer_no, size_t* begin, size_t* end) HNSW::HNSW(int M) : rng(12345) { set_default_probas(M, 1.0 / log(M)); - max_level = -1; - entry_point = -1; - efSearch = 16; - efConstruction = 40; - upper_beam = 1; offsets.push_back(0); } @@ -215,7 +222,7 @@ int HNSW::prepare_level_tab(size_t n, bool preset_levels) { return max_level; } -/** Enumerate vertices from farthest to nearest from query, keep a +/** Enumerate vertices from nearest to farthest from query, keep a * neighbor only if there is no previous neighbor that is closer to * that vertex than the query. 
*/ @@ -501,9 +508,18 @@ void HNSW::add_with_locks( } } +/************************************************************** + * Searching + **************************************************************/ + +namespace { + +using MinimaxHeap = HNSW::MinimaxHeap; +using Node = HNSW::Node; /** Do a BFS on the candidates list */ -int HNSW::search_from_candidates( +int search_from_candidates( + const HNSW& hnsw, DistanceComputer& qdis, int k, idx_t* I, @@ -512,22 +528,31 @@ int HNSW::search_from_candidates( VisitedTable& vt, HNSWStats& stats, int level, - int nres_in) const { + int nres_in = 0, + const SearchParametersHNSW* params = nullptr) { int nres = nres_in; int ndis = 0; + + // can be overridden by search params + bool do_dis_check = params ? params->check_relative_distance + : hnsw.check_relative_distance; + int efSearch = params ? params->efSearch : hnsw.efSearch; + const IDSelector* sel = params ? params->sel : nullptr; + for (int i = 0; i < candidates.size(); i++) { idx_t v1 = candidates.ids[i]; float d = candidates.dis[i]; FAISS_ASSERT(v1 >= 0); - if (nres < k) { - faiss::maxheap_push(++nres, D, I, d, v1); - } else if (d < D[0]) { - faiss::maxheap_replace_top(nres, D, I, d, v1); + if (!sel || sel->is_member(v1)) { + if (nres < k) { + faiss::maxheap_push(++nres, D, I, d, v1); + } else if (d < D[0]) { + faiss::maxheap_replace_top(nres, D, I, d, v1); + } } vt.set(v1); } - bool do_dis_check = check_relative_distance; int nstep = 0; while (candidates.size() > 0) { @@ -546,24 +571,87 @@ int HNSW::search_from_candidates( } size_t begin, end; - neighbor_range(v0, level, &begin, &end); - + hnsw.neighbor_range(v0, level, &begin, &end); + + // // baseline version + // for (size_t j = begin; j < end; j++) { + // int v1 = hnsw.neighbors[j]; + // if (v1 < 0) + // break; + // if (vt.get(v1)) { + // continue; + // } + // vt.set(v1); + // ndis++; + // float d = qdis(v1); + // if (!sel || sel->is_member(v1)) { + // if (nres < k) { + // faiss::maxheap_push(++nres, D, I, d, v1); + // } else if (d < D[0]) { + // faiss::maxheap_replace_top(nres, D, I, d, v1); + // } + // } + // candidates.push(v1, d); + // } + + // the following version processes 4 neighbors at a time + size_t jmax = begin; for (size_t j = begin; j < end; j++) { - int v1 = neighbors[j]; + int v1 = hnsw.neighbors[j]; if (v1 < 0) break; - if (vt.get(v1)) { - continue; + + prefetch_L2(vt.visited.data() + v1); + jmax += 1; + } + + int counter = 0; + size_t saved_j[4]; + + ndis += jmax - begin; + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (!sel || sel->is_member(idx)) { + if (nres < k) { + faiss::maxheap_push(++nres, D, I, dis, idx); + } else if (dis < D[0]) { + faiss::maxheap_replace_top(nres, D, I, dis, idx); + } } + candidates.push(idx, dis); + }; + + for (size_t j = begin; j < jmax; j++) { + int v1 = hnsw.neighbors[j]; + + bool vget = vt.get(v1); vt.set(v1); - ndis++; - float d = qdis(v1); - if (nres < k) { - faiss::maxheap_push(++nres, D, I, d, v1); - } else if (d < D[0]) { - faiss::maxheap_replace_top(nres, D, I, d, v1); + saved_j[counter] = v1; + counter += vget ? 
0 : 1; + + if (counter == 4) { + float dis[4]; + qdis.distances_batch_4( + saved_j[0], + saved_j[1], + saved_j[2], + saved_j[3], + dis[0], + dis[1], + dis[2], + dis[3]); + + for (size_t id4 = 0; id4 < 4; id4++) { + add_to_heap(saved_j[id4], dis[id4]); + } + + counter = 0; } - candidates.push(v1, d); + } + + for (size_t icnt = 0; icnt < counter; icnt++) { + float dis = qdis(saved_j[icnt]); + add_to_heap(saved_j[icnt], dis); } nstep++; @@ -583,16 +671,13 @@ int HNSW::search_from_candidates( return nres; } -/************************************************************** - * Searching - **************************************************************/ - -std::priority_queue HNSW::search_from_candidate_unbounded( +std::priority_queue search_from_candidate_unbounded( + const HNSW& hnsw, const Node& node, DistanceComputer& qdis, int ef, VisitedTable* vt, - HNSWStats& stats) const { + HNSWStats& stats) { int ndis = 0; std::priority_queue top_candidates; std::priority_queue, std::greater> candidates; @@ -614,31 +699,94 @@ std::priority_queue HNSW::search_from_candidate_unbounded( candidates.pop(); size_t begin, end; - neighbor_range(v0, 0, &begin, &end); - - for (size_t j = begin; j < end; ++j) { - int v1 = neighbors[j]; - - if (v1 < 0) { + hnsw.neighbor_range(v0, 0, &begin, &end); + + // // baseline version + // for (size_t j = begin; j < end; ++j) { + // int v1 = hnsw.neighbors[j]; + // + // if (v1 < 0) { + // break; + // } + // if (vt->get(v1)) { + // continue; + // } + // + // vt->set(v1); + // + // float d1 = qdis(v1); + // ++ndis; + // + // if (top_candidates.top().first > d1 || + // top_candidates.size() < ef) { + // candidates.emplace(d1, v1); + // top_candidates.emplace(d1, v1); + // + // if (top_candidates.size() > ef) { + // top_candidates.pop(); + // } + // } + // } + + // the following version processes 4 neighbors at a time + size_t jmax = begin; + for (size_t j = begin; j < end; j++) { + int v1 = hnsw.neighbors[j]; + if (v1 < 0) break; - } - if (vt->get(v1)) { - continue; - } - vt->set(v1); + prefetch_L2(vt->visited.data() + v1); + jmax += 1; + } - float d1 = qdis(v1); - ++ndis; + int counter = 0; + size_t saved_j[4]; - if (top_candidates.top().first > d1 || top_candidates.size() < ef) { - candidates.emplace(d1, v1); - top_candidates.emplace(d1, v1); + ndis += jmax - begin; + + auto add_to_heap = [&](const size_t idx, const float dis) { + if (top_candidates.top().first > dis || + top_candidates.size() < ef) { + candidates.emplace(dis, idx); + top_candidates.emplace(dis, idx); if (top_candidates.size() > ef) { top_candidates.pop(); } } + }; + + for (size_t j = begin; j < jmax; j++) { + int v1 = hnsw.neighbors[j]; + + bool vget = vt->get(v1); + vt->set(v1); + saved_j[counter] = v1; + counter += vget ? 
0 : 1; + + if (counter == 4) { + float dis[4]; + qdis.distances_batch_4( + saved_j[0], + saved_j[1], + saved_j[2], + saved_j[3], + dis[0], + dis[1], + dis[2], + dis[3]); + + for (size_t id4 = 0; id4 < 4; id4++) { + add_to_heap(saved_j[id4], dis[id4]); + } + + counter = 0; + } + } + + for (size_t icnt = 0; icnt < counter; icnt++) { + float dis = qdis(saved_j[icnt]); + add_to_heap(saved_j[icnt], dis); } } @@ -651,14 +799,19 @@ std::priority_queue HNSW::search_from_candidate_unbounded( return top_candidates; } +} // anonymous namespace + HNSWStats HNSW::search( DistanceComputer& qdis, int k, idx_t* I, float* D, - VisitedTable& vt) const { + VisitedTable& vt, + const SearchParametersHNSW* params) const { HNSWStats stats; - + if (entry_point == -1) { + return stats; + } if (upper_beam == 1) { // greedy search on upper levels storage_idx_t nearest = entry_point; @@ -669,16 +822,22 @@ HNSWStats HNSW::search( } int ef = std::max(efSearch, k); - if (search_bounded_queue) { + if (search_bounded_queue) { // this is the most common branch MinimaxHeap candidates(ef); candidates.push(nearest, d_nearest); - search_from_candidates(qdis, k, I, D, candidates, vt, stats, 0); + search_from_candidates( + *this, qdis, k, I, D, candidates, vt, stats, 0, 0, params); } else { std::priority_queue top_candidates = search_from_candidate_unbounded( - Node(d_nearest, nearest), qdis, ef, &vt, stats); + *this, + Node(d_nearest, nearest), + qdis, + ef, + &vt, + stats); while (top_candidates.size() > k) { top_candidates.pop(); @@ -718,9 +877,10 @@ HNSWStats HNSW::search( if (level == 0) { nres = search_from_candidates( - qdis, k, I, D, candidates, vt, stats, 0); + *this, qdis, k, I, D, candidates, vt, stats, 0); } else { nres = search_from_candidates( + *this, qdis, candidates_size, I_to_next.data(), @@ -737,12 +897,111 @@ HNSWStats HNSW::search( return stats; } +void HNSW::search_level_0( + DistanceComputer& qdis, + int k, + idx_t* idxi, + float* simi, + idx_t nprobe, + const storage_idx_t* nearest_i, + const float* nearest_d, + int search_type, + HNSWStats& search_stats, + VisitedTable& vt) const { + const HNSW& hnsw = *this; + + if (search_type == 1) { + int nres = 0; + + for (int j = 0; j < nprobe; j++) { + storage_idx_t cj = nearest_i[j]; + + if (cj < 0) + break; + + if (vt.get(cj)) + continue; + + int candidates_size = std::max(hnsw.efSearch, int(k)); + MinimaxHeap candidates(candidates_size); + + candidates.push(cj, nearest_d[j]); + + nres = search_from_candidates( + hnsw, + qdis, + k, + idxi, + simi, + candidates, + vt, + search_stats, + 0, + nres); + } + } else if (search_type == 2) { + int candidates_size = std::max(hnsw.efSearch, int(k)); + candidates_size = std::max(candidates_size, int(nprobe)); + + MinimaxHeap candidates(candidates_size); + for (int j = 0; j < nprobe; j++) { + storage_idx_t cj = nearest_i[j]; + + if (cj < 0) + break; + candidates.push(cj, nearest_d[j]); + } + + search_from_candidates( + hnsw, qdis, k, idxi, simi, candidates, vt, search_stats, 0); + } +} + +void HNSW::permute_entries(const idx_t* map) { + // remap levels + storage_idx_t ntotal = levels.size(); + std::vector imap(ntotal); // inverse mapping + // map: new index -> old index + // imap: old index -> new index + for (int i = 0; i < ntotal; i++) { + assert(map[i] >= 0 && map[i] < ntotal); + imap[map[i]] = i; + } + if (entry_point != -1) { + entry_point = imap[entry_point]; + } + std::vector new_levels(ntotal); + std::vector new_offsets(ntotal + 1); + std::vector new_neighbors(neighbors.size()); + size_t no = 0; + for (int i = 0; i < 
ntotal; i++) { + storage_idx_t o = map[i]; // corresponding "old" index + new_levels[i] = levels[o]; + for (size_t j = offsets[o]; j < offsets[o + 1]; j++) { + storage_idx_t neigh = neighbors[j]; + new_neighbors[no++] = neigh >= 0 ? imap[neigh] : neigh; + } + new_offsets[i + 1] = no; + } + assert(new_offsets[ntotal] == offsets[ntotal]); + // swap everyone + std::swap(levels, new_levels); + std::swap(offsets, new_offsets); + std::swap(neighbors, new_neighbors); +} + +/************************************************************** + * MinimaxHeap + **************************************************************/ + void HNSW::MinimaxHeap::push(storage_idx_t i, float v) { if (k == n) { if (v >= dis[0]) return; + if (ids[0] != -1) { + --nvalid; + } faiss::heap_pop(k--, dis.data(), ids.data()); - --nvalid; } faiss::heap_push(++k, dis.data(), ids.data(), v, i); ++nvalid; @@ -760,17 +1019,105 @@ void HNSW::MinimaxHeap::clear() { nvalid = k = 0; } +#ifdef __AVX2__ +int HNSW::MinimaxHeap::pop_min(float* vmin_out) { + assert(k > 0); + static_assert( + std::is_same::value, + "This code expects storage_idx_t to be int32_t"); + + int32_t min_idx = -1; + float min_dis = std::numeric_limits::infinity(); + + size_t iii = 0; + + __m256i min_indices = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, -1); + __m256 min_distances = + _mm256_set1_ps(std::numeric_limits::infinity()); + __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i offset = _mm256_set1_epi32(8); + + // The baseline version is available in non-AVX2 branch. + + // The following loop tracks the rightmost index with the min distance. + // -1 index values are ignored. + const int k8 = (k / 8) * 8; + for (; iii < k8; iii += 8) { + __m256i indices = + _mm256_loadu_si256((const __m256i*)(ids.data() + iii)); + __m256 distances = _mm256_loadu_ps(dis.data() + iii); + + // This mask filters out -1 values among indices. + __m256i m1mask = _mm256_cmpgt_epi32(_mm256_setzero_si256(), indices); + + __m256i dmask = _mm256_castps_si256( + _mm256_cmp_ps(min_distances, distances, _CMP_LT_OS)); + __m256 finalmask = _mm256_castsi256_ps(_mm256_or_si256(m1mask, dmask)); + + const __m256i min_indices_new = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(current_indices), + _mm256_castsi256_ps(min_indices), + finalmask)); + + const __m256 min_distances_new = + _mm256_blendv_ps(distances, min_distances, finalmask); + + min_indices = min_indices_new; + min_distances = min_distances_new; + + current_indices = _mm256_add_epi32(current_indices, offset); + } + + // Vectorizing is doable, but is not practical + int32_t vidx8[8]; + float vdis8[8]; + _mm256_storeu_ps(vdis8, min_distances); + _mm256_storeu_si256((__m256i*)vidx8, min_indices); + + for (size_t j = 0; j < 8; j++) { + if (min_dis > vdis8[j] || (min_dis == vdis8[j] && min_idx < vidx8[j])) { + min_idx = vidx8[j]; + min_dis = vdis8[j]; + } + } + + // process last values. Vectorizing is doable, but is not practical + for (; iii < k; iii++) { + if (ids[iii] != -1 && dis[iii] <= min_dis) { + min_dis = dis[iii]; + min_idx = iii; + } + } + + if (min_idx == -1) { + return -1; + } + + if (vmin_out) { + *vmin_out = min_dis; + } + int ret = ids[min_idx]; + ids[min_idx] = -1; + --nvalid; + return ret; +} + +#else + +// baseline non-vectorized version int HNSW::MinimaxHeap::pop_min(float* vmin_out) { assert(k > 0); // returns min. 
This is an O(n) operation int i = k - 1; while (i >= 0) { - if (ids[i] != -1) + if (ids[i] != -1) { break; + } i--; } - if (i == -1) + if (i == -1) { return -1; + } int imin = i; float vmin = dis[i]; i--; @@ -781,14 +1128,16 @@ int HNSW::MinimaxHeap::pop_min(float* vmin_out) { } i--; } - if (vmin_out) + if (vmin_out) { *vmin_out = vmin; + } int ret = ids[imin]; ids[imin] = -1; --nvalid; return ret; } +#endif int HNSW::MinimaxHeap::count_below(float thresh) { int n_below = 0; diff --git a/thirdparty/faiss/faiss/impl/HNSW.h b/thirdparty/faiss/faiss/impl/HNSW.h index 6e133b1bc..c923e0a6a 100644 --- a/thirdparty/faiss/faiss/impl/HNSW.h +++ b/thirdparty/faiss/faiss/impl/HNSW.h @@ -43,12 +43,16 @@ struct VisitedTable; struct DistanceComputer; // from AuxIndexStructures struct HNSWStats; +struct SearchParametersHNSW : SearchParameters { + int efSearch = 16; + bool check_relative_distance = true; + + ~SearchParametersHNSW() {} +}; + struct HNSW { /// internal storage of vectors (32 bits: this is expensive) - typedef int storage_idx_t; - - /// Faiss results are 64-bit - typedef Index::idx_t idx_t; + using storage_idx_t = int32_t; typedef std::pair Node; @@ -117,25 +121,25 @@ struct HNSW { /// entry point in the search structure (one of the points with maximum /// level - storage_idx_t entry_point; + storage_idx_t entry_point = -1; faiss::RandomGenerator rng; /// maximum level - int max_level; + int max_level = -1; /// expansion factor at construction time - int efConstruction; + int efConstruction = 40; /// expansion factor at search time - int efSearch; + int efSearch = 16; /// during search: do we check whether the next best distance is good /// enough? bool check_relative_distance = true; /// number of entry points in levels > 0. - int upper_beam; + int upper_beam = 1; /// use bounded queue during exploration bool search_bounded_queue = true; @@ -188,30 +192,26 @@ struct HNSW { std::vector& locks, VisitedTable& vt); - int search_from_candidates( + /// search interface for 1 point, single thread + HNSWStats search( DistanceComputer& qdis, int k, idx_t* I, float* D, - MinimaxHeap& candidates, VisitedTable& vt, - HNSWStats& stats, - int level, - int nres_in = 0) const; + const SearchParametersHNSW* params = nullptr) const; - std::priority_queue search_from_candidate_unbounded( - const Node& node, - DistanceComputer& qdis, - int ef, - VisitedTable* vt, - HNSWStats& stats) const; - - /// search interface - HNSWStats search( + /// search only in level 0 from a given vertex + void search_level_0( DistanceComputer& qdis, int k, - idx_t* I, - float* D, + idx_t* idxi, + float* simi, + idx_t nprobe, + const storage_idx_t* nearest_i, + const float* nearest_d, + int search_type, + HNSWStats& search_stats, VisitedTable& vt) const; void reset(); @@ -226,6 +226,8 @@ struct HNSW { std::priority_queue& input, std::vector& output, int max_size); + + void permute_entries(const idx_t* map); }; struct HNSWStats { diff --git a/thirdparty/faiss/faiss/impl/IDSelector.cpp b/thirdparty/faiss/faiss/impl/IDSelector.cpp new file mode 100644 index 000000000..e4a4bba96 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/IDSelector.cpp @@ -0,0 +1,125 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +namespace faiss { + +/*********************************************************************** + * IDSelectorRange + ***********************************************************************/ + +IDSelectorRange::IDSelectorRange(idx_t imin, idx_t imax, bool assume_sorted) + : imin(imin), imax(imax), assume_sorted(assume_sorted) {} + +bool IDSelectorRange::is_member(idx_t id) const { + return id >= imin && id < imax; +} + +void IDSelectorRange::find_sorted_ids_bounds( + size_t list_size, + const idx_t* ids, + size_t* jmin_out, + size_t* jmax_out) const { + FAISS_ASSERT(assume_sorted); + if (list_size == 0 || imax <= ids[0] || imin > ids[list_size - 1]) { + *jmin_out = *jmax_out = 0; + return; + } + // bissection to find imin + if (ids[0] >= imin) { + *jmin_out = 0; + } else { + size_t j0 = 0, j1 = list_size; + while (j1 > j0 + 1) { + size_t jmed = (j0 + j1) / 2; + if (ids[jmed] >= imin) { + j1 = jmed; + } else { + j0 = jmed; + } + } + *jmin_out = j1; + } + // bissection to find imax + if (*jmin_out == list_size || ids[*jmin_out] >= imax) { + *jmax_out = *jmin_out; + } else { + size_t j0 = *jmin_out, j1 = list_size; + while (j1 > j0 + 1) { + size_t jmed = (j0 + j1) / 2; + if (ids[jmed] >= imax) { + j1 = jmed; + } else { + j0 = jmed; + } + } + *jmax_out = j1; + } +} + +/*********************************************************************** + * IDSelectorArray + ***********************************************************************/ + +IDSelectorArray::IDSelectorArray(size_t n, const idx_t* ids) : n(n), ids(ids) {} + +bool IDSelectorArray::is_member(idx_t id) const { + for (idx_t i = 0; i < n; i++) { + if (ids[i] == id) + return true; + } + return false; +} + +/*********************************************************************** + * IDSelectorBatch + ***********************************************************************/ + +IDSelectorBatch::IDSelectorBatch(size_t n, const idx_t* indices) { + nbits = 0; + while (n > ((idx_t)1 << nbits)) { + nbits++; + } + nbits += 5; + // for n = 1M, nbits = 25 is optimal, see P56659518 + + mask = ((idx_t)1 << nbits) - 1; + bloom.resize((idx_t)1 << (nbits - 3), 0); + for (idx_t i = 0; i < n; i++) { + idx_t id = indices[i]; + set.insert(id); + id &= mask; + bloom[id >> 3] |= 1 << (id & 7); + } +} + +bool IDSelectorBatch::is_member(idx_t i) const { + long im = i & mask; + if (!(bloom[im >> 3] & (1 << (im & 7)))) { + return 0; + } + return set.count(i); +} + +/*********************************************************************** + * IDSelectorBitmap + ***********************************************************************/ + +IDSelectorBitmap::IDSelectorBitmap(size_t n, const uint8_t* bitmap) + : n(n), bitmap(bitmap) {} + +bool IDSelectorBitmap::is_member(idx_t ii) const { + uint64_t i = ii; + if ((i >> 3) >= n) { + return false; + } + return (bitmap[i >> 3] >> (i & 7)) & 1; +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/IDSelector.h b/thirdparty/faiss/faiss/impl/IDSelector.h new file mode 100644 index 000000000..dd56cff66 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/IDSelector.h @@ -0,0 +1,173 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +#include + +/** IDSelector is intended to define a subset of vectors to handle (for removal + * or as subset to search) */ + +namespace faiss { + +/** Encapsulates a set of ids to handle. */ +struct IDSelector { + virtual bool is_member(idx_t id) const = 0; + virtual ~IDSelector() {} +}; + +/** ids between [imin, imax) */ +struct IDSelectorRange : IDSelector { + idx_t imin, imax; + + /// Assume that the ids to handle are sorted. In some cases this can speed + /// up processing + bool assume_sorted; + + IDSelectorRange(idx_t imin, idx_t imax, bool assume_sorted = false); + + bool is_member(idx_t id) const final; + + /// for sorted ids, find the range of list indices where the valid ids are + /// stored + void find_sorted_ids_bounds( + size_t list_size, + const idx_t* ids, + size_t* jmin, + size_t* jmax) const; + + ~IDSelectorRange() override {} +}; + +/** Simple array of elements + * + * is_member calls are very inefficient, but some operations can use the ids + * directly. + */ +struct IDSelectorArray : IDSelector { + size_t n; + const idx_t* ids; + + /** Construct with an array of ids to process + * + * @param n number of ids to store + * @param ids elements to store. The pointer should remain valid during + * IDSelectorArray's lifetime + */ + IDSelectorArray(size_t n, const idx_t* ids); + bool is_member(idx_t id) const final; + ~IDSelectorArray() override {} +}; + +/** Ids from a set. + * + * Repetitions of ids in the indices set passed to the constructor does not hurt + * performance. + * + * The hash function used for the bloom filter and GCC's implementation of + * unordered_set are just the least significant bits of the id. This works fine + * for random ids or ids in sequences but will produce many hash collisions if + * lsb's are always the same + */ +struct IDSelectorBatch : IDSelector { + std::unordered_set set; + + // Bloom filter to avoid accessing the unordered set if it is unlikely + // to be true + std::vector bloom; + int nbits; + idx_t mask; + + /** Construct with an array of ids to process + * + * @param n number of ids to store + * @param ids elements to store. The pointer can be released after + * construction + */ + IDSelectorBatch(size_t n, const idx_t* indices); + bool is_member(idx_t id) const final; + ~IDSelectorBatch() override {} +}; + +/** One bit per element. Constructed with a bitmap, size ceil(n / 8). + */ +struct IDSelectorBitmap : IDSelector { + size_t n; + const uint8_t* bitmap; + + /** Construct with a binary mask + * + * @param n size of the bitmap array + * @param bitmap id will be selected iff id / 8 < n and bit number + * (i%8) of bitmap[floor(i / 8)] is 1. + */ + IDSelectorBitmap(size_t n, const uint8_t* bitmap); + bool is_member(idx_t id) const final; + ~IDSelectorBitmap() override {} +}; + +/** reverts the membership test of another selector */ +struct IDSelectorNot : IDSelector { + const IDSelector* sel; + IDSelectorNot(const IDSelector* sel) : sel(sel) {} + bool is_member(idx_t id) const final { + return !sel->is_member(id); + } + virtual ~IDSelectorNot() {} +}; + +/// selects all entries (useful for benchmarking) +struct IDSelectorAll : IDSelector { + bool is_member(idx_t id) const final { + return true; + } + virtual ~IDSelectorAll() {} +}; + +/// does an AND operation on the the two given IDSelector's is_membership +/// results. 
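/// A minimal usage sketch (editorial illustration, not part of this patch;
/// the id values are made up): keep only ids that fall in [100, 1000) and
/// also appear in a given batch.
///
///     std::vector<idx_t> keep = {120, 340, 999};
///     IDSelectorBatch batch(keep.size(), keep.data());
///     IDSelectorRange range(100, 1000);
///     IDSelectorAnd both(&batch, &range);
///     // both.is_member(340) == true, both.is_member(2000) == false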
+struct IDSelectorAnd : IDSelector { + const IDSelector* lhs; + const IDSelector* rhs; + IDSelectorAnd(const IDSelector* lhs, const IDSelector* rhs) + : lhs(lhs), rhs(rhs) {} + bool is_member(idx_t id) const final { + return lhs->is_member(id) && rhs->is_member(id); + }; + virtual ~IDSelectorAnd() {} +}; + +/// does an OR operation on the the two given IDSelector's is_membership +/// results. +struct IDSelectorOr : IDSelector { + const IDSelector* lhs; + const IDSelector* rhs; + IDSelectorOr(const IDSelector* lhs, const IDSelector* rhs) + : lhs(lhs), rhs(rhs) {} + bool is_member(idx_t id) const final { + return lhs->is_member(id) || rhs->is_member(id); + }; + virtual ~IDSelectorOr() {} +}; + +/// does an XOR operation on the the two given IDSelector's is_membership +/// results. +struct IDSelectorXOr : IDSelector { + const IDSelector* lhs; + const IDSelector* rhs; + IDSelectorXOr(const IDSelector* lhs, const IDSelector* rhs) + : lhs(lhs), rhs(rhs) {} + bool is_member(idx_t id) const final { + return lhs->is_member(id) ^ rhs->is_member(id); + }; + virtual ~IDSelectorXOr() {} +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp b/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp index be9051dae..2fd84b2d0 100644 --- a/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.cpp @@ -15,14 +15,24 @@ #include -#include -#include #include #include #include #include // BitstringWriter #include +#include + +// this is needed for prefetching +#include + +// todo aguzhva: is it needed? +#ifdef __AVX2__ +#include +#endif + +#include "simd/hook.h" + extern "C" { // LU decomoposition of a general matrix void sgetrf_( @@ -152,27 +162,8 @@ LocalSearchQuantizer::LocalSearchQuantizer( size_t nbits, Search_type_t search_type) : AdditiveQuantizer(d, std::vector(M, nbits), search_type) { - is_trained = false; - verbose = false; - K = (1 << nbits); - - train_iters = 25; - train_ils_iters = 8; - icm_iters = 4; - - encode_ils_iters = 16; - - p = 0.5f; - lambd = 1e-2f; - - chunk_size = 10000; - nperts = 4; - - random_seed = 0x12345; std::srand(random_seed); - - icm_encoder_factory = nullptr; } LocalSearchQuantizer::~LocalSearchQuantizer() { @@ -183,7 +174,7 @@ LocalSearchQuantizer::LocalSearchQuantizer() : LocalSearchQuantizer(0, 0, 0) {} void LocalSearchQuantizer::train(size_t n, const float* x) { FAISS_THROW_IF_NOT(K == (1 << nbits[0])); - FAISS_THROW_IF_NOT(nperts <= M); + nperts = std::min(nperts, M); lsq_timer.reset(); LSQTimerScope scope(&lsq_timer, "train"); @@ -197,7 +188,7 @@ void LocalSearchQuantizer::train(size_t n, const float* x) { // allocate memory for codebooks, size [M, K, d] codebooks.resize(M * K * d); - // randomly intialize codes + // randomly initialize codes std::mt19937 gen(random_seed); std::vector codes(n * M); // [n, M] random_int32(codes, 0, K - 1, gen); @@ -265,26 +256,7 @@ void LocalSearchQuantizer::train(size_t n, const float* x) { decode_unpacked(codes.data(), x_recons.data(), n); fvec_norms_L2sqr(norms.data(), x_recons.data(), d, n); - norm_min = HUGE_VALF; - norm_max = -HUGE_VALF; - for (idx_t i = 0; i < n; i++) { - if (norms[i] < norm_min) { - norm_min = norms[i]; - } - if (norms[i] > norm_max) { - norm_max = norms[i]; - } - } - - if (search_type == ST_norm_cqint8 || search_type == ST_norm_cqint4) { - size_t k = (1 << 8); - if (search_type == ST_norm_cqint4) { - k = (1 << 4); - } - Clustering1D clus(k); - clus.train_exact(n, norms.data()); - qnorm.add(clus.k, clus.centroids.data()); - 
} + train_norm(n, norms.data()); } if (verbose) { @@ -319,10 +291,11 @@ void LocalSearchQuantizer::perturb_codebooks( } } -void LocalSearchQuantizer::compute_codes( +void LocalSearchQuantizer::compute_codes_add_centroids( const float* x, uint8_t* codes_out, - size_t n) const { + size_t n, + const float* centroids) const { FAISS_THROW_IF_NOT_MSG(is_trained, "LSQ is not trained yet."); lsq_timer.reset(); @@ -336,7 +309,7 @@ void LocalSearchQuantizer::compute_codes( random_int32(codes, 0, K - 1, gen); icm_encode(codes.data(), x, n, encode_ils_iters, gen); - pack_codes(n, codes.data(), codes_out); + pack_codes(n, codes.data(), codes_out, -1, nullptr, centroids); if (verbose) { scope.finish(); @@ -627,54 +600,72 @@ void LocalSearchQuantizer::icm_encode_step( FAISS_THROW_IF_NOT(M != 0 && K != 0); FAISS_THROW_IF_NOT(binaries != nullptr); - for (size_t iter = 0; iter < n_iters; iter++) { - // condition on the m-th subcode - for (size_t m = 0; m < M; m++) { - std::vector objs(n * K); -#pragma omp parallel for - for (int64_t i = 0; i < n; i++) { - auto u = unaries + m * n * K + i * K; - memcpy(objs.data() + i * K, u, sizeof(float) * K); - } +#pragma omp parallel for schedule(dynamic) + for (int64_t i = 0; i < n; i++) { + std::vector objs(K); - // compute objective function by adding unary - // and binary terms together - for (size_t other_m = 0; other_m < M; other_m++) { - if (other_m == m) { - continue; + for (size_t iter = 0; iter < n_iters; iter++) { + // condition on the m-th subcode + for (size_t m = 0; m < M; m++) { + // copy + auto u = unaries + m * n * K + i * K; + for (size_t code = 0; code < K; code++) { + objs[code] = u[code]; } -#pragma omp parallel for - for (int64_t i = 0; i < n; i++) { + // compute objective function by adding unary + // and binary terms together + for (size_t other_m = 0; other_m < M; other_m++) { + if (other_m == m) { + continue; + } + +#ifdef __AVX2__ + // TODO: add platform-independent compiler-independent + // prefetch utilities. + if (other_m + 1 < M) { + // do a single prefetch + int32_t code2 = codes[i * M + other_m + 1]; + // for (int32_t code = 0; code < K; code += 64) { + int32_t code = 0; + { + size_t binary_idx = (other_m + 1) * M * K * K + + m * K * K + code2 * K + code; + _mm_prefetch(binaries + binary_idx, _MM_HINT_T0); + } + } +#endif + for (int32_t code = 0; code < K; code++) { int32_t code2 = codes[i * M + other_m]; - size_t binary_idx = m * M * K * K + other_m * K * K + - code * K + code2; - // binaries[m, other_m, code, code2] - objs[i * K + code] += binaries[binary_idx]; + size_t binary_idx = other_m * M * K * K + m * K * K + + code2 * K + code; + // binaries[m, other_m, code, code2]. + // It is symmetric over (m <-> other_m) + // and (code <-> code2). + // So, replace the op with + // binaries[other_m, m, code2, code]. + objs[code] += binaries[binary_idx]; } } - } - // find the optimal value of the m-th subcode -#pragma omp parallel for - for (int64_t i = 0; i < n; i++) { + // find the optimal value of the m-th subcode float best_obj = HUGE_VALF; int32_t best_code = 0; - for (size_t code = 0; code < K; code++) { - float obj = objs[i * K + code]; - if (obj < best_obj) { - best_obj = obj; - best_code = code; - } - } + + // find one using SIMD. 
The following operation is similar + // to the search of the smallest element in objs + using C = CMax; + HeapWithBuckets::addn( + K, objs.data(), 1, &best_obj, &best_code); + + // done codes[i * M + m] = best_code; - } - } // loop M + } // loop M + } } } - void LocalSearchQuantizer::perturb_codes( int32_t* codes, size_t n, diff --git a/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.h b/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.h index 162a70a0b..3904d349a 100644 --- a/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.h +++ b/thirdparty/faiss/faiss/impl/LocalSearchQuantizer.h @@ -45,22 +45,21 @@ struct IcmEncoderFactory; struct LocalSearchQuantizer : AdditiveQuantizer { size_t K; ///< number of codes per codebook - size_t train_iters; ///< number of iterations in training + size_t train_iters = 25; ///< number of iterations in training + size_t encode_ils_iters = 16; ///< iterations of local search in encoding + size_t train_ils_iters = 8; ///< iterations of local search in training + size_t icm_iters = 4; ///< number of iterations in icm - size_t encode_ils_iters; ///< iterations of local search in encoding - size_t train_ils_iters; ///< iterations of local search in training - size_t icm_iters; ///< number of iterations in icm + float p = 0.5f; ///< temperature factor + float lambd = 1e-2f; ///< regularization factor - float p; ///< temperature factor - float lambd; ///< regularization factor + size_t chunk_size = 10000; ///< nb of vectors to encode at a time - size_t chunk_size; ///< nb of vectors to encode at a time + int random_seed = 0x12345; ///< seed for random generator + size_t nperts = 4; ///< number of perturbation in each code - int random_seed; ///< seed for random generator - size_t nperts; ///< number of perturbation in each code - - ///< if non-NULL, use this encoder to encode - lsq::IcmEncoderFactory* icm_encoder_factory; + ///< if non-NULL, use this encoder to encode (owned by the object) + lsq::IcmEncoderFactory* icm_encoder_factory = nullptr; bool update_codebooks_with_double = true; @@ -83,8 +82,13 @@ struct LocalSearchQuantizer : AdditiveQuantizer { * @param x vectors to encode, size n * d * @param codes output codes, size n * code_size * @param n number of vectors + * @param centroids centroids to be added to x, size n * d */ - void compute_codes(const float* x, uint8_t* codes, size_t n) const override; + void compute_codes_add_centroids( + const float* x, + uint8_t* codes, + size_t n, + const float* centroids = nullptr) const override; /** Update codebooks given encodings * diff --git a/thirdparty/faiss/faiss/impl/LookupTableScaler.h b/thirdparty/faiss/faiss/impl/LookupTableScaler.h new file mode 100644 index 000000000..c553a0f14 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/LookupTableScaler.h @@ -0,0 +1,77 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +/******************************************* + * The Scaler objects are used to specialize the handling of the + * norm components in Additive quantizer fast-scan. 
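 * Two flavors are provided below: DummyScaler, for codes that carry no norm
 * component (its methods must never be called), and NormTableScaler, which
 * multiplies the looked-up norm term by a constant integer scale, since the
 * norm has a wider range than the other code components.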
+ ********************************************/ + +namespace faiss { + +/// no-op handler +struct DummyScaler { + static constexpr int nscale = 0; + + inline simd32uint8 lookup(const simd32uint8&, const simd32uint8&) const { + FAISS_THROW_MSG("DummyScaler::lookup should not be called."); + return simd32uint8(0); + } + + inline simd16uint16 scale_lo(const simd32uint8&) const { + FAISS_THROW_MSG("DummyScaler::scale_lo should not be called."); + return simd16uint16(0); + } + + inline simd16uint16 scale_hi(const simd32uint8&) const { + FAISS_THROW_MSG("DummyScaler::scale_hi should not be called."); + return simd16uint16(0); + } + + template + inline dist_t scale_one(const dist_t&) const { + FAISS_THROW_MSG("DummyScaler::scale_one should not be called."); + return 0; + } +}; + +/// consumes 2x4 bits to encode a norm as a scalar additive quantizer +/// the norm is scaled because its range if larger than other components +struct NormTableScaler { + static constexpr int nscale = 2; + int scale_int; + simd16uint16 scale_simd; + + explicit NormTableScaler(int scale) : scale_int(scale), scale_simd(scale) {} + + inline simd32uint8 lookup(const simd32uint8& lut, const simd32uint8& c) + const { + return lut.lookup_2_lanes(c); + } + + inline simd16uint16 scale_lo(const simd32uint8& res) const { + return simd16uint16(res) * scale_simd; + } + + inline simd16uint16 scale_hi(const simd32uint8& res) const { + return (simd16uint16(res) >> 8) * scale_simd; + } + + // for non-SIMD implem 2, 3, 4 + template + inline dist_t scale_one(const dist_t& x) const { + return x * scale_int; + } +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/NNDescent.cpp b/thirdparty/faiss/faiss/impl/NNDescent.cpp index adac2601c..8878349ff 100644 --- a/thirdparty/faiss/faiss/impl/NNDescent.cpp +++ b/thirdparty/faiss/faiss/impl/NNDescent.cpp @@ -13,6 +13,7 @@ #include #include +#include namespace faiss { @@ -146,14 +147,8 @@ using namespace nndescent; constexpr int NUM_EVAL_POINTS = 100; -NNDescent::NNDescent(const int d, const int K) : K(K), random_seed(2021), d(d) { - ntotal = 0; - has_built = false; - S = 10; - R = 100; +NNDescent::NNDescent(const int d, const int K) : K(K), d(d) { L = K + 50; - iter = 10; - search_L = 0; } NNDescent::~NNDescent() {} @@ -310,7 +305,7 @@ void NNDescent::generate_eval_set( for (int i = 0; i < c.size(); i++) { std::vector tmp; for (int j = 0; j < N; j++) { - if (i == j) + if (c[i] == j) continue; // skip itself float dist = qdis.symmetric_dis(c[i], j); tmp.push_back(Neighbor(j, dist, true)); @@ -379,6 +374,10 @@ void NNDescent::init_graph(DistanceComputer& qdis) { void NNDescent::build(DistanceComputer& qdis, const int n, bool verbose) { FAISS_THROW_IF_NOT_MSG(L >= K, "L should be >= K in NNDescent.build"); + FAISS_THROW_IF_NOT_FMT( + n > NUM_EVAL_POINTS, + "NNDescent.build cannot build a graph smaller than %d", + int(NUM_EVAL_POINTS)); if (verbose) { printf("Parameters: K=%d, S=%d, R=%d, L=%d, iter=%d\n", @@ -408,7 +407,7 @@ void NNDescent::build(DistanceComputer& qdis, const int n, bool verbose) { has_built = true; if (verbose) { - printf("Addes %d points into the index\n", ntotal); + printf("Added %d points into the index\n", ntotal); } } @@ -424,7 +423,7 @@ void NNDescent::search( // candidate pool, the K best items is the result. 
std::vector retset(L + 1); - // Randomly choose L points to intialize the candidate pool + // Randomly choose L points to initialize the candidate pool std::vector init_ids(L); std::mt19937 rng(random_seed); diff --git a/thirdparty/faiss/faiss/impl/NNDescent.h b/thirdparty/faiss/faiss/impl/NNDescent.h index 858367f3b..2426b0d7b 100644 --- a/thirdparty/faiss/faiss/impl/NNDescent.h +++ b/thirdparty/faiss/faiss/impl/NNDescent.h @@ -90,7 +90,6 @@ struct Nhood { struct NNDescent { using storage_idx_t = int; - using idx_t = Index::idx_t; using KNNGraph = std::vector; @@ -133,19 +132,20 @@ struct NNDescent { std::vector& ctrl_points, std::vector>& acc_eval_set); - bool has_built; + bool has_built = false; - int K; // K in KNN graph - int S; // number of sample neighbors to be updated for each node - int R; // size of reverse links, 0 means the reverse links will not be used - int L; // size of the candidate pool in building - int iter; // number of iterations to iterate over - int search_L; // size of candidate pool in searching - int random_seed; // random seed for generators + int S = 10; // number of sample neighbors to be updated for each node + int R = 100; // size of reverse links, 0 means the reverse links will not be + // used + int iter = 10; // number of iterations to iterate over + int search_L = 0; // size of candidate pool in searching + int random_seed = 2021; // random seed for generators + int K; // K in KNN graph int d; // dimensions + int L; // size of the candidate pool in building - int ntotal; + int ntotal = 0; KNNGraph graph; std::vector final_graph; diff --git a/thirdparty/faiss/faiss/impl/NSG.cpp b/thirdparty/faiss/faiss/impl/NSG.cpp index d5da292b6..1f30b576b 100644 --- a/thirdparty/faiss/faiss/impl/NSG.cpp +++ b/thirdparty/faiss/faiss/impl/NSG.cpp @@ -14,7 +14,7 @@ #include #include -#include +#include namespace faiss { @@ -29,8 +29,6 @@ constexpr int EMPTY_ID = -1; distances. 
This makes supporting INNER_PRODUCE search easier */ struct NegativeDistanceComputer : DistanceComputer { - using idx_t = Index::idx_t; - /// owned by this DistanceComputer* basedis; @@ -59,7 +57,7 @@ struct NegativeDistanceComputer : DistanceComputer { } // namespace DistanceComputer* storage_distance_computer(const Index* storage) { - if (storage->metric_type == METRIC_INNER_PRODUCT) { + if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); } else { return storage->get_distance_computer(); @@ -140,9 +138,6 @@ inline int insert_into_pool(Neighbor* addr, int K, Neighbor nn) { NSG::NSG(int R) : R(R), rng(0x0903) { L = R + 32; C = R + 100; - search_L = 16; - ntotal = 0; - is_built = false; srand(0x1998); } diff --git a/thirdparty/faiss/faiss/impl/NSG.h b/thirdparty/faiss/faiss/impl/NSG.h index 132c0050f..641a42f8c 100644 --- a/thirdparty/faiss/faiss/impl/NSG.h +++ b/thirdparty/faiss/faiss/impl/NSG.h @@ -54,7 +54,7 @@ namespace nsg { template struct Graph { - node_t* data; ///< the flattened adjacency matrix + node_t* data; ///< the flattened adjacency matrix, size N-by-K int K; ///< nb of neighbors per node int N; ///< total nb of nodes bool own_fields; ///< the underlying data owned by itself or not @@ -98,12 +98,9 @@ DistanceComputer* storage_distance_computer(const Index* storage); struct NSG { /// internal storage of vectors (32 bits: this is expensive) - using storage_idx_t = int; + using storage_idx_t = int32_t; - /// Faiss results are 64-bit - using idx_t = Index::idx_t; - - int ntotal; ///< nb of nodes + int ntotal = 0; ///< nb of nodes // construction-time parameters int R; ///< nb of neighbors per node @@ -111,13 +108,13 @@ struct NSG { int C; ///< candidate pool size at construction time // search-time parameters - int search_L; ///< length of the search path + int search_L = 16; ///< length of the search path int enterpoint; ///< enterpoint std::shared_ptr> final_graph; ///< NSG graph structure - bool is_built; ///< NSG is built or not + bool is_built = false; ///< NSG is built or not RandomGenerator rng; ///< random generator diff --git a/thirdparty/faiss/faiss/impl/PolysemousTraining.cpp b/thirdparty/faiss/faiss/impl/PolysemousTraining.cpp index 919868ef7..5fdc019fa 100644 --- a/thirdparty/faiss/faiss/impl/PolysemousTraining.cpp +++ b/thirdparty/faiss/faiss/impl/PolysemousTraining.cpp @@ -8,7 +8,6 @@ // -*- c++ -*- #include -#include "faiss/impl/FaissAssert.h" #include #include @@ -25,7 +24,8 @@ #include #include -#include + +#include "simd/hook.h" /***************************************** * Mixed PQ / Hamming @@ -37,19 +37,6 @@ namespace faiss { * Optimization code ****************************************************/ -SimulatedAnnealingParameters::SimulatedAnnealingParameters() { - // set some reasonable defaults for the optimization - init_temperature = 0.7; - temperature_decay = pow(0.9, 1 / 500.); - // reduce by a factor 0.9 every 500 it - n_iter = 500000; - n_redo = 2; - seed = 123; - verbose = 0; - only_bit_flips = false; - init_random = false; -} - // what would the cost update be if iw and jw were swapped? 
// default implementation just computes both and computes the difference double PermutationObjective::cost_update(const int* perm, int iw, int jw) @@ -907,7 +894,7 @@ void PolysemousTraining::optimize_ranking( ScopeDeleter1 del(obj); if (verbose > 0) { - printf(" m=%d, nq=%zd, nb=%zd, intialize RankingScore " + printf(" m=%d, nq=%zd, nb=%zd, initialize RankingScore " "in %.3f ms\n", m, nq, diff --git a/thirdparty/faiss/faiss/impl/PolysemousTraining.h b/thirdparty/faiss/faiss/impl/PolysemousTraining.h index e3430bb53..d8b5efaca 100644 --- a/thirdparty/faiss/faiss/impl/PolysemousTraining.h +++ b/thirdparty/faiss/faiss/impl/PolysemousTraining.h @@ -17,18 +17,19 @@ namespace faiss { /// parameters used for the simulated annealing method struct SimulatedAnnealingParameters { // optimization parameters - double init_temperature; // init probability of accepting a bad swap - double temperature_decay; // at each iteration the temp is multiplied by - // this - int n_iter; // nb of iterations - int n_redo; // nb of runs of the simulation - int seed; // random seed - int verbose; - bool only_bit_flips; // restrict permutation changes to bit flips - bool init_random; // initialize with a random permutation (not identity) + double init_temperature = 0.7; // init probability of accepting a bad swap + // at each iteration the temp is multiplied by this + double temperature_decay = 0.9997893011688015; // = 0.9^(1/500) + int n_iter = 500000; // nb of iterations + int n_redo = 2; // nb of runs of the simulation + int seed = 123; // random seed + int verbose = 0; + bool only_bit_flips = false; // restrict permutation changes to bit flips + bool init_random = + false; // initialize with a random permutation (not identity) // set reasonable defaults - SimulatedAnnealingParameters(); + SimulatedAnnealingParameters() {} }; /// abstract class for the loss function diff --git a/thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.cpp b/thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.cpp new file mode 100644 index 000000000..1104b778a --- /dev/null +++ b/thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.cpp @@ -0,0 +1,376 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +extern "C" { + +// general matrix multiplication +int sgemm_( + const char* transa, + const char* transb, + FINTEGER* m, + FINTEGER* n, + FINTEGER* k, + const float* alpha, + const float* a, + FINTEGER* lda, + const float* b, + FINTEGER* ldb, + float* beta, + float* c, + FINTEGER* ldc); +} + +namespace faiss { + +ProductAdditiveQuantizer::ProductAdditiveQuantizer( + size_t d, + const std::vector& aqs, + Search_type_t search_type) { + init(d, aqs, search_type); +} + +ProductAdditiveQuantizer::ProductAdditiveQuantizer() + : ProductAdditiveQuantizer(0, {}) {} + +void ProductAdditiveQuantizer::init( + size_t d, + const std::vector& aqs, + Search_type_t search_type) { + // AdditiveQuantizer constructor + this->d = d; + this->search_type = search_type; + M = 0; + for (const auto& q : aqs) { + M += q->M; + nbits.insert(nbits.end(), q->nbits.begin(), q->nbits.end()); + } + set_derived_values(); + + // ProductAdditiveQuantizer + nsplits = aqs.size(); + + FAISS_THROW_IF_NOT(quantizers.empty()); + for (const auto& q : aqs) { + auto aq = dynamic_cast(clone_Quantizer(q)); + quantizers.push_back(aq); + } +} + +ProductAdditiveQuantizer::~ProductAdditiveQuantizer() { + for (auto& q : quantizers) { + delete q; + } +} + +AdditiveQuantizer* ProductAdditiveQuantizer::subquantizer(size_t s) const { + return quantizers[s]; +} + +void ProductAdditiveQuantizer::train(size_t n, const float* x) { + if (is_trained) { + return; + } + + // copy the subvectors into contiguous memory + size_t offset_d = 0; + std::vector xt; + for (size_t s = 0; s < nsplits; s++) { + auto q = quantizers[s]; + xt.resize(q->d * n); + +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + memcpy(xt.data() + i * q->d, + x + i * d + offset_d, + q->d * sizeof(*x)); + } + + q->train(n, xt.data()); + offset_d += q->d; + } + + // compute codebook size + size_t codebook_size = 0; + for (const auto& q : quantizers) { + codebook_size += q->total_codebook_size * q->d; + } + + // copy codebook from sub-quantizers + codebooks.resize(codebook_size); // size (M * ksub, dsub) + float* cb = codebooks.data(); + for (size_t s = 0; s < nsplits; s++) { + auto q = quantizers[s]; + size_t sub_codebook_size = q->total_codebook_size * q->d; + memcpy(cb, q->codebooks.data(), sub_codebook_size * sizeof(float)); + cb += sub_codebook_size; + } + + is_trained = true; + + // train norm + std::vector codes(n * M); + compute_unpacked_codes(x, codes.data(), n); + std::vector x_recons(n * d); + std::vector norms(n); + decode_unpacked(codes.data(), x_recons.data(), n); + fvec_norms_L2sqr(norms.data(), x_recons.data(), d, n); + train_norm(n, norms.data()); +} + +void ProductAdditiveQuantizer::compute_codes_add_centroids( + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids) const { + // size (n, M) + std::vector unpacked_codes(n * M); + compute_unpacked_codes(x, unpacked_codes.data(), n, centroids); + + // pack + pack_codes(n, unpacked_codes.data(), codes_out, -1, nullptr, centroids); +} + +void ProductAdditiveQuantizer::compute_unpacked_codes( + const float* x, + int32_t* unpacked_codes, + size_t n, + const float* centroids) const { + /// TODO: actuallly we do not need to unpack and pack + size_t offset_d = 0, offset_m = 0; + std::vector xsub; + std::vector codes; + + for (size_t s = 0; s < nsplits; s++) { + const auto q = quantizers[s]; + xsub.resize(n * q->d); + codes.resize(n * 
q->code_size); + +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + memcpy(xsub.data() + i * q->d, + x + i * d + offset_d, + q->d * sizeof(float)); + } + + q->compute_codes(xsub.data(), codes.data(), n); + + // unpack +#pragma omp parallel for if (n > 1000) + for (idx_t i = 0; i < n; i++) { + uint8_t* code = codes.data() + i * q->code_size; + BitstringReader bsr(code, q->code_size); + + // unpacked_codes[i][s][m] = codes[i][m] + for (size_t m = 0; m < q->M; m++) { + unpacked_codes[i * M + offset_m + m] = bsr.read(q->nbits[m]); + } + } + + offset_d += q->d; + offset_m += q->M; + } +} + +void ProductAdditiveQuantizer::decode_unpacked( + const int32_t* codes, + float* x, + size_t n, + int64_t ld_codes) const { + FAISS_THROW_IF_NOT_MSG( + is_trained, "The product additive quantizer is not trained yet."); + + if (ld_codes == -1) { + ld_codes = M; + } + + // product additive quantizer decoding +#pragma omp parallel for if (n > 1000) + for (int64_t i = 0; i < n; i++) { + const int32_t* codesi = codes + i * ld_codes; + + size_t offset_m = 0, offset_d = 0; + for (size_t s = 0; s < nsplits; s++) { + const auto q = quantizers[s]; + float* xi = x + i * d + offset_d; + + for (int m = 0; m < q->M; m++) { + int idx = codesi[offset_m + m]; + const float* c = codebooks.data() + + q->d * (codebook_offsets[offset_m + m] + idx); + if (m == 0) { + memcpy(xi, c, sizeof(*x) * q->d); + } else { + fvec_add(q->d, xi, c, xi); + } + } + + offset_m += q->M; + offset_d += q->d; + } + } +} + +void ProductAdditiveQuantizer::decode(const uint8_t* codes, float* x, size_t n) + const { + FAISS_THROW_IF_NOT_MSG( + is_trained, "The product additive quantizer is not trained yet."); + +#pragma omp parallel for if (n > 1000) + for (int64_t i = 0; i < n; i++) { + BitstringReader bsr(codes + i * code_size, code_size); + + size_t offset_m = 0, offset_d = 0; + for (size_t s = 0; s < nsplits; s++) { + const auto q = quantizers[s]; + float* xi = x + i * d + offset_d; + + for (int m = 0; m < q->M; m++) { + int idx = bsr.read(q->nbits[m]); + const float* c = codebooks.data() + + q->d * (codebook_offsets[offset_m + m] + idx); + if (m == 0) { + memcpy(xi, c, sizeof(*x) * q->d); + } else { + fvec_add(q->d, xi, c, xi); + } + } + + offset_m += q->M; + offset_d += q->d; + } + } +} + +void ProductAdditiveQuantizer::compute_LUT( + size_t n, + const float* xq, + float* LUT, + float alpha, + long ld_lut) const { + // codebooks: size (M * ksub, dsub) + // xq: size (n, d) + // output LUT: size (n, M * ksub) + + FINTEGER nqi = n; + // leading dimension of 'LUT' and 'xq' + FINTEGER ld_LUT = ld_lut > 0 ? 
ld_lut : total_codebook_size; + FINTEGER ld_xq = d; + + float zero = 0; + size_t offset_d = 0; + size_t offset_cb = 0; + size_t offset_lut = 0; + + for (size_t s = 0; s < nsplits; s++) { + const auto q = quantizers[s]; + + FINTEGER ncenti = q->total_codebook_size; + FINTEGER ld_cb = q->d; // leading dimension of 'codebooks' + + auto codebooksi = codebooks.data() + offset_cb; + auto xqi = xq + offset_d; + auto LUTi = LUT + offset_lut; + + sgemm_("Transposed", + "Not transposed", + &ncenti, + &nqi, + &ld_cb, + &alpha, + codebooksi, + &ld_cb, + xqi, + &ld_xq, + &zero, + LUTi, + &ld_LUT); + + offset_d += q->d; + offset_cb += q->total_codebook_size * q->d; + offset_lut += q->total_codebook_size; + } +} + +/************************************* + * Product Local Search Quantizer + ************************************/ + +ProductLocalSearchQuantizer::ProductLocalSearchQuantizer( + size_t d, + size_t nsplits, + size_t Msub, + size_t nbits, + Search_type_t search_type) { + std::vector aqs; + + if (nsplits > 0) { + FAISS_THROW_IF_NOT(d % nsplits == 0); + size_t dsub = d / nsplits; + + for (size_t i = 0; i < nsplits; i++) { + auto lsq = + new LocalSearchQuantizer(dsub, Msub, nbits, ST_decompress); + aqs.push_back(lsq); + } + } + init(d, aqs, search_type); + for (auto& q : aqs) { + delete q; + } +} + +ProductLocalSearchQuantizer::ProductLocalSearchQuantizer() + : ProductLocalSearchQuantizer(0, 0, 0, 0) {} + +/************************************* + * Product Residual Quantizer + ************************************/ + +ProductResidualQuantizer::ProductResidualQuantizer( + size_t d, + size_t nsplits, + size_t Msub, + size_t nbits, + Search_type_t search_type) { + std::vector aqs; + + if (nsplits > 0) { + FAISS_THROW_IF_NOT(d % nsplits == 0); + size_t dsub = d / nsplits; + + for (size_t i = 0; i < nsplits; i++) { + auto rq = new ResidualQuantizer(dsub, Msub, nbits, ST_decompress); + aqs.push_back(rq); + } + } + init(d, aqs, search_type); + for (auto& q : aqs) { + delete q; + } +} + +ProductResidualQuantizer::ProductResidualQuantizer() + : ProductResidualQuantizer(0, 0, 0, 0) {} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.h b/thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.h new file mode 100644 index 000000000..163d341cf --- /dev/null +++ b/thirdparty/faiss/faiss/impl/ProductAdditiveQuantizer.h @@ -0,0 +1,154 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace faiss { + +/** Product Additive Quantizers + * + * The product additive quantizer is a variant of AQ and PQ. + * It first splits the vector space into multiple orthogonal sub-spaces + * just like PQ does. And then it quantizes each sub-space by an independent + * additive quantizer. + * + */ +struct ProductAdditiveQuantizer : AdditiveQuantizer { + size_t nsplits; ///< number of sub-vectors we split a vector into + + std::vector quantizers; + + /** Construct a product additive quantizer. + * + * The additive quantizers passed in will be cloned into the + * ProductAdditiveQuantizer object. 
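 *
 * A hedged construction sketch (the sizes below are made-up examples, not
 * taken from this patch): an 8-d space split into two 4-d sub-spaces, each
 * handled by its own LocalSearchQuantizer:
 *
 *     LocalSearchQuantizer lsq0(4, 2, 8);
 *     LocalSearchQuantizer lsq1(4, 2, 8);
 *     ProductAdditiveQuantizer paq(8, {&lsq0, &lsq1});
 *     paq.train(n, x); // x holds n * 8 floats
 *
 * The ProductLocalSearchQuantizer / ProductResidualQuantizer subclasses below
 * build such sub-quantizers automatically.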
+ * + * @param d dimensionality of the input vectors + * @param aqs sub-additive quantizers + * @param search_type AQ search type + */ + ProductAdditiveQuantizer( + size_t d, + const std::vector& aqs, + Search_type_t search_type = ST_decompress); + + ProductAdditiveQuantizer(); + + virtual ~ProductAdditiveQuantizer(); + + void init( + size_t d, + const std::vector& aqs, + Search_type_t search_type); + + AdditiveQuantizer* subquantizer(size_t m) const; + + ///< Train the product additive quantizer + void train(size_t n, const float* x) override; + + /** Encode a set of vectors + * + * @param x vectors to encode, size n * d + * @param codes output codes, size n * code_size + * @param centroids centroids to be added to x, size n * d + */ + void compute_codes_add_centroids( + const float* x, + uint8_t* codes, + size_t n, + const float* centroids = nullptr) const override; + + void compute_unpacked_codes( + const float* x, + int32_t* codes, + size_t n, + const float* centroids = nullptr) const; + + /** Decode a set of vectors in non-packed format + * + * @param codes codes to decode, size n * ld_codes + * @param x output vectors, size n * d + */ + void decode_unpacked( + const int32_t* codes, + float* x, + size_t n, + int64_t ld_codes = -1) const override; + + /** Decode a set of vectors + * + * @param codes codes to decode, size n * code_size + * @param x output vectors, size n * d + */ + void decode(const uint8_t* codes, float* x, size_t n) const override; + + /** Compute inner-product look-up tables. Used in the search functions. + * + * @param xq query vector, size (n, d) + * @param LUT look-up table, size (n, total_codebook_size) + * @param alpha compute alpha * inner-product + * @param ld_lut leading dimension of LUT + */ + void compute_LUT( + size_t n, + const float* xq, + float* LUT, + float alpha = 1.0f, + long ld_lut = -1) const override; +}; + +/** Product Local Search Quantizer + */ +struct ProductLocalSearchQuantizer : ProductAdditiveQuantizer { + /** Construct a product LSQ object. + * + * @param d dimensionality of the input vectors + * @param nsplits number of sub-vectors we split a vector into + * @param Msub number of codebooks of each LSQ + * @param nbits bits for each step + * @param search_type AQ search type + */ + ProductLocalSearchQuantizer( + size_t d, + size_t nsplits, + size_t Msub, + size_t nbits, + Search_type_t search_type = ST_decompress); + + ProductLocalSearchQuantizer(); +}; + +/** Product Residual Quantizer + */ +struct ProductResidualQuantizer : ProductAdditiveQuantizer { + /** Construct a product RQ object. 
+ * + * @param d dimensionality of the input vectors + * @param nsplits number of sub-vectors we split a vector into + * @param Msub number of codebooks of each RQ + * @param nbits bits for each step + * @param search_type AQ search type + */ + ProductResidualQuantizer( + size_t d, + size_t nsplits, + size_t Msub, + size_t nbits, + Search_type_t search_type = ST_decompress); + + ProductResidualQuantizer(); +}; + +}; // namespace faiss \ No newline at end of file diff --git a/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp b/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp index 5616eebb4..06db87662 100644 --- a/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/ProductQuantizer.cpp @@ -44,137 +44,12 @@ int sgemm_( namespace faiss { -/* compute an estimator using look-up tables for typical values of M */ -template -void pq_estimators_from_tables_Mmul4( - int M, - const CT* codes, - size_t ncodes, - const float* __restrict dis_table, - size_t ksub, - size_t k, - float* heap_dis, - int64_t* heap_ids) { - for (size_t j = 0; j < ncodes; j++) { - float dis = 0; - const float* dt = dis_table; - - for (size_t m = 0; m < M; m += 4) { - float dism = 0; - dism = dt[*codes++]; - dt += ksub; - dism += dt[*codes++]; - dt += ksub; - dism += dt[*codes++]; - dt += ksub; - dism += dt[*codes++]; - dt += ksub; - dis += dism; - } - - if (C::cmp(heap_dis[0], dis)) { - heap_replace_top(k, heap_dis, heap_ids, dis, j); - } - } -} - -template -void pq_estimators_from_tables_M4( - const CT* codes, - size_t ncodes, - const float* __restrict dis_table, - size_t ksub, - size_t k, - float* heap_dis, - int64_t* heap_ids) { - for (size_t j = 0; j < ncodes; j++) { - float dis = 0; - const float* dt = dis_table; - dis = dt[*codes++]; - dt += ksub; - dis += dt[*codes++]; - dt += ksub; - dis += dt[*codes++]; - dt += ksub; - dis += dt[*codes++]; - - if (C::cmp(heap_dis[0], dis)) { - heap_replace_top(k, heap_dis, heap_ids, dis, j); - } - } -} - -template -static inline void pq_estimators_from_tables( - const ProductQuantizer& pq, - const CT* codes, - size_t ncodes, - const float* dis_table, - size_t k, - float* heap_dis, - int64_t* heap_ids) { - if (pq.M == 4) { - pq_estimators_from_tables_M4( - codes, ncodes, dis_table, pq.ksub, k, heap_dis, heap_ids); - return; - } - - if (pq.M % 4 == 0) { - pq_estimators_from_tables_Mmul4( - pq.M, codes, ncodes, dis_table, pq.ksub, k, heap_dis, heap_ids); - return; - } - - /* Default is relatively slow */ - const size_t M = pq.M; - const size_t ksub = pq.ksub; - for (size_t j = 0; j < ncodes; j++) { - float dis = 0; - const float* __restrict dt = dis_table; - for (int m = 0; m < M; m++) { - dis += dt[*codes++]; - dt += ksub; - } - if (C::cmp(heap_dis[0], dis)) { - heap_replace_top(k, heap_dis, heap_ids, dis, j); - } - } -} - -template -static inline void pq_estimators_from_tables_generic( - const ProductQuantizer& pq, - size_t nbits, - const uint8_t* codes, - size_t ncodes, - const float* dis_table, - size_t k, - float* heap_dis, - int64_t* heap_ids) { - const size_t M = pq.M; - const size_t ksub = pq.ksub; - for (size_t j = 0; j < ncodes; ++j) { - PQDecoderGeneric decoder(codes + j * pq.code_size, nbits); - float dis = 0; - const float* __restrict dt = dis_table; - for (size_t m = 0; m < M; m++) { - uint64_t c = decoder.decode(); - dis += dt[c]; - dt += ksub; - } - - if (C::cmp(heap_dis[0], dis)) { - heap_replace_top(k, heap_dis, heap_ids, dis, j); - } - } -} - /********************************************* * PQ implementation 
*********************************************/ ProductQuantizer::ProductQuantizer(size_t d, size_t M, size_t nbits) - : d(d), M(M), nbits(nbits), assign_index(nullptr) { + : Quantizer(d, 0), M(M), nbits(nbits), assign_index(nullptr) { set_derived_values(); } @@ -247,7 +122,7 @@ static void init_hypercube_pca( } } -void ProductQuantizer::train(int n, const float* x) { +void ProductQuantizer::train(size_t n, const float* x) { if (train_type != Train_shared) { train_type_t final_train_type; final_train_type = train_type; @@ -322,26 +197,66 @@ void ProductQuantizer::train(size_t n, const float* x) { template void compute_code(const ProductQuantizer& pq, const float* x, uint8_t* code) { std::vector distances(pq.ksub); + + // It seems to be meaningless to allocate std::vector distances. + // But it is done in order to cope with the ineffectiveness of the way + // the compiler generates the code. Basically, doing something like + // + // float min_distance = HUGE_VALF; + // size_t idxm = 0; + // for (size_t i = 0; i < N; i++) { + // const float distance = compute_distance(x, y + i * d, d); + // if (distance < min_distance) { + // min_distance = distance; + // idxm = i; + // } + // } + // + // generates significantly more CPU instructions than the baseline + // + // std::vector distances_cached(N); + // for (size_t i = 0; i < N; i++) { + // distances_cached[i] = compute_distance(x, y + i * d, d); + // } + // float min_distance = HUGE_VALF; + // size_t idxm = 0; + // for (size_t i = 0; i < N; i++) { + // const float distance = distances_cached[i]; + // if (distance < min_distance) { + // min_distance = distance; + // idxm = i; + // } + // } + // + // So, the baseline is faster. This is because of the vectorization. + // I suppose that the branch predictor might affect the performance as well. + // So, the buffer is allocated, but it might be unused in + // manually optimized code. Let's hope that the compiler is smart enough to + // get rid of std::vector allocation in such a case. 
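    // What the loop below does, per subvector m: find the nearest of the ksub
    // centroids for the subvector x[m * dsub .. (m+1) * dsub), either via
    // fvec_L2sqr_ny_nearest on the regular (M, ksub, dsub) centroid layout
    // or, when the transposed tables are populated, via
    // fvec_L2sqr_ny_nearest_y_transposed on the (dsub, M, ksub) layout, and
    // append the winning index to the code with PQEncoder.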
+ PQEncoder encoder(code, pq.nbits); for (size_t m = 0; m < pq.M; m++) { - float mindis = 1e20; - uint64_t idxm = 0; const float* xsub = x + m * pq.dsub; - fvec_L2sqr_ny( - distances.data(), - xsub, - pq.get_centroids(m, 0), - pq.dsub, - pq.ksub); - - /* Find best centroid */ - for (size_t i = 0; i < pq.ksub; i++) { - float dis = distances[i]; - if (dis < mindis) { - mindis = dis; - idxm = i; - } + uint64_t idxm = 0; + if (pq.transposed_centroids.empty()) { + // the regular version + idxm = fvec_L2sqr_ny_nearest( + distances.data(), + xsub, + pq.get_centroids(m, 0), + pq.dsub, + pq.ksub); + } else { + // transposed centroids are available, use'em + idxm = fvec_L2sqr_ny_nearest_y_transposed( + distances.data(), + xsub, + pq.transposed_centroids.data() + m * pq.ksub, + pq.centroids_sq_lengths.data() + m * pq.ksub, + pq.dsub, + pq.M * pq.ksub, + pq.ksub); } encoder.encode(idxm); @@ -470,10 +385,13 @@ void ProductQuantizer::compute_codes_with_assign_index( } } +// block size used in ProductQuantizer::compute_codes +int product_quantizer_compute_codes_bs = 256 * 1024; + void ProductQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) const { // process by blocks to avoid using too much RAM - size_t bs = 256 * 1024; + size_t bs = product_quantizer_compute_codes_bs; if (n > bs) { for (size_t i0 = 0; i0 < n; i0 += bs) { size_t i1 = std::min(i0 + bs, n); @@ -488,7 +406,7 @@ void ProductQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) for (int64_t i = 0; i < n; i++) compute_code(x + i * d, codes + i * code_size); - } else { // worthwile to use BLAS + } else { // worthwhile to use BLAS float* dis_tables = new float[n * ksub * M]; ScopeDeleter del(dis_tables); compute_distance_tables(n, x, dis_tables); @@ -504,15 +422,28 @@ void ProductQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) void ProductQuantizer::compute_distance_table(const float* x, float* dis_table) const { - size_t m; - - for (m = 0; m < M; m++) { - fvec_L2sqr_ny( - dis_table + m * ksub, - x + m * dsub, - get_centroids(m, 0), - dsub, - ksub); + if (transposed_centroids.empty()) { + // use regular version + for (size_t m = 0; m < M; m++) { + fvec_L2sqr_ny( + dis_table + m * ksub, + x + m * dsub, + get_centroids(m, 0), + dsub, + ksub); + } + } else { + // transposed centroids are available, use'em + for (size_t m = 0; m < M; m++) { + fvec_L2sqr_ny_transposed( + dis_table + m * ksub, + x + m * dsub, + transposed_centroids.data() + m * ksub, + centroids_sq_lengths.data() + m * ksub, + dsub, + M * ksub, + ksub); + } } } @@ -543,7 +474,7 @@ void ProductQuantizer::compute_distance_tables( #endif if (dsub < 16) { -#pragma omp parallel for +#pragma omp parallel for if (nx > 1) for (int64_t i = 0; i < nx; i++) { compute_distance_table(x + i * d, dis_tables + i * ksub * M); } @@ -577,7 +508,7 @@ void ProductQuantizer::compute_inner_prod_tables( #endif if (dsub < 16) { -#pragma omp parallel for +#pragma omp parallel for if (nx > 1) for (int64_t i = 0; i < nx; i++) { compute_inner_prod_table(x + i * d, dis_tables + i * ksub * M); } @@ -607,8 +538,140 @@ void ProductQuantizer::compute_inner_prod_tables( } } +/********************************************** + * Templatized search functions + * The template class C indicates whether to keep the highest or smallest values + **********************************************/ + +namespace { + +/* compute an estimator using look-up tables for typical values of M */ +template +void pq_estimators_from_tables_Mmul4( + int M, + const CT* codes, + size_t ncodes, + 
const float* __restrict dis_table, + size_t ksub, + size_t k, + float* heap_dis, + int64_t* heap_ids) { + for (size_t j = 0; j < ncodes; j++) { + float dis = 0; + const float* dt = dis_table; + + for (size_t m = 0; m < M; m += 4) { + float dism = 0; + dism = dt[*codes++]; + dt += ksub; + dism += dt[*codes++]; + dt += ksub; + dism += dt[*codes++]; + dt += ksub; + dism += dt[*codes++]; + dt += ksub; + dis += dism; + } + + if (C::cmp(heap_dis[0], dis)) { + heap_replace_top(k, heap_dis, heap_ids, dis, j); + } + } +} + +template +void pq_estimators_from_tables_M4( + const CT* codes, + size_t ncodes, + const float* __restrict dis_table, + size_t ksub, + size_t k, + float* heap_dis, + int64_t* heap_ids) { + for (size_t j = 0; j < ncodes; j++) { + float dis = 0; + const float* dt = dis_table; + dis = dt[*codes++]; + dt += ksub; + dis += dt[*codes++]; + dt += ksub; + dis += dt[*codes++]; + dt += ksub; + dis += dt[*codes++]; + + if (C::cmp(heap_dis[0], dis)) { + heap_replace_top(k, heap_dis, heap_ids, dis, j); + } + } +} + +template +void pq_estimators_from_tables( + const ProductQuantizer& pq, + const CT* codes, + size_t ncodes, + const float* dis_table, + size_t k, + float* heap_dis, + int64_t* heap_ids) { + if (pq.M == 4) { + pq_estimators_from_tables_M4( + codes, ncodes, dis_table, pq.ksub, k, heap_dis, heap_ids); + return; + } + + if (pq.M % 4 == 0) { + pq_estimators_from_tables_Mmul4( + pq.M, codes, ncodes, dis_table, pq.ksub, k, heap_dis, heap_ids); + return; + } + + /* Default is relatively slow */ + const size_t M = pq.M; + const size_t ksub = pq.ksub; + for (size_t j = 0; j < ncodes; j++) { + float dis = 0; + const float* __restrict dt = dis_table; + for (int m = 0; m < M; m++) { + dis += dt[*codes++]; + dt += ksub; + } + if (C::cmp(heap_dis[0], dis)) { + heap_replace_top(k, heap_dis, heap_ids, dis, j); + } + } +} + template -static void pq_knn_search_with_tables( +void pq_estimators_from_tables_generic( + const ProductQuantizer& pq, + size_t nbits, + const uint8_t* codes, + size_t ncodes, + const float* dis_table, + size_t k, + float* heap_dis, + int64_t* heap_ids) { + const size_t M = pq.M; + const size_t ksub = pq.ksub; + for (size_t j = 0; j < ncodes; ++j) { + PQDecoderGeneric decoder(codes + j * pq.code_size, nbits); + float dis = 0; + const float* __restrict dt = dis_table; + for (size_t m = 0; m < M; m++) { + uint64_t c = decoder.decode(); + dis += dt[c]; + dt += ksub; + } + + if (C::cmp(heap_dis[0], dis)) { + heap_replace_top(k, heap_dis, heap_ids, dis, j); + } + } +} + +template +void pq_knn_search_with_tables( const ProductQuantizer& pq, size_t nbits, const float* dis_tables, @@ -619,7 +682,7 @@ static void pq_knn_search_with_tables( size_t k = res->k, nx = res->nh; size_t ksub = pq.ksub, M = pq.M; -#pragma omp parallel for +#pragma omp parallel for if (nx > 1) for (int64_t i = 0; i < nx; i++) { /* query preparation for asymmetric search: compute look-up tables */ const float* dis_table = dis_tables + i * ksub * M; @@ -668,6 +731,8 @@ static void pq_knn_search_with_tables( } } +} // anonymous namespace + void ProductQuantizer::search( const float* __restrict x, size_t nx, @@ -782,4 +847,36 @@ void ProductQuantizer::search_sdc( } } +void ProductQuantizer::sync_transposed_centroids() { + transposed_centroids.resize(d * ksub); + centroids_sq_lengths.resize(ksub * M); + + for (size_t mi = 0; mi < M; mi++) { + for (size_t ki = 0; ki < ksub; ki++) { + float sqlen = 0; + + for (size_t di = 0; di < dsub; di++) { + const float q = centroids[(mi * ksub + ki) * dsub + di]; + + 
transposed_centroids[(di * M + mi) * ksub + ki] = q; + sqlen += q * q; + } + + centroids_sq_lengths[mi * ksub + ki] = sqlen; + } + } +} + +void ProductQuantizer::clear_transposed_centroids() { + transposed_centroids.clear(); + transposed_centroids.shrink_to_fit(); + + centroids_sq_lengths.clear(); + centroids_sq_lengths.shrink_to_fit(); +} + +size_t ProductQuantizer::cal_size() const { + return sizeof(*this) + centroids.size() * sizeof(float); +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ProductQuantizer.h b/thirdparty/faiss/faiss/impl/ProductQuantizer.h index 2dde68377..8f5d90883 100644 --- a/thirdparty/faiss/faiss/impl/ProductQuantizer.h +++ b/thirdparty/faiss/faiss/impl/ProductQuantizer.h @@ -15,31 +15,29 @@ #include #include +#include +#include #include namespace faiss { /** Product Quantizer. Implemented only for METRIC_L2 */ -struct ProductQuantizer { - using idx_t = Index::idx_t; - - size_t d; ///< size of the input vectors +struct ProductQuantizer : Quantizer { size_t M; ///< number of subquantizers size_t nbits; ///< number of bits per quantization index // values derived from the above - size_t dsub; ///< dimensionality of each subvector - size_t code_size; ///< bytes per indexed vector - size_t ksub; ///< number of centroids for each subquantizer - bool verbose; ///< verbose during training? + size_t dsub; ///< dimensionality of each subvector + size_t ksub; ///< number of centroids for each subquantizer + bool verbose; ///< verbose during training? /// initialization enum train_type_t { Train_default, Train_hot_start, ///< the centroids are already initialized - Train_shared, ///< share dictionary accross PQ segments - Train_hypercube, ///< intialize centroids with nbits-D hypercube - Train_hypercube_pca, ///< intialize centroids with nbits-D hypercube + Train_shared, ///< share dictionary across PQ segments + Train_hypercube, ///< initialize centroids with nbits-D hypercube + Train_hypercube_pca, ///< initialize centroids with nbits-D hypercube }; train_type_t train_type; @@ -49,9 +47,18 @@ struct ProductQuantizer { /// d / M) Index* assign_index; - /// Centroid table, size M * ksub * dsub + /// Centroid table, size M * ksub * dsub. + /// Layout: (M, ksub, dsub) std::vector centroids; + /// Transposed centroid table, size M * ksub * dsub. + /// Layout: (dsub, M, ksub) + std::vector transposed_centroids; + + /// Squared lengths of centroids, size M * ksub + /// Layout: (M, ksub) + std::vector centroids_sq_lengths; + /// return the centroids associated with subvector m float* get_centroids(size_t m, size_t i) { return ¢roids[(m * ksub + i) * dsub]; @@ -62,7 +69,7 @@ struct ProductQuantizer { // Train the product quantizer on a set of points. 
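A possible calling pattern for the two maintenance methods introduced above, as a hedged sketch: the transposed tables are opt-in, are rebuilt from pq.centroids on demand, and must be refreshed whenever the centroids are edited directly (the helper names below are hypothetical).

    #include <faiss/impl/ProductQuantizer.h>

    // opt a trained ProductQuantizer into the transposed code path
    void enable_transposed_centroids(faiss::ProductQuantizer& pq) {
        // builds the (dsub, M, ksub) table and the per-centroid squared
        // lengths from pq.centroids; re-run after any centroid edit
        pq.sync_transposed_centroids();
    }

    void disable_transposed_centroids(faiss::ProductQuantizer& pq) {
        // drops the extra tables; compute_code() / compute_distance_table()
        // fall back to the regular (M, ksub, dsub) layout
        pq.clear_transposed_centroids();
    }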
A clustering // can be set on input to define non-default clustering parameters - void train(int n, const float* x); + void train(size_t n, const float* x) override; ProductQuantizer( size_t d, /* dimensionality of the input vectors */ @@ -81,7 +88,7 @@ struct ProductQuantizer { void compute_code(const float* x, uint8_t* code) const; /// same as compute_code for several vectors - void compute_codes(const float* x, uint8_t* codes, size_t n) const; + void compute_codes(const float* x, uint8_t* codes, size_t n) const override; /// speed up code assignment using assign_index /// (non-const because the index is changed) @@ -92,7 +99,7 @@ struct ProductQuantizer { /// decode a vector from a given code (or n vectors if third argument) void decode(const uint8_t* code, float* x) const; - void decode(const uint8_t* code, float* x, size_t n) const; + void decode(const uint8_t* code, float* x, size_t n) const override; /// If we happen to have the distance tables precomputed, this is /// more efficient to compute the codes. @@ -166,11 +173,19 @@ struct ProductQuantizer { float_maxheap_array_t* res, bool init_finalize_heap = true) const; - size_t cal_size() { - return sizeof(*this) + centroids.size() * sizeof(float); - } + /// Sync transposed centroids with regular centroids. This call + /// is needed if centroids were edited directly. + void sync_transposed_centroids(); + + /// Clear transposed centroids table so ones are no longer used. + void clear_transposed_centroids(); + + size_t cal_size() const; }; +// block size used in ProductQuantizer::compute_codes +FAISS_API extern int product_quantizer_compute_codes_bs; + /************************************************* * Objects to encode / decode strings of bits *************************************************/ diff --git a/thirdparty/faiss/faiss/impl/Quantizer.h b/thirdparty/faiss/faiss/impl/Quantizer.h new file mode 100644 index 000000000..34673211d --- /dev/null +++ b/thirdparty/faiss/faiss/impl/Quantizer.h @@ -0,0 +1,46 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace faiss { + +/** Product Quantizer. Implemented only for METRIC_L2 */ +struct Quantizer { + size_t d; ///< size of the input vectors + size_t code_size; ///< bytes per indexed vector + + explicit Quantizer(size_t d = 0, size_t code_size = 0) + : d(d), code_size(code_size) {} + + /** Train the quantizer + * + * @param x training vectors, size n * d + */ + virtual void train(size_t n, const float* x) = 0; + + /** Quantize a set of vectors + * + * @param x input vectors, size n * d + * @param codes output codes, size n * code_size + */ + virtual void compute_codes(const float* x, uint8_t* codes, size_t n) + const = 0; + + /** Decode a set of vectors + * + * @param codes input codes, size n * code_size + * @param x output vectors, size n * d + */ + virtual void decode(const uint8_t* code, float* x, size_t n) const = 0; + + virtual ~Quantizer() {} +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ResidualQuantizer.cpp b/thirdparty/faiss/faiss/impl/ResidualQuantizer.cpp index 25ad6dee6..21ab78108 100644 --- a/thirdparty/faiss/faiss/impl/ResidualQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/ResidualQuantizer.cpp @@ -5,31 +5,25 @@ * LICENSE file in the root directory of this source tree. 
*/ -// -*- c++ -*- - #include #include +#include #include #include #include #include -#include -#include -#include - -#include -#include #include #include -#include #include -#include +#include #include #include #include +#include "simd/hook.h" + extern "C" { // general matrix multiplication @@ -47,16 +41,30 @@ int sgemm_( float* beta, float* c, FINTEGER* ldc); + +// http://www.netlib.org/clapack/old/single/sgels.c +// solve least squares + +int sgelsd_( + FINTEGER* m, + FINTEGER* n, + FINTEGER* nrhs, + float* a, + FINTEGER* lda, + float* b, + FINTEGER* ldb, + float* s, + float* rcond, + FINTEGER* rank, + float* work, + FINTEGER* lwork, + FINTEGER* iwork, + FINTEGER* info); } namespace faiss { -ResidualQuantizer::ResidualQuantizer() - : train_type(Train_progressive_dim), - max_beam_size(5), - use_beam_LUT(0), - max_mem_distances(5 * (size_t(1) << 30)), // 5 GiB - assign_index_factory(nullptr) { +ResidualQuantizer::ResidualQuantizer() { d = 0; M = 0; verbose = false; @@ -81,138 +89,43 @@ ResidualQuantizer::ResidualQuantizer( Search_type_t search_type) : ResidualQuantizer(d, std::vector(M, nbits), search_type) {} -void beam_search_encode_step( - size_t d, - size_t K, - const float* cent, /// size (K, d) - size_t n, - size_t beam_size, - const float* residuals, /// size (n, beam_size, d) - size_t m, - const int32_t* codes, /// size (n, beam_size, m) - size_t new_beam_size, - int32_t* new_codes, /// size (n, new_beam_size, m + 1) - float* new_residuals, /// size (n, new_beam_size, d) - float* new_distances, /// size (n, new_beam_size) - Index* assign_index) { - // we have to fill in the whole output matrix - FAISS_THROW_IF_NOT(new_beam_size <= beam_size * K); - - using idx_t = Index::idx_t; - - std::vector cent_distances; - std::vector cent_ids; - - if (assign_index) { - // search beam_size distances per query - FAISS_THROW_IF_NOT(assign_index->d == d); - cent_distances.resize(n * beam_size * new_beam_size); - cent_ids.resize(n * beam_size * new_beam_size); - if (assign_index->ntotal != 0) { - // then we assume the codebooks are already added to the index - FAISS_THROW_IF_NOT(assign_index->ntotal == K); - } else { - assign_index->add(K, cent); - } +void ResidualQuantizer::initialize_from( + const ResidualQuantizer& other, + int skip_M) { + FAISS_THROW_IF_NOT(M + skip_M <= other.M); + FAISS_THROW_IF_NOT(skip_M >= 0); - // printf("beam_search_encode_step -- mem usage %zd\n", - // get_mem_usage_kb()); - assign_index->search( - n * beam_size, - residuals, - new_beam_size, - cent_distances.data(), - cent_ids.data()); - } else { - // do one big distance computation - cent_distances.resize(n * beam_size * K); - pairwise_L2sqr( - d, n * beam_size, residuals, K, cent, cent_distances.data()); - } - InterruptCallback::check(); - -#pragma omp parallel for if (n > 100) - for (int64_t i = 0; i < n; i++) { - const int32_t* codes_i = codes + i * m * beam_size; - int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; - const float* residuals_i = residuals + i * d * beam_size; - float* new_residuals_i = new_residuals + i * d * new_beam_size; - - float* new_distances_i = new_distances + i * new_beam_size; - using C = CMax; - - if (assign_index) { - const float* cent_distances_i = - cent_distances.data() + i * beam_size * new_beam_size; - const idx_t* cent_ids_i = - cent_ids.data() + i * beam_size * new_beam_size; - - // here we could be a tad more efficient by merging sorted arrays - for (int i = 0; i < new_beam_size; i++) { - new_distances_i[i] = C::neutral(); - } - std::vector perm(new_beam_size, -1); 
- heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * new_beam_size); - heap_reorder(new_beam_size, new_distances_i, perm.data()); - - for (int j = 0; j < new_beam_size; j++) { - int js = perm[j] / new_beam_size; - int ls = cent_ids_i[perm[j]]; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); - } - new_codes_i[m] = ls; - new_codes_i += m + 1; - fvec_sub( - d, - residuals_i + js * d, - cent + ls * d, - new_residuals_i); - new_residuals_i += d; - } + Search_type_t this_search_type = search_type; + int this_M = M; - } else { - const float* cent_distances_i = - cent_distances.data() + i * beam_size * K; - // then we have to select the best results - for (int i = 0; i < new_beam_size; i++) { - new_distances_i[i] = C::neutral(); - } - std::vector perm(new_beam_size, -1); - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); - heap_reorder(new_beam_size, new_distances_i, perm.data()); - - for (int j = 0; j < new_beam_size; j++) { - int js = perm[j] / K; - int ls = perm[j] % K; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); - } - new_codes_i[m] = ls; - new_codes_i += m + 1; - fvec_sub( - d, - residuals_i + js * d, - cent + ls * d, - new_residuals_i); - new_residuals_i += d; - } - } + // a first good approximation: override everything + *this = other; + + // adjust derived values + M = this_M; + search_type = this_search_type; + nbits.resize(M); + memcpy(nbits.data(), + other.nbits.data() + skip_M, + nbits.size() * sizeof(nbits[0])); + + set_derived_values(); + + // resize codebooks if trained + if (codebooks.size() > 0) { + FAISS_THROW_IF_NOT(codebooks.size() == other.total_codebook_size * d); + codebooks.resize(total_codebook_size * d); + memcpy(codebooks.data(), + other.codebooks.data() + other.codebook_offsets[skip_M] * d, + codebooks.size() * sizeof(codebooks[0])); + // TODO: norm_tabs? } } +/**************************************************************** + * Training + ****************************************************************/ + void ResidualQuantizer::train(size_t n, const float* x) { codebooks.resize(d * codebook_offsets.back()); @@ -245,8 +158,6 @@ void ResidualQuantizer::train(size_t n, const float* x) { } train_residuals = residuals1; } - train_type_t tt = train_type_t(train_type & 1023); - std::vector codebooks; float obj = 0; @@ -259,7 +170,7 @@ void ResidualQuantizer::train(size_t n, const float* x) { double t1 = getmillisecs(); - if (tt == Train_default) { + if (!(train_type & Train_progressive_dim)) { // regular kmeans Clustering clus(d, K, cp); clus.train( train_residuals.size() / d, @@ -268,7 +179,7 @@ void ResidualQuantizer::train(size_t n, const float* x) { codebooks.swap(clus.centroids); assign_index->reset(); obj = clus.iteration_stats.back().obj; - } else if (tt == Train_progressive_dim) { + } else { // progressive dim clustering ProgressiveDimClustering clus(d, K, cp); ProgressiveDimIndexFactory default_fac; clus.train( @@ -277,8 +188,6 @@ void ResidualQuantizer::train(size_t n, const float* x) { assign_index_factory ? 
*assign_index_factory : default_fac); codebooks.swap(clus.centroids); obj = clus.iteration_stats.back().obj; - } else { - FAISS_THROW_MSG("train type not supported"); } clustering_time += (getmillisecs() - t1) / 1000; @@ -323,7 +232,8 @@ void ResidualQuantizer::train(size_t n, const float* x) { new_codes.data() + i0 * new_beam_size * (m + 1), new_residuals.data() + i0 * new_beam_size * d, new_distances.data() + i0 * new_beam_size, - assign_index.get()); + assign_index.get(), + approx_topk_mode); } codes.swap(new_codes); residuals.swap(new_residuals); @@ -350,6 +260,19 @@ void ResidualQuantizer::train(size_t n, const float* x) { cur_beam_size = new_beam_size; } + is_trained = true; + + if (train_type & Train_refine_codebook) { + for (int iter = 0; iter < niter_codebook_refine; iter++) { + if (verbose) { + printf("re-estimating the codebooks to minimize " + "quantization errors (iter %d).\n", + iter); + } + retrain_AQ_codebook(n, x); + } + } + // find min and max norms std::vector norms(n); @@ -359,33 +282,128 @@ void ResidualQuantizer::train(size_t n, const float* x) { } // fvec_norms_L2sqr(norms.data(), x, d, n); + train_norm(n, norms.data()); - norm_min = HUGE_VALF; - norm_max = -HUGE_VALF; - for (idx_t i = 0; i < n; i++) { - if (norms[i] < norm_min) { - norm_min = norms[i]; + if (!(train_type & Skip_codebook_tables)) { + compute_codebook_tables(); + } +} + +float ResidualQuantizer::retrain_AQ_codebook(size_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG(n >= total_codebook_size, "too few training points"); + + if (verbose) { + printf(" encoding %zd training vectors\n", n); + } + std::vector codes(n * code_size); + compute_codes(x, codes.data(), n); + + // compute reconstruction error + float input_recons_error; + { + std::vector x_recons(n * d); + decode(codes.data(), x_recons.data(), n); + input_recons_error = fvec_L2sqr(x, x_recons.data(), n * d); + if (verbose) { + printf(" input quantization error %g\n", input_recons_error); } - if (norms[i] > norm_max) { - norm_max = norms[i]; + } + + // build matrix of the linear system + std::vector C(n * total_codebook_size); + for (size_t i = 0; i < n; i++) { + BitstringReader bsr(codes.data() + i * code_size, code_size); + for (int m = 0; m < M; m++) { + int idx = bsr.read(nbits[m]); + C[i + (codebook_offsets[m] + idx) * n] = 1; } } - if (search_type == ST_norm_cqint8 || search_type == ST_norm_cqint4) { - size_t k = (1 << 8); - if (search_type == ST_norm_cqint4) { - k = (1 << 4); + // transpose training vectors + std::vector xt(n * d); + + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < d; j++) { + xt[j * n + i] = x[i * d + j]; } - Clustering1D clus(k); - clus.train_exact(n, norms.data()); - qnorm.add(clus.k, clus.centroids.data()); } - is_trained = true; + { // solve least squares + FINTEGER lwork = -1; + FINTEGER di = d, ni = n, tcsi = total_codebook_size; + FINTEGER info = -1, rank = -1; + + float rcond = 1e-4; // this is an important parameter because the code + // matrix can be rank deficient for small problems, + // the default rcond=-1 does not work + float worksize; + std::vector sing_vals(total_codebook_size); + FINTEGER nlvl = 1000; // formula is a bit convoluted so let's take an + // upper bound + std::vector iwork( + 3 * total_codebook_size * nlvl + 11 * total_codebook_size); + + // worksize query + sgelsd_(&ni, + &tcsi, + &di, + C.data(), + &ni, + xt.data(), + &ni, + sing_vals.data(), + &rcond, + &rank, + &worksize, + &lwork, + iwork.data(), + &info); + FAISS_THROW_IF_NOT(info == 0); + + lwork = worksize; + std::vector 
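The train_type bitmask and the new Train_refine_codebook pass above can be combined when building a ResidualQuantizer. A hedged usage sketch (sizes are arbitrary; compute_codes is assumed to be inherited from AdditiveQuantizer and to forward to compute_codes_add_centroids):

    #include <faiss/impl/ResidualQuantizer.h>

    #include <cstdint>
    #include <vector>

    void train_and_encode_rq(
            const float* xt, size_t nt,   // training vectors
            const float* xb, size_t nb) { // vectors to encode
        size_t d = 64, M = 8, nbits = 8;
        faiss::ResidualQuantizer rq(d, M, nbits);

        // progressive-dim clustering plus a few codebook refinement passes
        rq.train_type = faiss::ResidualQuantizer::Train_progressive_dim |
                faiss::ResidualQuantizer::Train_refine_codebook;
        rq.niter_codebook_refine = 3;
        rq.max_beam_size = 16;
        rq.train(nt, xt);

        std::vector<uint8_t> codes(nb * rq.code_size);
        rq.compute_codes(xb, codes.data(), nb);
    }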
work(lwork); + // actual call + sgelsd_(&ni, + &tcsi, + &di, + C.data(), + &ni, + xt.data(), + &ni, + sing_vals.data(), + &rcond, + &rank, + work.data(), + &lwork, + iwork.data(), + &info); + FAISS_THROW_IF_NOT_FMT(info == 0, "SGELS returned info=%d", int(info)); + if (verbose) { + printf(" sgelsd rank=%d/%d\n", + int(rank), + int(total_codebook_size)); + } + } - if (!(train_type & Skip_codebook_tables)) { - compute_codebook_tables(); + // result is in xt, re-transpose to codebook + + for (size_t i = 0; i < total_codebook_size; i++) { + for (size_t j = 0; j < d; j++) { + codebooks[i * d + j] = xt[j * n + i]; + FAISS_THROW_IF_NOT(std::isfinite(codebooks[i * d + j])); + } + } + + float output_recons_error = 0; + for (size_t j = 0; j < d; j++) { + output_recons_error += fvec_norm_L2sqr( + xt.data() + total_codebook_size + n * j, + n - total_codebook_size); + } + if (verbose) { + printf(" output quantization error %g\n", output_recons_error); } + return output_recons_error; } size_t ResidualQuantizer::memory_per_point(int beam_size) const { @@ -395,101 +413,60 @@ size_t ResidualQuantizer::memory_per_point(int beam_size) const { size_t mem; mem = beam_size * d * 2 * sizeof(float); // size for 2 beams at a time mem += beam_size * beam_size * - (sizeof(float) + - sizeof(Index::idx_t)); // size for 1 beam search result + (sizeof(float) + sizeof(idx_t)); // size for 1 beam search result return mem; } -void ResidualQuantizer::compute_codes( +/**************************************************************** + * Encoding + ****************************************************************/ + +using namespace rq_encode_steps; + +void ResidualQuantizer::compute_codes_add_centroids( const float* x, uint8_t* codes_out, - size_t n) const { + size_t n, + const float* centroids) const { FAISS_THROW_IF_NOT_MSG(is_trained, "RQ is not trained yet."); + // size_t mem = memory_per_point(); - if (n > 1 && mem * n > max_mem_distances) { - // then split queries to reduce temp memory - size_t bs = max_mem_distances / mem; - if (bs == 0) { - bs = 1; // otherwise we can't do much - } - for (size_t i0 = 0; i0 < n; i0 += bs) { - size_t i1 = std::min(n, i0 + bs); - compute_codes(x + i0 * d, codes_out + i0 * code_size, i1 - i0); - } - return; + + size_t bs = max_mem_distances / mem; + if (bs == 0) { + bs = 1; // otherwise we can't do much } - std::vector codes(max_beam_size * M * n); - std::vector norms; - std::vector distances(max_beam_size * n); - - if (use_beam_LUT == 0) { - std::vector residuals(max_beam_size * n * d); - - refine_beam( - n, - 1, - x, - max_beam_size, - codes.data(), - residuals.data(), - distances.data()); - - if (search_type == ST_norm_float || search_type == ST_norm_qint8 || - search_type == ST_norm_qint4) { - norms.resize(n); - // recover the norms of reconstruction as - // || original_vector - residual ||^2 - for (size_t i = 0; i < n; i++) { - norms[i] = fvec_L2sqr( - x + i * d, residuals.data() + i * max_beam_size * d, d); - } - } - } else if (use_beam_LUT == 1) { - FAISS_THROW_IF_NOT_MSG( - codebook_cross_products.size() == - total_codebook_size * total_codebook_size, - "call compute_codebook_tables first"); - - std::vector query_norms(n); - fvec_norms_L2sqr(query_norms.data(), x, d, n); - - std::vector query_cp(n * total_codebook_size); - { - FINTEGER ti = total_codebook_size, di = d, ni = n; - float zero = 0, one = 1; - sgemm_("Transposed", - "Not transposed", - &ti, - &ni, - &di, - &one, - codebooks.data(), - &di, - x, - &di, - &zero, - query_cp.data(), - &ti); + // prepare memory pools + 
ComputeCodesAddCentroidsLUT0MemoryPool pool0; + ComputeCodesAddCentroidsLUT1MemoryPool pool1; + + for (size_t i0 = 0; i0 < n; i0 += bs) { + size_t i1 = std::min(n, i0 + bs); + const float* cent = nullptr; + if (centroids != nullptr) { + cent = centroids + i0 * d; } - refine_beam_LUT( - n, - query_norms.data(), - query_cp.data(), - max_beam_size, - codes.data(), - distances.data()); + if (use_beam_LUT == 0) { + compute_codes_add_centroids_mp_lut0( + *this, + x + i0 * d, + codes_out + i0 * code_size, + i1 - i0, + cent, + pool0); + } else if (use_beam_LUT == 1) { + compute_codes_add_centroids_mp_lut1( + *this, + x + i0 * d, + codes_out + i0 * code_size, + i1 - i0, + cent, + pool1); + } } - // pack only the first code of the beam (hence the ld_codes=M * - // max_beam_size) - pack_codes( - n, - codes.data(), - codes_out, - M * max_beam_size, - norms.size() > 0 ? norms.data() : nullptr); } void ResidualQuantizer::refine_beam( @@ -500,82 +477,17 @@ void ResidualQuantizer::refine_beam( int32_t* out_codes, float* out_residuals, float* out_distances) const { - int cur_beam_size = beam_size; - - std::vector residuals(x, x + n * d * beam_size); - std::vector codes; - std::vector distances; - double t0 = getmillisecs(); - - std::unique_ptr assign_index; - if (assign_index_factory) { - assign_index.reset((*assign_index_factory)(d)); - } else { - assign_index.reset(new IndexFlatL2(d)); - } - - for (int m = 0; m < M; m++) { - int K = 1 << nbits[m]; - - const float* codebooks_m = - this->codebooks.data() + codebook_offsets[m] * d; - - int new_beam_size = std::min(cur_beam_size * K, out_beam_size); - - std::vector new_codes(n * new_beam_size * (m + 1)); - std::vector new_residuals(n * new_beam_size * d); - distances.resize(n * new_beam_size); - - beam_search_encode_step( - d, - K, - codebooks_m, - n, - cur_beam_size, - residuals.data(), - m, - codes.data(), - new_beam_size, - new_codes.data(), - new_residuals.data(), - distances.data(), - assign_index.get()); - - assign_index->reset(); - - codes.swap(new_codes); - residuals.swap(new_residuals); - - cur_beam_size = new_beam_size; - - if (verbose) { - float sum_distances = 0; - for (int j = 0; j < distances.size(); j++) { - sum_distances += distances[j]; - } - printf("[%.3f s] encode stage %d, %d bits, " - "total error %g, beam_size %d\n", - (getmillisecs() - t0) / 1000, - m, - int(nbits[m]), - sum_distances, - cur_beam_size); - } - } - - if (out_codes) { - memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); - } - if (out_residuals) { - memcpy(out_residuals, - residuals.data(), - residuals.size() * sizeof(residuals[0])); - } - if (out_distances) { - memcpy(out_distances, - distances.data(), - distances.size() * sizeof(distances[0])); - } + RefineBeamMemoryPool pool; + refine_beam_mp( + *this, + n, + beam_size, + x, + out_beam_size, + out_codes, + out_residuals, + out_distances, + pool); } /******************************************************************* @@ -583,109 +495,36 @@ void ResidualQuantizer::refine_beam( *******************************************************************/ void ResidualQuantizer::compute_codebook_tables() { - codebook_cross_products.resize(total_codebook_size * total_codebook_size); cent_norms.resize(total_codebook_size); - // stricly speaking we could use ssyrk - { - FINTEGER ni = total_codebook_size; + fvec_norms_L2sqr( + cent_norms.data(), codebooks.data(), d, total_codebook_size); + size_t cross_table_size = 0; + for (int m = 0; m < M; m++) { + size_t K = (size_t)1 << nbits[m]; + cross_table_size += K * 
codebook_offsets[m]; + } + codebook_cross_products.resize(cross_table_size); + size_t ofs = 0; + for (int m = 1; m < M; m++) { + FINTEGER ki = (size_t)1 << nbits[m]; + FINTEGER kk = codebook_offsets[m]; FINTEGER di = d; float zero = 0, one = 1; + assert(ofs + ki * kk <= cross_table_size); sgemm_("Transposed", "Not transposed", - &ni, - &ni, + &ki, + &kk, &di, &one, - codebooks.data(), + codebooks.data() + d * kk, &di, codebooks.data(), &di, &zero, - codebook_cross_products.data(), - &ni); - } - for (size_t i = 0; i < total_codebook_size; i++) { - cent_norms[i] = codebook_cross_products[i + i * total_codebook_size]; - } -} - -void beam_search_encode_step_tab( - size_t K, - size_t n, - size_t beam_size, // input sizes - const float* codebook_cross_norms, // size K * ldc - size_t ldc, // >= K - const uint64_t* codebook_offsets, // m - const float* query_cp, // size n * ldqc - size_t ldqc, // >= K - const float* cent_norms_i, // size K - size_t m, - const int32_t* codes, // n * beam_size * m - const float* distances, // n * beam_size - size_t new_beam_size, - int32_t* new_codes, // n * new_beam_size * (m + 1) - float* new_distances) // n * new_beam_size -{ - FAISS_THROW_IF_NOT(ldc >= K); - -#pragma omp parallel for if (n > 100) - for (int64_t i = 0; i < n; i++) { - std::vector cent_distances(beam_size * K); - std::vector cd_common(K); - - const int32_t* codes_i = codes + i * m * beam_size; - const float* query_cp_i = query_cp + i * ldqc; - const float* distances_i = distances + i * beam_size; - - for (size_t k = 0; k < K; k++) { - cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; - } - - for (size_t b = 0; b < beam_size; b++) { - std::vector dp(K); - - for (size_t m1 = 0; m1 < m; m1++) { - size_t c = codes_i[b * m + m1]; - const float* cb = - &codebook_cross_norms[(codebook_offsets[m1] + c) * ldc]; - fvec_add(K, cb, dp.data(), dp.data()); - } - - for (size_t k = 0; k < K; k++) { - cent_distances[b * K + k] = - distances_i[b] + cd_common[k] + 2 * dp[k]; - } - } - - using C = CMax; - int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; - float* new_distances_i = new_distances + i * new_beam_size; - - const float* cent_distances_i = cent_distances.data(); - - // then we have to select the best results - for (int i = 0; i < new_beam_size; i++) { - new_distances_i[i] = C::neutral(); - } - std::vector perm(new_beam_size, -1); - heap_addn( - new_beam_size, - new_distances_i, - perm.data(), - cent_distances_i, - nullptr, - beam_size * K); - heap_reorder(new_beam_size, new_distances_i, perm.data()); - - for (int j = 0; j < new_beam_size; j++) { - int js = perm[j] / K; - int ls = perm[j] % K; - if (m > 0) { - memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); - } - new_codes_i[m] = ls; - new_codes_i += m + 1; - } + codebook_cross_products.data() + ofs, + &ki); + ofs += ki * kk; } } @@ -696,63 +535,16 @@ void ResidualQuantizer::refine_beam_LUT( int out_beam_size, int32_t* out_codes, float* out_distances) const { - int beam_size = 1; - - std::vector codes; - std::vector distances(query_norms, query_norms + n); - double t0 = getmillisecs(); - - for (int m = 0; m < M; m++) { - int K = 1 << nbits[m]; - - int new_beam_size = std::min(beam_size * K, out_beam_size); - std::vector new_codes(n * new_beam_size * (m + 1)); - std::vector new_distances(n * new_beam_size); - - beam_search_encode_step_tab( - K, - n, - beam_size, - codebook_cross_products.data() + codebook_offsets[m], - total_codebook_size, - codebook_offsets.data(), - query_cp + codebook_offsets[m], - total_codebook_size, - 
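The restructured compute_codebook_tables above only stores dot products of each codebook with the codebooks that precede it, because that is all the LUT beam search consumes: expanding the candidate distance leaves the carried beam distance, the centroid norm, the query-centroid dot product, and cross terms with the m-1 already chosen centroids. A scalar sketch of that expansion (the caller is assumed to have gathered the relevant cross products from codebook_cross_products using the chosen codes):

    #include <cstddef>

    // ||x - (c_1 + ... + c_{m-1}) - c||^2
    //   = prev_dis + ||c||^2 - 2<x, c> + 2 * sum_{j < m} <c_j, c>
    float beam_lut_candidate_score(
            float prev_dis,     // distance carried by the beam entry
            float cent_norm,    // ||c||^2, from cent_norms
            float query_dp,     // <x, c>, from the per-query sgemm (query_cp)
            const float* cross, // <c_j, c> for the m-1 chosen centroids
            size_t m_minus_1) {
        float acc = 0.0f;
        for (size_t j = 0; j < m_minus_1; j++) {
            acc += cross[j];
        }
        return prev_dis + cent_norm - 2.0f * query_dp + 2.0f * acc;
    }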
cent_norms.data() + codebook_offsets[m], - m, - codes.data(), - distances.data(), - new_beam_size, - new_codes.data(), - new_distances.data()); - - codes.swap(new_codes); - distances.swap(new_distances); - beam_size = new_beam_size; - - if (verbose) { - float sum_distances = 0; - for (int j = 0; j < distances.size(); j++) { - sum_distances += distances[j]; - } - printf("[%.3f s] encode stage %d, %d bits, " - "total error %g, beam_size %d\n", - (getmillisecs() - t0) / 1000, - m, - int(nbits[m]), - sum_distances, - beam_size); - } - } - - if (out_codes) { - memcpy(out_codes, codes.data(), codes.size() * sizeof(codes[0])); - } - if (out_distances) { - memcpy(out_distances, - distances.data(), - distances.size() * sizeof(distances[0])); - } + RefineBeamLUTMemoryPool pool; + refine_beam_LUT_mp( + *this, + n, + query_norms, + query_cp, + out_beam_size, + out_codes, + out_distances, + pool); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ResidualQuantizer.h b/thirdparty/faiss/faiss/impl/ResidualQuantizer.h index 775a2eae5..95677d888 100644 --- a/thirdparty/faiss/faiss/impl/ResidualQuantizer.h +++ b/thirdparty/faiss/faiss/impl/ResidualQuantizer.h @@ -13,6 +13,8 @@ #include #include +#include + namespace faiss { /** Residual quantizer with variable number of bits per sub-quantizer @@ -24,42 +26,48 @@ namespace faiss { struct ResidualQuantizer : AdditiveQuantizer { /// initialization - enum train_type_t { - Train_default = 0, ///< regular k-means - Train_progressive_dim = 1, ///< progressive dim clustering - Train_default_Train_top_beam = 1024, - Train_progressive_dim_Train_top_beam = 1025, - Train_default_Skip_codebook_tables = 2048, - Train_progressive_dim_Skip_codebook_tables = 2049, - Train_default_Train_top_beam_Skip_codebook_tables = 3072, - Train_progressive_dim_Train_top_beam_Skip_codebook_tables = 3073, - }; - - train_type_t train_type; - - // set this bit on train_type if beam is to be trained only on the - // first element of the beam (faster but less accurate) + + // Was enum but that does not work so well with bitmasks + using train_type_t = int; + + /// Binary or of the Train_* flags below + train_type_t train_type = Train_progressive_dim; + + /// regular k-means (minimal amount of computation) + static const int Train_default = 0; + + /// progressive dim clustering (set by default) + static const int Train_progressive_dim = 1; + + /// do a few iterations of codebook refinement after first level estimation + static const int Train_refine_codebook = 2; + + /// number of iterations for codebook refinement. + int niter_codebook_refine = 5; + + /** set this bit on train_type if beam is to be trained only on the + * first element of the beam (faster but less accurate) */ static const int Train_top_beam = 1024; - // set this bit to not autmatically compute the codebook tables - // after training + /** set this bit to *not* autmatically compute the codebook tables + * after training */ static const int Skip_codebook_tables = 2048; /// beam size used for training and for encoding - int max_beam_size; + int max_beam_size = 5; /// use LUT for beam search - int use_beam_LUT; + int use_beam_LUT = 0; - /// distance matrixes with beam search can get large, so use this - /// to batch computations at encoding time. - size_t max_mem_distances; + /// Currently used mode of approximate min-k computations. + /// Default value is EXACT_TOPK. 
+ ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK; /// clustering parameters ProgressiveDimClusteringParameters cp; /// if non-NULL, use this index for assignment - ProgressiveDimIndexFactory* assign_index_factory; + ProgressiveDimIndexFactory* assign_index_factory = nullptr; ResidualQuantizer( size_t d, @@ -74,15 +82,33 @@ struct ResidualQuantizer : AdditiveQuantizer { ResidualQuantizer(); - // Train the residual quantizer + /// Train the residual quantizer void train(size_t n, const float* x) override; + /// Copy the M codebook levels from other, starting from skip_M + void initialize_from(const ResidualQuantizer& other, int skip_M = 0); + + /** Encode the vectors and compute codebook that minimizes the quantization + * error on these codes + * + * @param x training vectors, size n * d + * @param n nb of training vectors, n >= total_codebook_size + * @return returns quantization error for the new codebook with old + * codes + */ + float retrain_AQ_codebook(size_t n, const float* x); + /** Encode a set of vectors * * @param x vectors to encode, size n * d * @param codes output codes, size n * code_size + * @param centroids centroids to be added to x, size n * d */ - void compute_codes(const float* x, uint8_t* codes, size_t n) const override; + void compute_codes_add_centroids( + const float* x, + uint8_t* codes, + size_t n, + const float* centroids = nullptr) const override; /** lower-level encode function * @@ -118,71 +144,15 @@ struct ResidualQuantizer : AdditiveQuantizer { */ size_t memory_per_point(int beam_size = -1) const; - /** Cross products used in codebook tables - * - * These are used to keep trak of norms of centroids. + /** Cross products used in codebook tables used for beam_LUT = 1 */ void compute_codebook_tables(); - /// dot products of all codebook vectors with each other - /// size total_codebook_size * total_codebook_size + /// dot products of all codebook entries with the previous codebooks + /// size sum(codebook_offsets[m] * 2^nbits[m], m=0..M-1) std::vector codebook_cross_products; - /// norms of all vectors + /// norms of all codebook entries (size total_codebook_size) std::vector cent_norms; }; -/** Encode a residual by sampling from a centroid table. - * - * This is a single encoding step the residual quantizer. - * It allows low-level access to the encoding function, exposed mainly for unit - * tests. 
- * - * @param n number of vectors to hanlde - * @param residuals vectors to encode, size (n, beam_size, d) - * @param cent centroids, size (K, d) - * @param beam_size input beam size - * @param m size of the codes for the previous encoding steps - * @param codes code array for the previous steps of the beam (n, - * beam_size, m) - * @param new_beam_size output beam size (should be <= K * beam_size) - * @param new_codes output codes, size (n, new_beam_size, m + 1) - * @param new_residuals output residuals, size (n, new_beam_size, d) - * @param new_distances output distances, size (n, new_beam_size) - * @param assign_index if non-NULL, will be used to perform assignment - */ -void beam_search_encode_step( - size_t d, - size_t K, - const float* cent, - size_t n, - size_t beam_size, - const float* residuals, - size_t m, - const int32_t* codes, - size_t new_beam_size, - int32_t* new_codes, - float* new_residuals, - float* new_distances, - Index* assign_index = nullptr); - -/** Encode a set of vectors using their dot products with the codebooks - * - */ -void beam_search_encode_step_tab( - size_t K, - size_t n, - size_t beam_size, // input sizes - const float* codebook_cross_norms, // size K * ldc - size_t ldc, // >= K - const uint64_t* codebook_offsets, // m - const float* query_cp, // size n * ldqc - size_t ldqc, // >= K - const float* cent_norms_i, // size K - size_t m, - const int32_t* codes, // n * beam_size * m - const float* distances, // n * beam_size - size_t new_beam_size, - int32_t* new_codes, // n * new_beam_size * (m + 1) - float* new_distances); // n * new_beam_size - }; // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ResultHandler.h b/thirdparty/faiss/faiss/impl/ResultHandler.h index f9c5027a5..76762143f 100644 --- a/thirdparty/faiss/faiss/impl/ResultHandler.h +++ b/thirdparty/faiss/faiss/impl/ResultHandler.h @@ -12,6 +12,7 @@ #pragma once #include +#include #include #include @@ -26,14 +27,14 @@ struct HeapResultHandler { using T = typename C::T; using TI = typename C::TI; - int nq; - T* heap_dis_tab; - TI* heap_ids_tab; - bool own_fields; + int nq = 0; + T* heap_dis_tab = nullptr; + TI* heap_ids_tab = nullptr; + bool own_fields = false; - int64_t k; // number of results to keep + int64_t k = 0; // number of results to keep - HeapResultHandler() {} + HeapResultHandler() = default; HeapResultHandler(size_t nq, T* heap_dis_tab, TI* heap_ids_tab, size_t k) : nq(nq), @@ -50,6 +51,7 @@ struct HeapResultHandler { } HeapResultHandler* clone_n(int n, size_t block_x) { + // todo aguzhva: potential memory leak HeapResultHandler* ress = new HeapResultHandler[n]; T* global_heap_dis_tab = (T*)malloc(block_x * k * n * sizeof(T)); @@ -120,7 +122,7 @@ struct HeapResultHandler { /// add results for query i0..i1 and j0..j1 void add_results(size_t j0, size_t j1, const T* dis_tab, - BitsetView bitset = nullptr) { + const IDSelector* sel = nullptr) { #pragma omp parallel for for (int64_t i = i0; i < i1; i++) { T* heap_dis = heap_dis_tab + i * k; @@ -128,7 +130,7 @@ struct HeapResultHandler { const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0; T thresh = heap_dis[0]; for (size_t j = j0; j < j1; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { T dis = dis_tab_i[j]; if (C::cmp(thresh, dis)) { heap_replace_top(k, heap_dis, heap_ids, dis, j); @@ -148,6 +150,7 @@ struct HeapResultHandler { } void merge(size_t i, HeapResultHandler &rh) { + // todo aguzhva: no checks for matching sizes const size_t ki = i * k, uj = ki + k; for (size_t j = ki; j < uj; ++j) 
{ add_single_result(i, rh.heap_dis_tab[j], rh.heap_ids_tab[j]); @@ -163,6 +166,7 @@ struct HeapResultHandler { } void copy_from(HeapResultHandler &res, size_t x_from, size_t size) { + // todo aguzhva: no checks for matching sizes memcpy(heap_dis_tab + x_from * k, res.heap_dis_tab, size * k * sizeof(T)); memcpy(heap_ids_tab + x_from * k, res.heap_ids_tab, size * k * sizeof(TI)); } @@ -242,13 +246,13 @@ struct ReservoirResultHandler { using T = typename C::T; using TI = typename C::TI; - int nq; - T* heap_dis_tab; - TI* heap_ids_tab; + int nq = 0; + T* heap_dis_tab = nullptr; + TI* heap_ids_tab = nullptr; - int64_t k; // number of results to keep - size_t capacity; // capacity of the reservoirs - bool own_fields; + int64_t k = 0; // number of results to keep + size_t capacity = 0; // capacity of the reservoirs + bool own_fields = false; ReservoirResultHandler( size_t nq, @@ -264,7 +268,7 @@ struct ReservoirResultHandler { capacity = (2 * k + 15) & ~15; } - ReservoirResultHandler() {} + ReservoirResultHandler() = default; ~ReservoirResultHandler() { if (own_fields) { @@ -274,6 +278,7 @@ struct ReservoirResultHandler { } ReservoirResultHandler *clone_n(int n, size_t block_x) { + // todo aguzhva: potential memory leak ReservoirResultHandler *ress = new ReservoirResultHandler[n]; T* global_heap_dis_tab = (T*)malloc(block_x * k * n * sizeof(T)); @@ -361,14 +366,14 @@ struct ReservoirResultHandler { /// add results for query i0..i1 and j0..j1 void add_results(size_t j0, size_t j1, const T* dis_tab, - BitsetView bitset = nullptr) { + const IDSelector* sel = nullptr) { // maybe parallel for #pragma omp parallel for for (int64_t i = i0; i < i1; i++) { ReservoirTopN& reservoir = reservoirs[i - i0]; const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0; for (size_t j = j0; j < j1; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { T dis = dis_tab_i[j]; reservoir.add(dis, j); } @@ -381,6 +386,7 @@ struct ReservoirResultHandler { } void merge(size_t i, ReservoirResultHandler &rh) { + // todo aguzhva: no checks for matching sizes const size_t ii = i - rh.i0; const T* dis = rh.reservoir_dis.data() + ii * rh.capacity; const TI* ids = rh.reservoir_ids.data() + ii * rh.capacity; @@ -399,6 +405,7 @@ struct ReservoirResultHandler { } void copy_from(ReservoirResultHandler &res, size_t x_from, size_t size) { + // todo aguzhva: no checks for matching sizes memcpy(heap_dis_tab + x_from * k, res.heap_dis_tab, size * k * sizeof(T)); memcpy(heap_ids_tab + x_from * k, res.heap_ids_tab, size * k * sizeof(TI)); } @@ -472,7 +479,7 @@ struct RangeSearchResultHandler { /// add results for query i0..i1 and j0..j1 void add_results(size_t j0, size_t j1, const T* dis_tab, - BitsetView bitset = nullptr) { + const IDSelector* sel = nullptr) { RangeSearchPartialResult* pres; // there is one RangeSearchPartialResult structure per j0 // (= block of columns of the large distance matrix) @@ -496,8 +503,9 @@ struct RangeSearchResultHandler { for (size_t i = i0; i < i1; i++) { const float* ip_line = dis_tab + (i - i0) * (j1 - j0); RangeQueryResult& qres = pres->new_result(i); + for (size_t j = j0; j < j1; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { float dis = *ip_line; if (C::cmp(radius, dis)) { qres.add(dis, j); @@ -517,4 +525,100 @@ struct RangeSearchResultHandler { } }; +/***************************************************************** + * Single best result handler. + * Tracks the only best result, thus avoiding storing + * some temporary data in memory. 
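The handlers above now filter through the IDSelector interface instead of a BitsetView: an id j is visited when no selector is given or sel->is_member(j) is true. A hedged sketch of feeding one of the stock selectors into the blockwise add_results path (begin_multiple/end_multiple are assumed unchanged from the upstream handler interface; dis_tab is an nq x nb block of precomputed distances):

    #include <faiss/impl/IDSelector.h>
    #include <faiss/impl/ResultHandler.h>
    #include <faiss/utils/Heap.h>

    #include <cstdint>
    #include <vector>

    // keep the k smallest L2 distances per query, restricted to ids in [100, 200)
    void collect_topk_filtered(
            size_t nq, size_t nb, size_t k, const float* dis_tab,
            std::vector<float>& distances, std::vector<int64_t>& labels) {
        using C = faiss::CMax<float, int64_t>; // L2: smaller distance wins

        distances.resize(nq * k);
        labels.resize(nq * k);

        faiss::HeapResultHandler<C> handler(
                nq, distances.data(), labels.data(), k);
        faiss::IDSelectorRange sel(100, 200); // is_member(j) <=> 100 <= j < 200

        handler.begin_multiple(0, nq);
        handler.add_results(0, nb, dis_tab, &sel);
        handler.end_multiple();
    }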
+ *****************************************************************/ + +template +struct SingleBestResultHandler { + using T = typename C::T; + using TI = typename C::TI; + + int nq; + // contains exactly nq elements + T* dis_tab; + // contains exactly nq elements + TI* ids_tab; + + SingleBestResultHandler(size_t nq, T* dis_tab, TI* ids_tab) + : nq(nq), dis_tab(dis_tab), ids_tab(ids_tab) {} + + struct SingleResultHandler { + SingleBestResultHandler& hr; + + T min_dis; + TI min_idx; + size_t current_idx = 0; + + SingleResultHandler(SingleBestResultHandler& hr) : hr(hr) {} + + /// begin results for query # i + void begin(const size_t current_idx) { + this->current_idx = current_idx; + min_dis = HUGE_VALF; + min_idx = -1; + } + + /// add one result for query i + void add_result(T dis, TI idx) { + if (C::cmp(min_dis, dis)) { + min_dis = dis; + min_idx = idx; + } + } + + /// series of results for query i is done + void end() { + hr.dis_tab[current_idx] = min_dis; + hr.ids_tab[current_idx] = min_idx; + } + }; + + size_t i0, i1; + + /// begin + void begin_multiple(size_t i0, size_t i1) { + this->i0 = i0; + this->i1 = i1; + + for (size_t i = i0; i < i1; i++) { + this->dis_tab[i] = HUGE_VALF; + } + } + + /// add results for query i0..i1 and j0..j1 + void add_results(size_t j0, size_t j1, const T* dis_tab) { + for (int64_t i = i0; i < i1; i++) { + const T* dis_tab_i = dis_tab + (j1 - j0) * (i - i0) - j0; + + auto& min_distance = this->dis_tab[i]; + auto& min_index = this->ids_tab[i]; + + for (size_t j = j0; j < j1; j++) { + const T distance = dis_tab_i[j]; + + if (C::cmp(min_distance, distance)) { + min_distance = distance; + min_index = j; + } + } + } + } + + void add_result(const size_t i, const T dis, const TI idx) { + auto& min_distance = this->dis_tab[i]; + auto& min_index = this->ids_tab[i]; + + if (C::cmp(min_distance, dis)) { + min_distance = dis; + min_index = idx; + } + } + + /// series of results for queries i0..i1 is done + void end_multiple() {} +}; + } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp index 72c556ad2..2c81a3558 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.cpp @@ -20,12 +20,21 @@ #endif #include +#include #include #include +#include +#include #include +#include + namespace faiss { +using QuantizerType = ScalarQuantizer::QuantizerType; +using RangeStat = ScalarQuantizer::RangeStat; +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; + /******************************************************************* * ScalarQuantizer implementation * @@ -55,39 +64,30 @@ namespace faiss { ********************************************************************/ ScalarQuantizer::ScalarQuantizer(size_t d, QuantizerType qtype) - : qtype(qtype), - rangestat(RangeStat::RS_minmax), - rangestat_arg(0), - d(d) { + : Quantizer(d), qtype(qtype) { set_derived_sizes(); } -ScalarQuantizer::ScalarQuantizer() - : qtype(QuantizerType::QT_8bit), - rangestat(RangeStat::RS_minmax), - rangestat_arg(0), - d(0), - bits(0), - code_size(0) {} +ScalarQuantizer::ScalarQuantizer() {} void ScalarQuantizer::set_derived_sizes() { switch (qtype) { - case QuantizerType::QT_8bit: - case QuantizerType::QT_8bit_uniform: - case QuantizerType::QT_8bit_direct: + case QT_8bit: + case QT_8bit_uniform: + case QT_8bit_direct: code_size = d; bits = 8; break; - case QuantizerType::QT_4bit: - case QuantizerType::QT_4bit_uniform: + case QT_4bit: + case QT_4bit_uniform: code_size 
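SingleBestResultHandler above is the k == 1 case: it keeps a single running minimum per query instead of a heap, which avoids the per-query heap storage. A small usage sketch against the same blockwise interface:

    #include <faiss/impl/ResultHandler.h>
    #include <faiss/utils/Heap.h>

    #include <cstdint>
    #include <vector>

    // 1-NN per query from an nq x nb block of L2 distances
    void collect_1nn(
            size_t nq, size_t nb, const float* dis_tab,
            std::vector<float>& best_dis, std::vector<int64_t>& best_ids) {
        using C = faiss::CMax<float, int64_t>; // smaller distance wins

        best_dis.resize(nq);
        best_ids.resize(nq);

        faiss::SingleBestResultHandler<C> handler(
                nq, best_dis.data(), best_ids.data());
        handler.begin_multiple(0, nq); // seeds every distance with HUGE_VALF
        handler.add_results(0, nb, dis_tab);
        handler.end_multiple();
        // best_dis[i], best_ids[i] now hold the nearest neighbor of query i
    }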
= (d + 1) / 2; bits = 4; break; - case QuantizerType::QT_6bit: + case QT_6bit: code_size = (d * 6 + 7) / 8; bits = 6; break; - case QuantizerType::QT_fp16: + case QT_fp16: code_size = d * 2; bits = 16; break; @@ -95,16 +95,16 @@ void ScalarQuantizer::set_derived_sizes() { } void ScalarQuantizer::train(size_t n, const float* x) { - int bit_per_dim = qtype == QuantizerType::QT_4bit_uniform ? 4 - : qtype == QuantizerType::QT_4bit ? 4 - : qtype == QuantizerType::QT_6bit ? 6 - : qtype == QuantizerType::QT_8bit_uniform ? 8 - : qtype == QuantizerType::QT_8bit ? 8 - : -1; + int bit_per_dim = qtype == QT_4bit_uniform ? 4 + : qtype == QT_4bit ? 4 + : qtype == QT_6bit ? 6 + : qtype == QT_8bit_uniform ? 8 + : qtype == QT_8bit ? 8 + : -1; switch (qtype) { - case QuantizerType::QT_4bit_uniform: - case QuantizerType::QT_8bit_uniform: + case QT_4bit_uniform: + case QT_8bit_uniform: train_Uniform( rangestat, rangestat_arg, @@ -113,9 +113,9 @@ void ScalarQuantizer::train(size_t n, const float* x) { x, trained); break; - case QuantizerType::QT_4bit: - case QuantizerType::QT_8bit: - case QuantizerType::QT_6bit: + case QT_4bit: + case QT_8bit: + case QT_6bit: train_NonUniform( rangestat, rangestat_arg, @@ -125,42 +125,21 @@ void ScalarQuantizer::train(size_t n, const float* x) { x, trained); break; - case QuantizerType::QT_fp16: - case QuantizerType::QT_8bit_direct: + case QT_fp16: + case QT_8bit_direct: // no training necessary break; } } -void ScalarQuantizer::train_residual( - size_t n, - const float* x, - Index* quantizer, - bool by_residual, - bool verbose) { - const float* x_in = x; - - // 100k points more than enough - x = fvecs_maybe_subsample(d, (size_t*)&n, 100000, x, verbose, 1234); - - ScopeDeleter del_x(x_in == x ? nullptr : x); - - if (by_residual) { - std::vector idx(n); - quantizer->assign(n, x, idx.data()); - - std::vector residuals(n * d); - quantizer->compute_residual_n(n, x, residuals.data(), idx.data()); - - train(n, residuals.data()); - } else { - train(n, x); - } +ScalarQuantizer::SQuantizer* ScalarQuantizer::select_quantizer() const { + /* use hook to decide use AVX512 or not */ + return sq_sel_quantizer(qtype, d, trained); } void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) const { - std::unique_ptr squant(select_quantizer()); + std::unique_ptr squant(select_quantizer()); memset(codes, 0, code_size * n); #pragma omp parallel for @@ -169,24 +148,13 @@ void ScalarQuantizer::compute_codes(const float* x, uint8_t* codes, size_t n) } void ScalarQuantizer::decode(const uint8_t* codes, float* x, size_t n) const { - std::unique_ptr squant(select_quantizer()); + std::unique_ptr squant(select_quantizer()); #pragma omp parallel for for (int64_t i = 0; i < n; i++) squant->decode_vector(codes + i * code_size, x + i * d); } -/******************************************************************* - * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object - * - * It is an InvertedListScanner, but is designed to work with - * IndexScalarQuantizer as well. 
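With the class reworked as above, the ScalarQuantizer follows the common Quantizer interface for training, encoding and decoding. A minimal round-trip sketch (dimension and quantizer type chosen arbitrarily):

    #include <faiss/impl/ScalarQuantizer.h>

    #include <cstdint>
    #include <vector>

    void sq_roundtrip(const float* xt, size_t nt, const float* xb, size_t nb) {
        size_t d = 128;
        faiss::ScalarQuantizer sq(d, faiss::ScalarQuantizer::QT_8bit);
        sq.rangestat = faiss::ScalarQuantizer::RS_minmax; // per-dimension [min, max]
        sq.train(nt, xt);

        std::vector<uint8_t> codes(nb * sq.code_size); // code_size == d for QT_8bit
        sq.compute_codes(xb, codes.data(), nb);

        std::vector<float> decoded(nb * d);
        sq.decode(codes.data(), decoded.data(), nb);
    }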
- ********************************************************************/ -Quantizer* ScalarQuantizer::select_quantizer() const { - /* use hook to decide use AVX512 or not */ - return sq_sel_quantizer(qtype, d, trained); -} - SQDistanceComputer* ScalarQuantizer::get_distance_computer( MetricType metric) const { FAISS_THROW_IF_NOT(metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); @@ -194,14 +162,26 @@ SQDistanceComputer* ScalarQuantizer::get_distance_computer( return sq_get_distance_computer(metric, qtype, d, trained); } +size_t ScalarQuantizer::cal_size() const { + return sizeof(*this) + trained.size() * sizeof(float); +} + +/******************************************************************* + * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object + * + * It is an InvertedListScanner, but is designed to work with + * IndexScalarQuantizer as well. + ********************************************************************/ + InvertedListScanner* ScalarQuantizer::select_InvertedListScanner( MetricType mt, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool by_residual) const { /* use hook to decide use AVX512 or not */ return sq_sel_inv_list_scanner(mt, this, quantizer, d, store_pairs, - by_residual); + sel, by_residual); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h index 375764f7d..a6ac1a67c 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizer.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizer.h @@ -9,31 +9,50 @@ #pragma once -#include -#include +#include +#include +#include namespace faiss { +struct InvertedListScanner; + /** * The uniform quantizer has a range [vmin, vmax]. The range can be * the same for all dimensions (uniform) or specific per dimension * (default). */ -struct ScalarQuantizer { - QuantizerType qtype; - - RangeStat rangestat; - float rangestat_arg; - - /// dimension of input vectors - size_t d; +struct ScalarQuantizer : Quantizer { + enum QuantizerType { + QT_8bit, ///< 8 bits per component + QT_4bit, ///< 4 bits per component + QT_8bit_uniform, ///< same, shared range for all dimensions + QT_4bit_uniform, + QT_fp16, + QT_8bit_direct, ///< fast indexing of uint8s + QT_6bit, ///< 6 bits per component + }; + + QuantizerType qtype = QT_8bit; + + /** The uniform encoder can estimate the range of representable + * values of the unform encoder using different statistics. Here + * rs = rangestat_arg */ + + // rangestat_arg. 
+ enum RangeStat { + RS_minmax, ///< [min - rs*(max-min), max + rs*(max-min)] + RS_meanstd, ///< [mean - std * rs, mean + std * rs] + RS_quantiles, ///< [Q(rs), Q(1-rs)] + RS_optim, ///< alternate optimization of reconstruction error + }; + + RangeStat rangestat = RS_minmax; + float rangestat_arg = 0; /// bits per scalar code - size_t bits; - - /// bytes per vector - size_t code_size; + size_t bits = 0; /// trained values (including the range) std::vector trained; @@ -44,226 +63,76 @@ struct ScalarQuantizer { /// updates internal values based on qtype and d void set_derived_sizes(); - void train(size_t n, const float* x); - - /// Used by an IVF index to train based on the residuals - void train_residual( - size_t n, - const float* x, - Index* quantizer, - bool by_residual, - bool verbose); + void train(size_t n, const float* x) override; /** Encode a set of vectors * * @param x vectors to encode, size n * d * @param codes output codes, size n * code_size */ - void compute_codes(const float* x, uint8_t* codes, size_t n) const; + void compute_codes(const float* x, uint8_t* codes, size_t n) const override; /** Decode a set of vectors * * @param codes codes to decode, size n * code_size * @param x output vectors, size n * d */ - void decode(const uint8_t* code, float* x, size_t n) const; + void decode(const uint8_t* code, float* x, size_t n) const override; /***************************************************** * Objects that provide methods for encoding/decoding, distance * computation and inverted list scanning *****************************************************/ - Quantizer* select_quantizer() const; + struct SQuantizer { + // encodes one vector. Assumes code is filled with 0s on input! + virtual void encode_vector(const float* x, uint8_t* code) const = 0; + virtual void decode_vector(const uint8_t* code, float* x) const = 0; - SQDistanceComputer* get_distance_computer( - MetricType metric = METRIC_L2) const; + virtual ~SQuantizer() {} + }; - InvertedListScanner* select_InvertedListScanner( - MetricType mt, - const Index* quantizer, - bool store_pairs, - bool by_residual = false) const; + SQuantizer* select_quantizer() const; - size_t cal_size() { - return sizeof(*this) + trained.size() * sizeof(float); - } -}; - -/******************************************************************* - * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object - * - * It is an InvertedListScanner, but is designed to work with - * IndexScalarQuantizer as well. - ********************************************************************/ - -template -struct IVFSQScannerIP : InvertedListScanner { - DCClass dc; - bool by_residual; - - float accu0; /// added to all distances - - IVFSQScannerIP( - int d, - const std::vector& trained, - size_t code_size, - bool store_pairs, - bool by_residual) - : dc(d, trained), by_residual(by_residual), accu0(0) { - this->store_pairs = store_pairs; - this->code_size = code_size; - } + struct SQDistanceComputer : FlatCodesDistanceComputer { + const float* q; - void set_query(const float* query) override { - dc.set_query(query); - } + SQDistanceComputer() : q(nullptr) {} - void set_list(idx_t list_no, float coarse_dis) override { - this->list_no = list_no; - accu0 = by_residual ? 
coarse_dis : 0; - } + virtual float query_to_code(const uint8_t* code) const = 0; - float distance_to_code(const uint8_t* code) const final { - return accu0 + dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k, - const BitsetView bitset = nullptr) const override { - size_t nup = 0; - - for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float accu = accu0 + dc.query_to_code(codes); - if (accu > simi[0]) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - minheap_replace_top(k, simi, idxi, accu, id); - nup++; - } - } - codes += code_size; + float distance_to_code(const uint8_t* code) final { + return query_to_code(code); } - return nup; - } - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float radius, - RangeQueryResult& res, - const BitsetView bitset = nullptr) const override { - for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float accu = accu0 + dc.query_to_code(codes); - if (accu > radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(accu, id); - } - } - codes += code_size; + virtual void query_to_codes_batch_4( + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3 + ) const { + dis0 = this->query_to_code(code_0); + dis1 = this->query_to_code(code_1); + dis2 = this->query_to_code(code_2); + dis3 = this->query_to_code(code_3); } - } -}; - -template -struct IVFSQScannerL2 : InvertedListScanner { - DCClass dc; - - bool by_residual; - const Index* quantizer; - const float* x; /// current query + }; - std::vector tmp; + SQDistanceComputer* get_distance_computer( + MetricType metric = METRIC_L2) const; - IVFSQScannerL2( - int d, - const std::vector& trained, - size_t code_size, + InvertedListScanner* select_InvertedListScanner( + MetricType mt, const Index* quantizer, bool store_pairs, - bool by_residual) - : dc(d, trained), - by_residual(by_residual), - quantizer(quantizer), - x(nullptr), - tmp(d) { - this->store_pairs = store_pairs; - this->code_size = code_size; - } - - void set_query(const float* query) override { - x = query; - if (!quantizer) { - dc.set_query(query); - } - } - - void set_list(idx_t list_no, float /*coarse_dis*/) override { - this->list_no = list_no; - if (by_residual) { - // shift of x_in wrt centroid - quantizer->compute_residual(x, tmp.data(), list_no); - dc.set_query(tmp.data()); - } else { - dc.set_query(x); - } - } - - float distance_to_code(const uint8_t* code) const final { - return dc.query_to_code(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k, - const BitsetView bitset = nullptr) const override { - size_t nup = 0; - for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = dc.query_to_code(codes); - if (dis < simi[0]) { - int64_t id = store_pairs ? 
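query_to_codes_batch_4, declared above, lets a distance computer evaluate four codes per call so specialized implementations can interleave their lookups; the default shown simply forwards to query_to_code. A caller-side sketch scanning a contiguous code array four vectors at a time (dc is assumed to come from ScalarQuantizer::get_distance_computer with its query already set):

    #include <faiss/impl/ScalarQuantizer.h>

    #include <cstddef>
    #include <cstdint>

    void scan_codes_batch4(
            const faiss::ScalarQuantizer::SQDistanceComputer& dc,
            const uint8_t* codes,   // n codes, code_size bytes each
            size_t code_size,
            size_t n,
            float* out_dis) {       // n output distances
        size_t i = 0;
        for (; i + 4 <= n; i += 4) {
            dc.query_to_codes_batch_4(
                    codes + (i + 0) * code_size,
                    codes + (i + 1) * code_size,
                    codes + (i + 2) * code_size,
                    codes + (i + 3) * code_size,
                    out_dis[i + 0],
                    out_dis[i + 1],
                    out_dis[i + 2],
                    out_dis[i + 3]);
        }
        for (; i < n; i++) { // tail
            out_dis[i] = dc.query_to_code(codes + i * code_size);
        }
    }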
lo_build(list_no, j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - } - codes += code_size; - } - return nup; - } + const IDSelector* sel, + bool by_residual = false) const; - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const float* code_norms, - const idx_t* ids, - float radius, - RangeQueryResult& res, - const BitsetView bitset = nullptr) const override { - for (size_t j = 0; j < list_size; j++) { - if (bitset.empty() || !bitset.test(ids[j])) { - float dis = dc.query_to_code(codes); - if (dis < radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(dis, id); - } - } - codes += code_size; - } - } + size_t cal_size() const; }; } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h index de255555c..bd85eda79 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec.h @@ -13,10 +13,19 @@ #include #include +#include +#include #include +#include + namespace faiss { +using QuantizerType = ScalarQuantizer::QuantizerType; +using RangeStat = ScalarQuantizer::RangeStat; +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; +using SQuantizer = ScalarQuantizer::SQuantizer; + /******************************************************************* * Codec: converts between values in [0, 1] and an index in a code * array. The "i" parameter is the vector component index (not byte @@ -97,7 +106,7 @@ template struct QuantizerTemplate {}; template -struct QuantizerTemplate : Quantizer { +struct QuantizerTemplate : SQuantizer { const size_t d; const float vmin, vdiff; @@ -134,7 +143,7 @@ struct QuantizerTemplate : Quantizer { }; template -struct QuantizerTemplate : Quantizer { +struct QuantizerTemplate : SQuantizer { const size_t d; const float *vmin, *vdiff; @@ -178,7 +187,7 @@ template struct QuantizerFP16 {}; template <> -struct QuantizerFP16<1> : Quantizer { +struct QuantizerFP16<1> : SQuantizer { const size_t d; QuantizerFP16(size_t d, const std::vector& /* unused */) : d(d) {} @@ -208,7 +217,7 @@ template struct Quantizer8bitDirect {}; template <> -struct Quantizer8bitDirect<1> : Quantizer { +struct Quantizer8bitDirect<1> : SQuantizer { const size_t d; Quantizer8bitDirect(size_t d, const std::vector& /* unused */) @@ -232,29 +241,29 @@ struct Quantizer8bitDirect<1> : Quantizer { }; template -Quantizer* select_quantizer_1( +SQuantizer* select_quantizer_1( QuantizerType qtype, size_t d, const std::vector& trained) { switch (qtype) { - case QuantizerType::QT_8bit: + case ScalarQuantizer::QT_8bit: return new QuantizerTemplate( d, trained); - case QuantizerType::QT_6bit: + case ScalarQuantizer::QT_6bit: return new QuantizerTemplate( d, trained); - case QuantizerType::QT_4bit: + case ScalarQuantizer::QT_4bit: return new QuantizerTemplate( d, trained); - case QuantizerType::QT_8bit_uniform: + case ScalarQuantizer::QT_8bit_uniform: return new QuantizerTemplate( d, trained); - case QuantizerType::QT_4bit_uniform: + case ScalarQuantizer::QT_4bit_uniform: return new QuantizerTemplate( d, trained); - case QuantizerType::QT_fp16: + case ScalarQuantizer::QT_fp16: return new QuantizerFP16(d, trained); - case QuantizerType::QT_8bit_direct: + case ScalarQuantizer::QT_8bit_direct: return new Quantizer8bitDirect(d, trained); } FAISS_THROW_MSG("unknown qtype"); @@ -376,17 +385,12 @@ struct DCTemplate : SQDistanceComputer { q = x; } - /// compute distance of vector i to current query - float 
operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - float symmetric_dis(idx_t i, idx_t j) override { return compute_code_distance( codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t* code) const final { + float query_to_code(const uint8_t* code) const override final { return compute_distance(q, code); } }; @@ -432,17 +436,12 @@ struct DistanceComputerByte : SQDistanceComputer { return compute_code_distance(tmp.data(), code); } - /// compute distance of vector i to current query - float operator()(idx_t i) final { - return query_to_code(codes + i * code_size); - } - float symmetric_dis(idx_t i, idx_t j) override { return compute_code_distance( codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t* code) const final { + float query_to_code(const uint8_t* code) const override final { return compute_code_distance(tmp.data(), code); } }; @@ -459,41 +458,41 @@ SQDistanceComputer* select_distance_computer( const std::vector& trained) { constexpr int SIMDWIDTH = Sim::simdwidth; switch (qtype) { - case QuantizerType::QT_8bit_uniform: + case ScalarQuantizer::QT_8bit_uniform: return new DCTemplate< QuantizerTemplate, Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit_uniform: + case ScalarQuantizer::QT_4bit_uniform: return new DCTemplate< QuantizerTemplate, Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_8bit: + case ScalarQuantizer::QT_8bit: return new DCTemplate< QuantizerTemplate, Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_6bit: + case ScalarQuantizer::QT_6bit: return new DCTemplate< QuantizerTemplate, Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_4bit: + case ScalarQuantizer::QT_4bit: return new DCTemplate< QuantizerTemplate, Sim, SIMDWIDTH>(d, trained); - case QuantizerType::QT_fp16: + case ScalarQuantizer::QT_fp16: return new DCTemplate, Sim, SIMDWIDTH>( d, trained); - case QuantizerType::QT_8bit_direct: + case ScalarQuantizer::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte(d, trained); } else { @@ -507,33 +506,63 @@ SQDistanceComputer* select_distance_computer( return nullptr; } -template -InvertedListScanner* sel2_InvertedListScanner( +template +InvertedListScanner* sel3_InvertedListScanner( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { if (DCClass::Sim::metric_type == METRIC_L2) { - return new IVFSQScannerL2( - sq->d, sq->trained, sq->code_size, quantizer, store_pairs, r); + return new IVFSQScannerL2( + sq->d, + sq->trained, + sq->code_size, + quantizer, + store_pairs, + sel, + r); } else if (DCClass::Sim::metric_type == METRIC_INNER_PRODUCT) { - return new IVFSQScannerIP( - sq->d, sq->trained, sq->code_size, store_pairs, r); + return new IVFSQScannerIP( + sq->d, sq->trained, sq->code_size, store_pairs, sel, r); } else { FAISS_THROW_MSG("unsupported metric type"); } } +template +InvertedListScanner* sel2_InvertedListScanner( + const ScalarQuantizer* sq, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool r) { + if (sel) { + if (store_pairs) { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } else { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } + } else { + return sel3_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); + } +} + template InvertedListScanner* sel12_InvertedListScanner( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { constexpr 
int SIMDWIDTH = Similarity::simdwidth; using QuantizerClass = QuantizerTemplate; using DCClass = DCTemplate; - return sel2_InvertedListScanner(sq, quantizer, store_pairs, r); + return sel2_InvertedListScanner( + sq, quantizer, store_pairs, sel, r); } template @@ -541,39 +570,40 @@ InvertedListScanner* sel1_InvertedListScanner( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { constexpr int SIMDWIDTH = Similarity::simdwidth; switch (sq->qtype) { - case QuantizerType::QT_8bit_uniform: + case ScalarQuantizer::QT_8bit_uniform: return sel12_InvertedListScanner( - sq, quantizer, store_pairs, r); - case QuantizerType::QT_4bit_uniform: + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_4bit_uniform: return sel12_InvertedListScanner( - sq, quantizer, store_pairs, r); - case QuantizerType::QT_8bit: + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_8bit: return sel12_InvertedListScanner( - sq, quantizer, store_pairs, r); - case QuantizerType::QT_4bit: + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_4bit: return sel12_InvertedListScanner( - sq, quantizer, store_pairs, r); - case QuantizerType::QT_6bit: + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_6bit: return sel12_InvertedListScanner( - sq, quantizer, store_pairs, r); - case QuantizerType::QT_fp16: + sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_fp16: return sel2_InvertedListScanner, Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, r); - case QuantizerType::QT_8bit_direct: + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner< DistanceComputerByte>( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); } else { return sel2_InvertedListScanner, Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, r); + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } } @@ -587,13 +617,14 @@ InvertedListScanner* sel0_InvertedListScanner( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool by_residual) { if (mt == METRIC_L2) { return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, by_residual); + sq, quantizer, store_pairs, sel, by_residual); } else if (mt == METRIC_INNER_PRODUCT) { return sel1_InvertedListScanner>( - sq, quantizer, store_pairs, by_residual); + sq, quantizer, store_pairs, sel, by_residual); } else { FAISS_THROW_MSG("unsupported metric type"); } diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h index 0632f3af3..fef86c9be 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx.h @@ -18,6 +18,11 @@ namespace faiss { +using QuantizerType = ScalarQuantizer::QuantizerType; +using RangeStat = ScalarQuantizer::RangeStat; +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; +using SQuantizer = ScalarQuantizer::SQuantizer; + /******************************************************************* * Codec: converts between values in [0, 1] and an index in a code * array. 
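The sel3/sel2 layering above folds two runtime facts, whether an IDSelector is present and whether store_pairs is set, into the compile-time use_sel parameter (0: no check, 1: check ids[j], 2: check the in-list offset j), so each scanner instantiation carries only the branch it actually needs. A minimal sketch of that dispatch idea, with hypothetical ScannerBase/ScannerImpl/make_scanner names that are not part of the patch:

#include <cstddef>
#include <cstdint>
#include <memory>

struct IDSelectorLike {                       // stand-in for faiss::IDSelector
    virtual bool is_member(std::int64_t id) const = 0;
    virtual ~IDSelectorLike() = default;
};

struct ScannerBase {
    virtual bool accept(std::int64_t id, std::size_t j) const = 0;
    virtual ~ScannerBase() = default;
};

// use_sel = 0: no check, 1: check ids[j], 2: check the in-list offset j
template <int use_sel>
struct ScannerImpl : ScannerBase {
    const IDSelectorLike* sel = nullptr;
    bool accept(std::int64_t id, std::size_t j) const override {
        if (use_sel == 0) return true;        // folds away per instantiation
        return sel->is_member(use_sel == 1 ? id : std::int64_t(j));
    }
};

// runtime flags -> compile-time use_sel, mirroring sel2_InvertedListScanner
inline std::unique_ptr<ScannerBase> make_scanner(
        const IDSelectorLike* sel, bool store_pairs) {
    if (sel) {
        if (store_pairs) {
            auto s = std::make_unique<ScannerImpl<2>>();
            s->sel = sel;
            return s;
        } else {
            auto s = std::make_unique<ScannerImpl<1>>();
            s->sel = sel;
            return s;
        }
    }
    return std::make_unique<ScannerImpl<0>>();
}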
The "i" parameter is the vector component index (not byte @@ -26,17 +31,14 @@ namespace faiss { struct Codec8bit_avx : public Codec8bit { static __m256 decode_8_components(const uint8_t* code, int i) { - uint64_t c8 = *(uint64_t*)(code + i); - __m128i c4lo = _mm_cvtepu8_epi32(_mm_set1_epi32(c8)); - __m128i c4hi = _mm_cvtepu8_epi32(_mm_set1_epi32(c8 >> 32)); - // __m256i i8 = _mm256_set_m128i(c4lo, c4hi); - __m256i i8 = _mm256_castsi128_si256(c4lo); - i8 = _mm256_insertf128_si256(i8, c4hi, 1); - __m256 f8 = _mm256_cvtepi32_ps(i8); - __m256 half = _mm256_set1_ps(0.5f); - f8 = _mm256_add_ps(f8, half); - __m256 one_255 = _mm256_set1_ps(1.f / 255.f); - return _mm256_mul_ps(f8, one_255); + const uint64_t c8 = *(uint64_t*)(code + i); + + const __m128i i8 = _mm_set1_epi64x(c8); + const __m256i i32 = _mm256_cvtepu8_epi32(i8); + const __m256 f8 = _mm256_cvtepi32_ps(i32); + const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f); + const __m256 one_255 = _mm256_set1_ps(1.f / 255.f); + return _mm256_fmadd_ps(f8, one_255, half_one_255); } }; @@ -85,6 +87,17 @@ struct Codec6bit_avx : public Codec6bit { } static __m256 decode_8_components(const uint8_t* code, int i) { + // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3); + // const uint32_t* data32 = (const uint32_t*)data16; + // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32); + // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL); + // const __m128i i8 = _mm_set1_epi64x(vext); + // const __m256i i32 = _mm256_cvtepi8_epi32(i8); + // const __m256 f8 = _mm256_cvtepi32_ps(i32); + // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f); + // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f); + // return _mm256_fmadd_ps(f8, one_255, half_one_255); + __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3)); __m256 f8 = _mm256_cvtepi32_ps(i8); // this could also be done with bit manipulations but it is @@ -196,7 +209,7 @@ struct Quantizer8bitDirect_avx<8> : public Quantizer8bitDirect<1> { }; template -Quantizer* select_quantizer_1_avx( +SQuantizer* select_quantizer_1_avx( QuantizerType qtype, size_t d, const std::vector& trained) { @@ -340,6 +353,7 @@ struct DCTemplate_avx : DCTemplate(d, trained) {} }; +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN template struct DCTemplate_avx : SQDistanceComputer { using Sim = Similarity; @@ -385,10 +399,50 @@ struct DCTemplate_avx : SQDistanceComputer { codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t* code) const final { + float query_to_code(const uint8_t* code) const override final { return compute_distance(q, code); } + + void query_to_codes_batch_4( + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3 + ) const override final { + + Similarity sim0(q); + Similarity sim1(q); + Similarity sim2(q); + Similarity sim3(q); + + sim0.begin_8(); + sim1.begin_8(); + sim2.begin_8(); + sim3.begin_8(); + + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < quant.d; i += 8) { + __m256 xi0 = quant.reconstruct_8_components(code_0, i); + __m256 xi1 = quant.reconstruct_8_components(code_1, i); + __m256 xi2 = quant.reconstruct_8_components(code_2, i); + __m256 xi3 = quant.reconstruct_8_components(code_3, i); + sim0.add_8_components(xi0); + sim1.add_8_components(xi1); + sim2.add_8_components(xi2); + sim3.add_8_components(xi3); + } + + dis0 = sim0.result_8(); + dis1 = sim1.result_8(); + dis2 = sim2.result_8(); + dis3 = 
sim3.result_8(); + } }; +FAISS_PRAGMA_IMPRECISE_FUNCTION_END /******************************************************************* * DistanceComputerByte: computes distances in the integer domain @@ -465,7 +519,7 @@ struct DistanceComputerByte_avx : SQDistanceComputer { codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t* code) const final { + float query_to_code(const uint8_t* code) const override final { return compute_code_distance(tmp.data(), code); } }; @@ -537,8 +591,9 @@ InvertedListScanner* sel2_InvertedListScanner_avx( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { - return sel2_InvertedListScanner(sq, quantizer, store_pairs, r); + return sel2_InvertedListScanner(sq, quantizer, store_pairs, sel, r); } template @@ -546,11 +601,12 @@ InvertedListScanner* sel12_InvertedListScanner_avx( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { constexpr int SIMDWIDTH = Similarity::simdwidth; using QuantizerClass = QuantizerTemplate_avx; using DCClass = DCTemplate_avx; - return sel2_InvertedListScanner_avx(sq, quantizer, store_pairs, r); + return sel2_InvertedListScanner_avx(sq, quantizer, store_pairs, sel, r); } template @@ -558,39 +614,40 @@ InvertedListScanner* sel1_InvertedListScanner_avx( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { constexpr int SIMDWIDTH = Similarity::simdwidth; switch (sq->qtype) { case QuantizerType::QT_8bit_uniform: return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit_uniform: return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit: return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit: return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_6bit: return sel12_InvertedListScanner_avx( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_fp16: return sel2_InvertedListScanner_avx, Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, r); + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner_avx< DistanceComputerByte_avx>( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); } else { return sel2_InvertedListScanner_avx, Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, r); + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } } @@ -604,13 +661,14 @@ InvertedListScanner* sel0_InvertedListScanner_avx( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool by_residual) { if (mt == METRIC_L2) { return sel1_InvertedListScanner_avx>( - sq, quantizer, store_pairs, by_residual); + sq, quantizer, store_pairs, sel, by_residual); } else if (mt == METRIC_INNER_PRODUCT) { return sel1_InvertedListScanner_avx>( - sq, quantizer, store_pairs, by_residual); + sq, quantizer, store_pairs, sel, by_residual); } else { FAISS_THROW_MSG("unsupported metric type"); } diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h index ded366299..ab0365a57 100644 --- 
a/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h @@ -18,6 +18,11 @@ namespace faiss { +using QuantizerType = ScalarQuantizer::QuantizerType; +using RangeStat = ScalarQuantizer::RangeStat; +using SQDistanceComputer = ScalarQuantizer::SQDistanceComputer; +using SQuantizer = ScalarQuantizer::SQuantizer; + /******************************************************************* * Codec: converts between values in [0, 1] and an index in a code * array. The "i" parameter is the vector component index (not byte @@ -26,18 +31,12 @@ namespace faiss { struct Codec8bit_avx512 : public Codec8bit_avx { static __m512 decode_16_components(const uint8_t *code, int i) { - uint64_t c8 = *(uint64_t*)(code + i); - __m256i c8lo = _mm256_cvtepu8_epi32(_mm_set1_epi64x(c8)); - c8 = *(uint64_t*)(code + i + 8); - __m256i c8hi = _mm256_cvtepu8_epi32(_mm_set1_epi64x(c8)); - // __m256i i8 = _mm256_set_m128i(c4lo, c4hi); - __m512i i16 = _mm512_castsi256_si512(c8lo); - i16 = _mm512_inserti32x8(i16, c8hi, 1); - __m512 f16 = _mm512_cvtepi32_ps(i16); - __m512 half = _mm512_set1_ps(0.5f); - f16 = _mm512_add_ps(f16, half); - __m512 one_255 = _mm512_set1_ps(1.f / 255.f); - return _mm512_mul_ps(f16, one_255); + const __m128i c8 = _mm_loadu_si128((const __m128i_u*)(code + i)); + const __m512i i32 = _mm512_cvtepu8_epi32(c8); + const __m512 f8 = _mm512_cvtepi32_ps(i32); + const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f); + const __m512 one_255 = _mm512_set1_ps(1.f / 255.f); + return _mm512_fmadd_ps(f8, one_255, half_one_255); } }; @@ -66,6 +65,26 @@ struct Codec4bit_avx512 : public Codec4bit_avx { struct Codec6bit_avx512 : public Codec6bit_avx { // TODO: can be optimized static __m512 decode_16_components(const uint8_t* code, int i) { + // // todo aguzhva: the following piece of code is very fast + // // for Intel chips. 
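The rewritten Codec8bit decoders above replace the add-then-multiply sequence (c + 0.5) * (1/255) with one fused multiply-add, c * (1/255) + 0.5/255, which is the same mapping of a byte code onto the midpoint of its cell in [0, 1] but with a single rounding step and one fewer instruction per vector. A scalar sketch of the equivalence (illustrative only, no intrinsics):

#include <cassert>
#include <cmath>

// maps a code c in [0, 255] to the midpoint of its cell in [0, 1]
float decode8_add_then_mul(int c) {
    return (float(c) + 0.5f) * (1.0f / 255.0f);
}

// the same mapping written as a single fused multiply-add, matching
// _mm256_fmadd_ps(f8, one_255, half_one_255) lane for lane
float decode8_fma(int c) {
    return std::fma(float(c), 1.0f / 255.0f, 0.5f / 255.0f);
}

int main() {
    for (int c = 0; c < 256; ++c) {
        // the two forms agree up to a rounding error of the float format
        assert(std::fabs(decode8_add_then_mul(c) - decode8_fma(c)) < 1e-6f);
    }
    return 0;
}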
AMD ones will be very slow unless Zen3+ + // + // const uint16_t* data16_0 = (const uint16_t*)(code + (i >> 2) * 3); + // const uint64_t* data64_0 = (const uint64_t*)data16_0; + // const uint64_t val_0 = *data64_0; + // const uint64_t vext_0 = _pdep_u64(val_0, 0x3F3F3F3F3F3F3F3FULL); + // + // const uint16_t* data16_1 = data16_0 + 3; + // const uint32_t* data32_1 = (const uint32_t*)data16_1; + // const uint64_t val_1 = *data32_1 + ((uint64_t)data16_1[2] << 32); + // const uint64_t vext_1 = _pdep_u64(val_1, 0x3F3F3F3F3F3F3F3FULL); + // + // const __m128i i8 = _mm_set_epi64x(vext_1, vext_0); + // const __m512i i32 = _mm512_cvtepi8_epi32(i8); + // const __m512 f8 = _mm512_cvtepi32_ps(i32); + // const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f); + // const __m512 one_255 = _mm512_set1_ps(1.f / 63.f); + // return _mm512_fmadd_ps(f8, one_255, half_one_255); + return _mm512_set_ps (decode_component(code, i + 15), decode_component(code, i + 14), @@ -212,7 +231,7 @@ struct Quantizer8bitDirect_avx512<16> : public Quantizer8bitDirect_avx<8> { }; template -Quantizer* select_quantizer_1_avx512( +SQuantizer* select_quantizer_1_avx512( QuantizerType qtype, size_t d, const std::vector& trained) { @@ -425,9 +444,65 @@ struct DCTemplate_avx512 : SQDistanceComputer { codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t * code) const { + float query_to_code(const uint8_t * code) const override final { return compute_distance(q, code); } + + void query_to_codes_batch_4( + const uint8_t* __restrict code_0, + const uint8_t* __restrict code_1, + const uint8_t* __restrict code_2, + const uint8_t* __restrict code_3, + float& dis0, + float& dis1, + float& dis2, + float& dis3 + ) const override final { + + Similarity sim0(q); + Similarity sim1(q); + Similarity sim2(q); + Similarity sim3(q); + + sim0.begin_16(); + sim1.begin_16(); + sim2.begin_16(); + sim3.begin_16(); + + FAISS_PRAGMA_IMPRECISE_LOOP + for (size_t i = 0; i < quant.d; i += 16) { + __m512 xi0 = quant.reconstruct_16_components(code_0, i); + __m512 xi1 = quant.reconstruct_16_components(code_1, i); + __m512 xi2 = quant.reconstruct_16_components(code_2, i); + __m512 xi3 = quant.reconstruct_16_components(code_3, i); + sim0.add_16_components(xi0); + sim1.add_16_components(xi1); + sim2.add_16_components(xi2); + sim3.add_16_components(xi3); + } + + dis0 = sim0.result_16(); + dis1 = sim1.result_16(); + dis2 = sim2.result_16(); + dis3 = sim3.result_16(); + } + + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) override { + query_to_codes_batch_4( + codes + idx0 * code_size, + codes + idx1 * code_size, + codes + idx2 * code_size, + codes + idx3 * code_size, + dis0, dis1, dis2, dis3); + } }; /******************************************************************* @@ -514,7 +589,7 @@ struct DistanceComputerByte_avx512 : SQDistanceComputer { codes + i * code_size, codes + j * code_size); } - float query_to_code(const uint8_t* code) const { + float query_to_code(const uint8_t* code) const override final { return compute_code_distance(tmp.data(), code); } }; @@ -586,8 +661,9 @@ InvertedListScanner* sel2_InvertedListScanner_avx512( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { - return sel2_InvertedListScanner(sq, quantizer, store_pairs, r); + return sel2_InvertedListScanner(sq, quantizer, store_pairs, sel, r); } template @@ -595,11 +671,12 @@ InvertedListScanner* 
sel12_InvertedListScanner_avx512( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { constexpr int SIMDWIDTH = Similarity::simdwidth; using QuantizerClass = QuantizerTemplate_avx512; using DCClass = DCTemplate_avx512; - return sel2_InvertedListScanner_avx512(sq, quantizer, store_pairs, r); + return sel2_InvertedListScanner_avx512(sq, quantizer, store_pairs, sel, r); } template @@ -607,39 +684,40 @@ InvertedListScanner* sel1_InvertedListScanner_avx512( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool r) { constexpr int SIMDWIDTH = Similarity::simdwidth; switch (sq->qtype) { case QuantizerType::QT_8bit_uniform: return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit_uniform: return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit: return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_4bit: return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_6bit: return sel12_InvertedListScanner_avx512( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_fp16: return sel2_InvertedListScanner_avx512, Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, r); + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case QuantizerType::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner_avx512< DistanceComputerByte_avx512>( - sq, quantizer, store_pairs, r); + sq, quantizer, store_pairs, sel, r); } else { return sel2_InvertedListScanner_avx512, Similarity, - SIMDWIDTH>>(sq, quantizer, store_pairs, r); + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); } } @@ -653,13 +731,14 @@ InvertedListScanner* sel0_InvertedListScanner_avx512( const ScalarQuantizer* sq, const Index* quantizer, bool store_pairs, + const IDSelector* sel, bool by_residual) { if (mt == METRIC_L2) { return sel1_InvertedListScanner_avx512>( - sq, quantizer, store_pairs, by_residual); + sq, quantizer, store_pairs, sel, by_residual); } else if (mt == METRIC_INNER_PRODUCT) { return sel1_InvertedListScanner_avx512>( - sq, quantizer, store_pairs, by_residual); + sq, quantizer, store_pairs, sel, by_residual); } else { FAISS_THROW_MSG("unsupported metric type"); } diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.cpp index 5763c9287..f1f712065 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.cpp @@ -15,9 +15,9 @@ namespace faiss { ********************************************************************/ /* SSE */ -SQDistanceComputer* sq_get_distance_computer_ref( +ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_ref( MetricType metric, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained) { if (metric == METRIC_L2) { @@ -27,8 +27,8 @@ SQDistanceComputer* sq_get_distance_computer_ref( } } -Quantizer* sq_select_quantizer_ref( - QuantizerType qtype, +ScalarQuantizer::SQuantizer* sq_select_quantizer_ref( + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained) { return select_quantizer_1<1>(qtype, dim, trained); @@ -40,9 +40,10 @@ InvertedListScanner* 
sq_select_inverted_list_scanner_ref( const Index* quantizer, size_t dim, bool store_pairs, + const IDSelector* sel, bool by_residual) { return sel0_InvertedListScanner<1>( - mt, sq, quantizer, store_pairs, by_residual); + mt, sq, quantizer, store_pairs, sel, by_residual); } } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.h index 05de480a4..d6bc76c9d 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC.h @@ -14,14 +14,14 @@ namespace faiss { -SQDistanceComputer* sq_get_distance_computer_ref( +ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_ref( MetricType metric, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained); -Quantizer* sq_select_quantizer_ref( - QuantizerType qtype, +ScalarQuantizer::SQuantizer* sq_select_quantizer_ref( + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained); @@ -31,6 +31,7 @@ InvertedListScanner* sq_select_inverted_list_scanner_ref( const Index* quantizer, size_t dim, bool store_pairs, + const IDSelector* sel, bool by_residual); } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.cpp index 70924b717..4ed54eef9 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.cpp @@ -14,9 +14,9 @@ namespace faiss { * ScalarQuantizer Distance Computer ********************************************************************/ -SQDistanceComputer* sq_get_distance_computer_avx( +ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx( MetricType metric, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained) { if (metric == METRIC_L2) { @@ -38,8 +38,8 @@ SQDistanceComputer* sq_get_distance_computer_avx( } } -Quantizer* sq_select_quantizer_avx( - QuantizerType qtype, +ScalarQuantizer::SQuantizer* sq_select_quantizer_avx( + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained) { if (dim % 8 == 0) { @@ -55,13 +55,14 @@ InvertedListScanner* sq_select_inverted_list_scanner_avx( const Index *quantizer, size_t dim, bool store_pairs, + const IDSelector* sel, bool by_residual) { if (dim % 8 == 0) { return sel0_InvertedListScanner_avx<8>( - mt, sq, quantizer, store_pairs, by_residual); + mt, sq, quantizer, store_pairs, sel, by_residual); } else { return sel0_InvertedListScanner_avx<1>( - mt, sq, quantizer, store_pairs, by_residual); + mt, sq, quantizer, store_pairs, sel, by_residual); } } diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.h index 8cd8ef3e2..7e2359e17 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx.h @@ -14,14 +14,14 @@ namespace faiss { -SQDistanceComputer* sq_get_distance_computer_avx( +ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx( MetricType metric, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained); -Quantizer* sq_select_quantizer_avx( - QuantizerType qtype, +ScalarQuantizer::SQuantizer* sq_select_quantizer_avx( + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained); @@ -31,6 +31,7 @@ InvertedListScanner* sq_select_inverted_list_scanner_avx( const Index* quantizer, size_t dim, bool 
store_pairs, + const IDSelector* sel, bool by_residual); } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.cpp index 5f838e020..de7df582a 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.cpp @@ -14,9 +14,9 @@ namespace faiss { * ScalarQuantizer Distance Computer ********************************************************************/ -SQDistanceComputer* sq_get_distance_computer_avx512( +ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx512( MetricType metric, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained) { if (metric == METRIC_L2) { @@ -44,8 +44,8 @@ SQDistanceComputer* sq_get_distance_computer_avx512( } } -Quantizer* sq_select_quantizer_avx512( - QuantizerType qtype, +ScalarQuantizer::SQuantizer* sq_select_quantizer_avx512( + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained) { if (dim % 16 == 0) { @@ -63,16 +63,17 @@ InvertedListScanner* sq_select_inverted_list_scanner_avx512( const Index* quantizer, size_t dim, bool store_pairs, + const IDSelector* sel, bool by_residual) { if (dim % 16 == 0) { return sel0_InvertedListScanner_avx512<16>( - mt, sq, quantizer, store_pairs, by_residual); + mt, sq, quantizer, store_pairs, sel, by_residual); } else if (dim % 8 == 0) { return sel0_InvertedListScanner_avx512<8>( - mt, sq, quantizer, store_pairs, by_residual); + mt, sq, quantizer, store_pairs, sel, by_residual); } else { return sel0_InvertedListScanner_avx512<1>( - mt, sq, quantizer, store_pairs, by_residual); + mt, sq, quantizer, store_pairs, sel, by_residual); } } diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.h index 70878cdfe..95daea160 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerDC_avx512.h @@ -14,14 +14,14 @@ namespace faiss { -SQDistanceComputer* sq_get_distance_computer_avx512( +ScalarQuantizer::SQDistanceComputer* sq_get_distance_computer_avx512( MetricType metric, - QuantizerType qtype, + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained); -Quantizer* sq_select_quantizer_avx512( - QuantizerType qtype, +ScalarQuantizer::SQuantizer* sq_select_quantizer_avx512( + ScalarQuantizer::QuantizerType qtype, size_t dim, const std::vector& trained); @@ -31,6 +31,7 @@ InvertedListScanner* sq_select_inverted_list_scanner_avx512( const Index* quantizer, size_t dim, bool store_pairs, + const IDSelector* sel, bool by_residual); } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.cpp b/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.cpp index daa7696ce..ccbd21411 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.cpp +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.cpp @@ -8,123 +8,13 @@ #include #include -#ifdef __SSE__ -#include -#endif - -#include #include #include +#include namespace faiss { -#ifdef USE_F16C - -uint16_t encode_fp16(float x) { - __m128 xf = _mm_set1_ps(x); - __m128i xi = - _mm_cvtps_ph(xf, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - return _mm_cvtsi128_si32(xi) & 0xffff; -} - -float decode_fp16(uint16_t x) { - __m128i xi = _mm_set1_epi16(x); - __m128 xf = _mm_cvtph_ps(xi); - return _mm_cvtss_f32(xf); -} - -#else - -// non-intrinsic FP16 <-> FP32 code adapted from -// 
https://github.com/ispc/ispc/blob/master/stdlib.ispc - -float floatbits(uint32_t x) { - void* xptr = &x; - return *(float*)xptr; -} - -uint32_t intbits(float f) { - void* fptr = &f; - return *(uint32_t*)fptr; -} - -uint16_t encode_fp16(float f) { - // via Fabian "ryg" Giesen. - // https://gist.github.com/2156668 - uint32_t sign_mask = 0x80000000u; - int32_t o; - - uint32_t fint = intbits(f); - uint32_t sign = fint & sign_mask; - fint ^= sign; - - // NOTE all the integer compares in this function can be safely - // compiled into signed compares since all operands are below - // 0x80000000. Important if you want fast straight SSE2 code (since - // there's no unsigned PCMPGTD). - - // Inf or NaN (all exponent bits set) - // NaN->qNaN and Inf->Inf - // unconditional assignment here, will override with right value for - // the regular case below. - uint32_t f32infty = 255u << 23; - o = (fint > f32infty) ? 0x7e00u : 0x7c00u; - - // (De)normalized number or zero - // update fint unconditionally to save the blending; we don't need it - // anymore for the Inf/NaN case anyway. - - const uint32_t round_mask = ~0xfffu; - const uint32_t magic = 15u << 23; - - // Shift exponent down, denormalize if necessary. - // NOTE This represents half-float denormals using single - // precision denormals. The main reason to do this is that - // there's no shift with per-lane variable shifts in SSE*, which - // we'd otherwise need. It has some funky side effects though: - // - This conversion will actually respect the FTZ (Flush To Zero) - // flag in MXCSR - if it's set, no half-float denormals will be - // generated. I'm honestly not sure whether this is good or - // bad. It's definitely interesting. - // - If the underlying HW doesn't support denormals (not an issue - // with Intel CPUs, but might be a problem on GPUs or PS3 SPUs), - // you will always get flush-to-zero behavior. This is bad, - // unless you're on a CPU where you don't care. - // - Denormals tend to be slow. FP32 denormals are rare in - // practice outside of things like recursive filters in DSP - - // not a typical half-float application. Whether FP16 denormals - // are rare in practice, I don't know. Whatever slow path your - // HW may or may not have for denormals, this may well hit it. - float fscale = floatbits(fint & round_mask) * floatbits(magic); - fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u)); - int32_t fint2 = intbits(fscale) - round_mask; - - if (fint < f32infty) - o = fint2 >> 13; // Take the bits! - - return (o | (sign >> 16)); -} - -float decode_fp16(uint16_t h) { - // https://gist.github.com/2144712 - // Fabian "ryg" Giesen. - - const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift - - int32_t o = ((int32_t)(h & 0x7fffu)) << 13; // exponent/mantissa bits - int32_t exp = shifted_exp & o; // just the exponent - o += (int32_t)(127 - 15) << 23; // exponent adjust - - int32_t infnan_val = o + ((int32_t)(128 - 16) << 23); - int32_t zerodenorm_val = - intbits(floatbits(o + (1u << 23)) - floatbits(113u << 23)); - int32_t reg_val = (exp == 0) ? zerodenorm_val : o; - - int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16; - return floatbits(((exp == shifted_exp) ? 
infnan_val : reg_val) | sign_bit); -} - -#endif +using RangeStat = ScalarQuantizer::RangeStat; /******************************************************************* * Quantizer range training @@ -172,7 +62,7 @@ void train_Uniform( } else if (rs == RangeStat::RS_quantiles) { std::vector x_copy(n); memcpy(x_copy.data(), x, n * sizeof(*x)); - // TODO just do a qucikselect + // TODO just do a quickselect std::sort(x_copy.begin(), x_copy.end()); int o = int(rs_arg * n); if (o < 0) diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.h index 83436d4af..057fe3370 100644 --- a/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.h +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerOp.h @@ -7,66 +7,20 @@ #pragma once -#include +#include namespace faiss { -typedef Index::idx_t idx_t; - -enum QuantizerType { - QT_8bit, ///< 8 bits per component - QT_4bit, ///< 4 bits per component - QT_8bit_uniform, ///< same, shared range for all dimensions - QT_4bit_uniform, - QT_fp16, - QT_8bit_direct, ///< fast indexing of uint8s - QT_6bit, ///< 6 bits per component -}; - -/** The uniform encoder can estimate the range of representable - * values of the unform encoder using different statistics. Here - * rs = rangestat_arg */ - -// rangestat_arg. -enum RangeStat { - RS_minmax, ///< [min - rs*(max-min), max + rs*(max-min)] - RS_meanstd, ///< [mean - std * rs, mean + std * rs] - RS_quantiles, ///< [Q(rs), Q(1-rs)] - RS_optim, ///< alternate optimization of reconstruction error -}; - -struct Quantizer { - // encodes one vector. Assumes code is filled with 0s on input! - virtual void encode_vector(const float* x, uint8_t* code) const = 0; - virtual void decode_vector(const uint8_t* code, float* x) const = 0; - - virtual ~Quantizer() {} -}; - -struct SQDistanceComputer : DistanceComputer { - const float* q; - const uint8_t* codes; - size_t code_size; - - SQDistanceComputer() : q(nullptr), codes(nullptr), code_size(0) {} - - virtual float query_to_code(const uint8_t* code) const = 0; -}; - -extern uint16_t encode_fp16(float x); - -extern float decode_fp16(uint16_t x); - -extern void train_Uniform( - RangeStat rs, +void train_Uniform( + ScalarQuantizer::RangeStat rs, float rs_arg, idx_t n, int k, const float* x, std::vector& trained); -extern void train_NonUniform( - RangeStat rs, +void train_NonUniform( + ScalarQuantizer::RangeStat rs, float rs_arg, idx_t n, int d, diff --git a/thirdparty/faiss/faiss/impl/ScalarQuantizerScanner.h b/thirdparty/faiss/faiss/impl/ScalarQuantizerScanner.h new file mode 100644 index 000000000..fd6ad6005 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/ScalarQuantizerScanner.h @@ -0,0 +1,278 @@ +#pragma once + +#include +//#include + +//struct InvertedListScanner; +//struct IDSelector; + +#include + +namespace faiss { + +/******************************************************************* + * IndexScalarQuantizer/IndexIVFScalarQuantizer scanner object + * + * It is an InvertedListScanner, but is designed to work with + * IndexScalarQuantizer as well. 
+ ********************************************************************/ + +/* use_sel = 0: don't check selector + * = 1: check on ids[j] + * = 2: check in j directly (normally ids is nullptr and store_pairs) + */ + +template +struct IVFSQScannerIP : InvertedListScanner { + DCClass dc; + bool by_residual; + + float accu0; /// added to all distances + + IVFSQScannerIP( + int d, + const std::vector& trained, + size_t code_size, + bool store_pairs, + const IDSelector* sel, + bool by_residual) + : dc(d, trained), by_residual(by_residual), accu0(0) { + this->store_pairs = store_pairs; + this->sel = sel; + this->code_size = code_size; + } + + void set_query(const float* query) override { + dc.set_query(query); + } + + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + accu0 = by_residual ? coarse_dis : 0; + } + + float distance_to_code(const uint8_t* code) const final { + return accu0 + dc.query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* code_norms, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + // todo aguzhva: upgrade + float accu = accu0 + dc.query_to_code(codes); + + if (accu > simi[0]) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + minheap_replace_top(k, simi, idxi, accu, id); + nup++; + } + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* code_norms, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + continue; + } + + // todo aguzhva: upgrade + float accu = accu0 + dc.query_to_code(codes); + if (accu > radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(accu, id); + } + } + } +}; + +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. 
+ // void Apply(const float dis, const size_t idx); + typename Apply, + typename DCClass> +void fvec_L2sqr_ny_scalar_if( + const DCClass& dc, + const uint8_t* __restrict codes, + const size_t code_size, + const size_t ny, + Pred pred, + Apply apply) { + // compute a distance from the query to 1 element + auto distance1 = [&dc, codes, code_size](const size_t idx) { + return dc.query_to_code(codes + idx * code_size); + }; + + // compute distances from the query to 4 elements + auto distance4 = [&dc, codes, code_size](const std::array indices, std::array& dis) { + dc.query_to_codes_batch_4( + codes + indices[0] * code_size, + codes + indices[1] * code_size, + codes + indices[2] * code_size, + codes + indices[3] * code_size, + dis[0], + dis[1], + dis[2], + dis[3] + ); + }; + + auto remapper = [](const size_t idx) { return idx; }; + + fvec_distance_ny_if( + ny, + pred, + distance1, + distance4, + remapper, + apply + ); +} + +/* use_sel = 0: don't check selector + * = 1: check on ids[j] + * = 2: check in j directly (normally ids is nullptr and store_pairs) + */ + +template +struct IVFSQScannerL2 : InvertedListScanner { + DCClass dc; + + bool by_residual; + const Index* quantizer; + const float* x; /// current query + + std::vector tmp; + + IVFSQScannerL2( + int d, + const std::vector& trained, + size_t code_size, + const Index* quantizer, + bool store_pairs, + const IDSelector* sel, + bool by_residual) + : dc(d, trained), + by_residual(by_residual), + quantizer(quantizer), + x(nullptr), + tmp(d) { + this->store_pairs = store_pairs; + this->sel = sel; + this->code_size = code_size; + } + + void set_query(const float* query) override { + x = query; + if (!quantizer) { + dc.set_query(query); + } + } + + void set_list(idx_t list_no, float) override { + this->list_no = list_no; + if (by_residual) { + // shift of x_in wrt centroid + quantizer->compute_residual(x, tmp.data(), list_no); + dc.set_query(tmp.data()); + } else { + dc.set_query(x); + } + } + + float distance_to_code(const uint8_t* code) const final { + return dc.query_to_code(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const float* code_norms, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + + // // baseline + // for (size_t j = 0; j < list_size; j++, codes += code_size) { + // if (use_sel && !sel->is_member(use_sel == 1 ? ids[j] : j)) { + // continue; + // } + // + // float dis = dc.query_to_code(codes); + // + // if (dis < simi[0]) { + // int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + // maxheap_replace_top(k, simi, idxi, dis, id); + // nup++; + // } + // } + + // the lambda that filters acceptable elements. + auto filter = + [&](const size_t j) { return (!use_sel || sel->is_member(use_sel == 1 ? ids[j] : j)); }; + + // the lambda that applies a filtered element. + auto apply = + [&](const float dis, const size_t j) { + if (dis < simi[0]) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + }; + + // compute distances + fvec_L2sqr_ny_scalar_if(dc, codes, code_size, list_size, filter, apply); + + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const float* code_norms, + const idx_t* ids, + float radius, + RangeQueryResult& res) const override { + for (size_t j = 0; j < list_size; j++, codes += code_size) { + if (use_sel && !sel->is_member(use_sel == 1 ? 
ids[j] : j)) { + continue; + } + + // todo aguzhva: upgrade + float dis = dc.query_to_code(codes); + if (dis < radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + res.add(dis, id); + } + } + } +}; + +} \ No newline at end of file diff --git a/thirdparty/faiss/faiss/impl/ThreadedIndex-inl.h b/thirdparty/faiss/faiss/impl/ThreadedIndex-inl.h index e1c91c16c..1dbbb6727 100644 --- a/thirdparty/faiss/faiss/impl/ThreadedIndex-inl.h +++ b/thirdparty/faiss/faiss/impl/ThreadedIndex-inl.h @@ -18,7 +18,7 @@ ThreadedIndex::ThreadedIndex(bool threaded) template ThreadedIndex::ThreadedIndex(int d, bool threaded) - : IndexT(d), own_fields(false), isThreaded_(threaded) {} + : IndexT(d), isThreaded_(threaded) {} template ThreadedIndex::~ThreadedIndex() { @@ -35,7 +35,7 @@ ThreadedIndex::~ThreadedIndex() { FAISS_ASSERT(!(bool)p.second); } - if (own_fields) { + if (own_indices) { delete p.first; } } @@ -102,7 +102,7 @@ void ThreadedIndex::removeIndex(IndexT* index) { indices_.erase(it); onAfterRemoveIndex(index); - if (own_fields) { + if (own_indices) { delete index; } diff --git a/thirdparty/faiss/faiss/impl/ThreadedIndex.h b/thirdparty/faiss/faiss/impl/ThreadedIndex.h index 2f5b8a36d..eaea6d933 100644 --- a/thirdparty/faiss/faiss/impl/ThreadedIndex.h +++ b/thirdparty/faiss/faiss/impl/ThreadedIndex.h @@ -29,7 +29,7 @@ class ThreadedIndex : public IndexT { /// WARNING: once an index is added, it becomes unsafe to touch it from any /// other thread than that on which is managing it, until we are shut /// down. Use runOnIndex to perform work on it instead. - void addIndex(IndexT* index); + virtual void addIndex(IndexT* index); /// Remove an index that is managed by ourselves. /// This will flush all pending work on that index, and then shut @@ -52,17 +52,17 @@ class ThreadedIndex : public IndexT { } /// Returns the i-th sub-index - IndexT* at(int i) { + IndexT* at(size_t i) { return indices_[i].first; } /// Returns the i-th sub-index (const version) - const IndexT* at(int i) const { + const IndexT* at(size_t i) const { return indices_[i].first; } /// Whether or not we are responsible for deleting our contained indices - bool own_fields; + bool own_indices = false; protected: /// Called just after an index is added diff --git a/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h b/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h new file mode 100644 index 000000000..0aa1535b2 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/code_distance/code_distance-avx2.h @@ -0,0 +1,529 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
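In the new scanner header above, scan_codes no longer iterates the list by hand; it passes a filter lambda (the IDSelector check) and an apply lambda (the heap update) to a helper that evaluates distances either one at a time or four at a time through query_to_codes_batch_4. A stripped-down sketch of that control flow under the assumption that the distance computer is an ordinary callable; none of these names are the helpers used by the patch:

#include <array>
#include <cstddef>

template <typename Pred, typename Dist1, typename Dist4, typename Apply>
void scan_filtered(std::size_t ny, Pred pred, Dist1 dist1, Dist4 dist4, Apply apply) {
    std::array<std::size_t, 4> idx;
    std::size_t pending = 0;

    for (std::size_t j = 0; j < ny; j++) {
        if (!pred(j)) continue;               // IDSelector check, if any
        idx[pending++] = j;
        if (pending == 4) {                   // enough survivors: batched kernel
            std::array<float, 4> dis;
            dist4(idx, dis);
            for (std::size_t k = 0; k < 4; k++) apply(dis[k], idx[k]);
            pending = 0;
        }
    }
    for (std::size_t k = 0; k < pending; k++) // tail: one code at a time
        apply(dist1(idx[k]), idx[k]);
}

In the patch, the apply lambda then calls maxheap_replace_top only when the new distance beats the current worst of the k results.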
+ */ + +#pragma once + +#ifdef __AVX2__ + +#include + +#include + +#include +#include + +namespace { + +inline float horizontal_sum(const __m128 v) { + const __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 3, 2)); + const __m128 v1 = _mm_add_ps(v, v0); + __m128 v2 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0, 0, 0, 1)); + const __m128 v3 = _mm_add_ps(v1, v2); + return _mm_cvtss_f32(v3); +} + +// Computes a horizontal sum over an __m256 register +inline float horizontal_sum(const __m256 v) { + const __m128 v0 = + _mm_add_ps(_mm256_castps256_ps128(v), _mm256_extractf128_ps(v, 1)); + return horizontal_sum(v0); +} + +// processes a single code for M=4, ksub=256, nbits=8 +float inline distance_single_code_avx2_pqdecoder8_m4( + // precomputed distances, layout (4, 256) + const float* sim_table, + const uint8_t* code) { + float result = 0; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + const __m128i vksub = _mm_set1_epi32(ksub); + __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); + offsets_0 = _mm_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m128 partialSum; + + // load 4 uint8 values + const __m128i mm1 = _mm_cvtsi32_si128(*((const int32_t*)code)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m128i idx1 = _mm_cvtepu8_epi32(mm1); + + // add offsets + const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m128 collected = + _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSum = collected; + } + + // horizontal sum for partialSum + result = horizontal_sum(partialSum); + return result; +} + +// processes a single code for M=8, ksub=256, nbits=8 +float inline distance_single_code_avx2_pqdecoder8_m8( + // precomputed distances, layout (8, 256) + const float* sim_table, + const uint8_t* code) { + float result = 0; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSum; + + // load 8 uint8 values + const __m128i mm1 = _mm_loadu_si64((const __m128i_u*)code); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = + _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSum = collected; + } + + // horizontal sum for partialSum + result = horizontal_sum(partialSum); + return result; +} + +// processes four codes for M=4, ksub=256, nbits=8 +inline void distance_four_codes_avx2_pqdecoder8_m4( + // precomputed distances, layout (4, 256) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + constexpr intptr_t N = 4; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + // process 8 values + const __m128i vksub = _mm_set1_epi32(ksub); + __m128i offsets_0 = _mm_setr_epi32(0, 1, 2, 3); + offsets_0 = _mm_mullo_epi32(offsets_0, vksub); + + // accumulators of 
partial sums + __m128 partialSums[N]; + + // load 4 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_cvtsi32_si128(*((const int32_t*)code0)); + mm1[1] = _mm_cvtsi32_si128(*((const int32_t*)code1)); + mm1[2] = _mm_cvtsi32_si128(*((const int32_t*)code2)); + mm1[3] = _mm_cvtsi32_si128(*((const int32_t*)code3)); + + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m128i idx1 = _mm_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m128i indices_to_read_from = _mm_add_epi32(idx1, offsets_0); + + // gather 4 values, similar to 4 operations of tab[idx] + __m128 collected = + _mm_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = collected; + } + + // horizontal sum for partialSum + result0 = horizontal_sum(partialSums[0]); + result1 = horizontal_sum(partialSums[1]); + result2 = horizontal_sum(partialSums[2]); + result3 = horizontal_sum(partialSums[3]); +} + +// processes four codes for M=8, ksub=256, nbits=8 +inline void distance_four_codes_avx2_pqdecoder8_m8( + // precomputed distances, layout (8, 256) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + constexpr intptr_t N = 4; + + const float* tab = sim_table; + constexpr size_t ksub = 1 << 8; + + // process 8 values + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSums[N]; + + // load 8 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si64((const __m128i_u*)code0); + mm1[1] = _mm_loadu_si64((const __m128i_u*)code1); + mm1[2] = _mm_loadu_si64((const __m128i_u*)code2); + mm1[3] = _mm_loadu_si64((const __m128i_u*)code3); + + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = + _mm256_i32gather_ps(tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = collected; + } + + // horizontal sum for partialSum + result0 = horizontal_sum(partialSums[0]); + result1 = horizontal_sum(partialSums[1]); + result2 = horizontal_sum(partialSums[2]); + result3 = horizontal_sum(partialSums[3]); +} + +} // namespace + +namespace faiss { + +template +typename std::enable_if::value, float>:: + type inline distance_single_code_avx2( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code) { + // default implementation + return distance_single_code_generic(M, nbits, sim_table, code); +} + +template +typename std::enable_if::value, float>:: + type inline distance_single_code_avx2( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + const uint8_t* code) { + if (M == 4) { + return distance_single_code_avx2_pqdecoder8_m4(sim_table, code); + } + if (M == 8) { + return 
distance_single_code_avx2_pqdecoder8_m8(sim_table, code); + } + + float result = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSum = _mm256_setzero_ps(); + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + const __m128i mm1 = _mm_loadu_si128((const __m128i_u*)(code + m)); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + + // move high 8 uint8 to low ones + const __m128i mm2 = _mm_unpackhi_epi64(mm1, _mm_setzero_si128()); + { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + tab += ksub * 8; + + // collect partial sums + partialSum = _mm256_add_ps(partialSum, collected); + } + } + + // horizontal sum for partialSum + result += horizontal_sum(partialSum); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 decoder(code + m, nbits); + + for (; m < M; m++) { + result += tab[decoder.decode()]; + tab += ksub; + } + } + + return result; +} + +template +typename std::enable_if::value, void>:: + type + distance_four_codes_avx2( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + distance_four_codes_generic( + M, + nbits, + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); +} + +// Combines 4 operations of distance_single_code() +template +typename std::enable_if::value, void>::type +distance_four_codes_avx2( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + if (M == 4) { + distance_four_codes_avx2_pqdecoder8_m4( + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); + return; + } + if (M == 8) { + distance_four_codes_avx2_pqdecoder8_m8( + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); + return; + } + + result0 = 0; + result1 
= 0; + result2 = 0; + result3 = 0; + constexpr size_t ksub = 1 << 8; + + size_t m = 0; + const size_t pqM16 = M / 16; + + constexpr intptr_t N = 4; + + const float* tab = sim_table; + + if (pqM16 > 0) { + // process 16 values per loop + const __m256i vksub = _mm256_set1_epi32(ksub); + __m256i offsets_0 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + offsets_0 = _mm256_mullo_epi32(offsets_0, vksub); + + // accumulators of partial sums + __m256 partialSums[N]; + for (intptr_t j = 0; j < N; j++) { + partialSums[j] = _mm256_setzero_ps(); + } + + // loop + for (m = 0; m < pqM16 * 16; m += 16) { + // load 16 uint8 values + __m128i mm1[N]; + mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); + mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); + mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); + mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + + // process first 8 codes + for (intptr_t j = 0; j < N; j++) { + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm1[j]); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + tab += ksub * 8; + + // process next 8 codes + for (intptr_t j = 0; j < N; j++) { + // move high 8 uint8 to low ones + const __m128i mm2 = + _mm_unpackhi_epi64(mm1[j], _mm_setzero_si128()); + + // convert uint8 values (low part of __m128i) to int32 + // values + const __m256i idx1 = _mm256_cvtepu8_epi32(mm2); + + // add offsets + const __m256i indices_to_read_from = + _mm256_add_epi32(idx1, offsets_0); + + // gather 8 values, similar to 8 operations of tab[idx] + __m256 collected = _mm256_i32gather_ps( + tab, indices_to_read_from, sizeof(float)); + + // collect partial sums + partialSums[j] = _mm256_add_ps(partialSums[j], collected); + } + + tab += ksub * 8; + } + + // horizontal sum for partialSum + result0 += horizontal_sum(partialSums[0]); + result1 += horizontal_sum(partialSums[1]); + result2 += horizontal_sum(partialSums[2]); + result3 += horizontal_sum(partialSums[3]); + } + + // + if (m < M) { + // process leftovers + PQDecoder8 decoder0(code0 + m, nbits); + PQDecoder8 decoder1(code1 + m, nbits); + PQDecoder8 decoder2(code2 + m, nbits); + PQDecoder8 decoder3(code3 + m, nbits); + for (; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } + } +} + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/impl/code_distance/code_distance-generic.h b/thirdparty/faiss/faiss/impl/code_distance/code_distance-generic.h new file mode 100644 index 000000000..31f18d277 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/code_distance/code_distance-generic.h @@ -0,0 +1,81 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { + +/// Returns the distance to a single code. 
+template +inline float distance_single_code_generic( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + PQDecoderT decoder(code, nbits); + const size_t ksub = 1 << nbits; + + const float* tab = sim_table; + float result = 0; + + for (size_t m = 0; m < M; m++) { + result += tab[decoder.decode()]; + tab += ksub; + } + + return result; +} + +/// Combines 4 operations of distance_single_code() +/// General-purpose version. +template +inline void distance_four_codes_generic( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + PQDecoderT decoder0(code0, nbits); + PQDecoderT decoder1(code1, nbits); + PQDecoderT decoder2(code2, nbits); + PQDecoderT decoder3(code3, nbits); + const size_t ksub = 1 << nbits; + + const float* tab = sim_table; + result0 = 0; + result1 = 0; + result2 = 0; + result3 = 0; + + for (size_t m = 0; m < M; m++) { + result0 += tab[decoder0.decode()]; + result1 += tab[decoder1.decode()]; + result2 += tab[decoder2.decode()]; + result3 += tab[decoder3.decode()]; + tab += ksub; + } +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/code_distance/code_distance.h b/thirdparty/faiss/faiss/impl/code_distance/code_distance.h new file mode 100644 index 000000000..7cdf932f5 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/code_distance/code_distance.h @@ -0,0 +1,133 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +// This directory contains functions to compute a distance +// from a given PQ code to a query vector, given that the +// distances to a query vector for pq.M codebooks are precomputed. +// +// The code was originally the part of IndexIVFPQ.cpp. +// The baseline implementation can be found in +// code_distance-generic.h, distance_single_code_generic(). + +// The reason for this somewhat unusual structure is that +// custom implementations may need to fall off to generic +// implementation in certain cases. So, say, avx2 header file +// needs to reference the generic header file. This is +// why the names of the functions for custom implementations +// have this _generic or _avx2 suffix. 
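+//
+// A minimal usage sketch of the dispatching functions declared below
+// (`pq`, `q` and `code` are assumed here: a trained faiss::ProductQuantizer
+// with 8-bit codes, a pointer to one query vector, and a pointer to one
+// encoded vector of pq.code_size bytes):
+//
+//   std::vector<float> sim_table(pq.M * pq.ksub);
+//   pq.compute_distance_table(q, sim_table.data());   // layout (M, ksub)
+//   float dis = distance_single_code<PQDecoder8>(
+//           pq.M, pq.nbits, sim_table.data(), code);
+//
+// distance_four_codes() evaluates 4 codes against the same sim_table in a
+// single pass over the table, which is what the batched AVX2 kernel exploits.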
+ +#ifdef __AVX2__ + +#include + +namespace faiss { + +template +inline float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + return distance_single_code_avx2(M, nbits, sim_table, code); +} + +template +inline void distance_four_codes( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + distance_four_codes_avx2( + M, + nbits, + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); +} + +} // namespace faiss + +#else + +#include + +namespace faiss { + +template +inline float distance_single_code( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // the code + const uint8_t* code) { + return distance_single_code_generic(M, nbits, sim_table, code); +} + +template +inline void distance_four_codes( + // number of subquantizers + const size_t M, + // number of bits per quantization index + const size_t nbits, + // precomputed distances, layout (M, ksub) + const float* sim_table, + // codes + const uint8_t* __restrict code0, + const uint8_t* __restrict code1, + const uint8_t* __restrict code2, + const uint8_t* __restrict code3, + // computed distances + float& result0, + float& result1, + float& result2, + float& result3) { + distance_four_codes_generic( + M, + nbits, + sim_table, + code0, + code1, + code2, + code3, + result0, + result1, + result2, + result3); +} + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/impl/code_distance/code_distance_avx512.h b/thirdparty/faiss/faiss/impl/code_distance/code_distance_avx512.h new file mode 100644 index 000000000..296e0df1b --- /dev/null +++ b/thirdparty/faiss/faiss/impl/code_distance/code_distance_avx512.h @@ -0,0 +1,102 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// // // AVX-512 version. It is not used, but let it be for the future +// // // needs. 
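+//
+// The commented-out kernel below mirrors distance_four_codes_avx2(): it
+// consumes 16 subquantizers per iteration with a single 16-wide
+// _mm512_i32gather_ps (instead of two 8-wide _mm256_i32gather_ps gathers),
+// reduces with _mm512_reduce_add_ps, and still handles the remaining
+// subquantizers with the scalar PQDecoder loop.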
+// // template +// // typename std::enable_if<(std::is_same::value), void>:: +// // type distance_four_codes( +// // const uint8_t* __restrict code0, +// // const uint8_t* __restrict code1, +// // const uint8_t* __restrict code2, +// // const uint8_t* __restrict code3, +// // float& result0, +// // float& result1, +// // float& result2, +// // float& result3 +// // ) const { +// // result0 = 0; +// // result1 = 0; +// // result2 = 0; +// // result3 = 0; + +// // size_t m = 0; +// // const size_t pqM16 = pq.M / 16; + +// // constexpr intptr_t N = 4; + +// // const float* tab = sim_table; + +// // if (pqM16 > 0) { +// // // process 16 values per loop +// // const __m512i ksub = _mm512_set1_epi32(pq.ksub); +// // __m512i offsets_0 = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, +// // 8, 9, 10, 11, 12, 13, 14, 15); +// // offsets_0 = _mm512_mullo_epi32(offsets_0, ksub); + +// // // accumulators of partial sums +// // __m512 partialSums[N]; +// // for (intptr_t j = 0; j < N; j++) { +// // partialSums[j] = _mm512_setzero_ps(); +// // } + +// // // loop +// // for (m = 0; m < pqM16 * 16; m += 16) { +// // // load 16 uint8 values +// // __m128i mm1[N]; +// // mm1[0] = _mm_loadu_si128((const __m128i_u*)(code0 + m)); +// // mm1[1] = _mm_loadu_si128((const __m128i_u*)(code1 + m)); +// // mm1[2] = _mm_loadu_si128((const __m128i_u*)(code2 + m)); +// // mm1[3] = _mm_loadu_si128((const __m128i_u*)(code3 + m)); + +// // // process first 8 codes +// // for (intptr_t j = 0; j < N; j++) { +// // // convert uint8 values (low part of __m128i) to int32 +// // // values +// // const __m512i idx1 = _mm512_cvtepu8_epi32(mm1[j]); + +// // // add offsets +// // const __m512i indices_to_read_from = +// // _mm512_add_epi32(idx1, offsets_0); + +// // // gather 8 values, similar to 8 operations of +// // // tab[idx] +// // __m512 collected = +// // _mm512_i32gather_ps( +// // indices_to_read_from, tab, sizeof(float)); + +// // // collect partial sums +// // partialSums[j] = _mm512_add_ps(partialSums[j], +// // collected); +// // } +// // tab += pq.ksub * 16; + +// // } + +// // // horizontal sum for partialSum +// // result0 += _mm512_reduce_add_ps(partialSums[0]); +// // result1 += _mm512_reduce_add_ps(partialSums[1]); +// // result2 += _mm512_reduce_add_ps(partialSums[2]); +// // result3 += _mm512_reduce_add_ps(partialSums[3]); +// // } + +// // // +// // if (m < pq.M) { +// // // process leftovers +// // PQDecoder decoder0(code0 + m, pq.nbits); +// // PQDecoder decoder1(code1 + m, pq.nbits); +// // PQDecoder decoder2(code2 + m, pq.nbits); +// // PQDecoder decoder3(code3 + m, pq.nbits); +// // for (; m < pq.M; m++) { +// // result0 += tab[decoder0.decode()]; +// // result1 += tab[decoder1.decode()]; +// // result2 += tab[decoder2.decode()]; +// // result3 += tab[decoder3.decode()]; +// // tab += pq.ksub; +// // } +// // } +// // } diff --git a/thirdparty/faiss/faiss/impl/index_read.cpp b/thirdparty/faiss/faiss/impl/index_read.cpp index f9c376e10..ac111eeb1 100644 --- a/thirdparty/faiss/faiss/impl/index_read.cpp +++ b/thirdparty/faiss/faiss/impl/index_read.cpp @@ -28,22 +28,27 @@ #include #include +#include #include #include #include #include +#include #include +#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -72,7 +77,7 @@ static void read_index_header(Index* idx, IOReader* f) { READ1(dummy8); uint32_t dummy32; READ1(dummy32); - Index::idx_t dummy; + idx_t dummy; READ1(dummy); READ1(idx->is_trained); @@ -211,9 
+216,9 @@ InvertedLists* read_InvertedLists(IOReader* f, int io_flags) { size_t n; READ1(n); #ifdef USE_GPU - ails->pin_readonly_ids = std::make_shared(n * sizeof(InvertedLists::idx_t)); + ails->pin_readonly_ids = std::make_shared(n * sizeof(idx_t)); ails->pin_readonly_codes = std::make_shared(n * code_size * sizeof(uint8_t)); - READANDCHECK((InvertedLists::idx_t *) ails->pin_readonly_ids->data, n); + READANDCHECK((idx_t *) ails->pin_readonly_ids->data, n); READANDCHECK((uint8_t *) ails->pin_readonly_codes->data, n * code_size); #else ails->readonly_ids.resize(n); @@ -363,7 +368,7 @@ InvertedLists *read_InvertedLists_nm(IOReader *f, int io_flags) { OnDiskInvertedLists::List & l = ails->lists[i]; l.size = l.capacity = sizes[i]; l.offset = o; - o += l.size * (sizeof(OnDiskInvertedLists::idx_t) + + o += l.size * (sizeof(idx_t) + ails->code_size); } FAISS_THROW_IF_NOT(o <= ails->totsize); @@ -415,17 +420,33 @@ static void read_AdditiveQuantizer(AdditiveQuantizer* aq, IOReader* f) { READ1(aq->norm_min); READ1(aq->norm_max); if (aq->search_type == AdditiveQuantizer::ST_norm_cqint8 || - aq->search_type == AdditiveQuantizer::ST_norm_cqint4) { + aq->search_type == AdditiveQuantizer::ST_norm_cqint4 || + aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || + aq->search_type == AdditiveQuantizer::ST_norm_rq2x4) { READXBVECTOR(aq->qnorm.codes); + aq->qnorm.ntotal = aq->qnorm.codes.size() / 4; + aq->qnorm.update_permutation(); } + + if (aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || + aq->search_type == AdditiveQuantizer::ST_norm_rq2x4) { + READVECTOR(aq->norm_tabs); + } + aq->set_derived_values(); } -static void read_ResidualQuantizer(ResidualQuantizer* rq, IOReader* f) { +static void read_ResidualQuantizer( + ResidualQuantizer* rq, + IOReader* f, + int io_flags) { read_AdditiveQuantizer(rq, f); READ1(rq->train_type); READ1(rq->max_beam_size); - if (!(rq->train_type & ResidualQuantizer::Skip_codebook_tables)) { + if ((rq->train_type & ResidualQuantizer::Skip_codebook_tables) || + (io_flags & IO_FLAG_SKIP_PRECOMPUTE_TABLE)) { + // don't precompute the tables + } else { rq->compute_codebook_tables(); } } @@ -445,6 +466,38 @@ static void read_LocalSearchQuantizer(LocalSearchQuantizer* lsq, IOReader* f) { READ1(lsq->update_codebooks_with_double); } +static void read_ProductAdditiveQuantizer( + ProductAdditiveQuantizer* paq, + IOReader* f) { + read_AdditiveQuantizer(paq, f); + READ1(paq->nsplits); +} + +static void read_ProductResidualQuantizer( + ProductResidualQuantizer* prq, + IOReader* f, + int io_flags) { + read_ProductAdditiveQuantizer(prq, f); + + for (size_t i = 0; i < prq->nsplits; i++) { + auto rq = new ResidualQuantizer(); + read_ResidualQuantizer(rq, f, io_flags); + prq->quantizers.push_back(rq); + } +} + +static void read_ProductLocalSearchQuantizer( + ProductLocalSearchQuantizer* plsq, + IOReader* f) { + read_ProductAdditiveQuantizer(plsq, f); + + for (size_t i = 0; i < plsq->nsplits; i++) { + auto lsq = new LocalSearchQuantizer(); + read_LocalSearchQuantizer(lsq, f); + plsq->quantizers.push_back(lsq); + } +} + static void read_ScalarQuantizer(ScalarQuantizer* ivsc, IOReader* f) { READ1(ivsc->qtype); READ1(ivsc->rangestat); @@ -489,6 +542,7 @@ static void read_NSG(NSG* nsg, IOReader* f) { graph = std::make_shared>(N, R); std::fill_n(graph->data, N * R, EMPTY_ID); + int size = 0; for (int i = 0; i < N; i++) { for (int j = 0; j < R + 1; j++) { @@ -496,6 +550,7 @@ static void read_NSG(NSG* nsg, IOReader* f) { READ1(id); if (id != EMPTY_ID) { graph->at(i, j) = id; + size += 1; } else { 
break; } @@ -503,6 +558,21 @@ static void read_NSG(NSG* nsg, IOReader* f) { } } +static void read_NNDescent(NNDescent* nnd, IOReader* f) { + READ1(nnd->ntotal); + READ1(nnd->d); + READ1(nnd->K); + READ1(nnd->S); + READ1(nnd->R); + READ1(nnd->L); + READ1(nnd->iter); + READ1(nnd->search_L); + READ1(nnd->random_seed); + READ1(nnd->has_built); + + READVECTOR(nnd->final_graph); +} + ProductQuantizer* read_ProductQuantizer(const char* fname) { FileIOReader reader(fname); return read_ProductQuantizer(&reader); @@ -523,7 +593,6 @@ static void read_direct_map(DirectMap* dm, IOReader* f) { dm->type = (DirectMap::Type)maintain_direct_map; READVECTOR(dm->array); if (dm->type == DirectMap::Hashtable) { - using idx_t = Index::idx_t; std::vector> v; READVECTOR(v); std::unordered_map& map = dm->hashtable; @@ -546,7 +615,7 @@ static void read_direct_map(DirectMap* dm, IOReader* f) { static void read_ivf_header( IndexIVF* ivf, IOReader* f, - std::vector>* ids = nullptr) { + std::vector>* ids = nullptr) { read_index_header(ivf, f); READ1(ivf->nlist); READ1(ivf->nprobe); @@ -563,7 +632,7 @@ static void read_ivf_header( // used for legacy formats static ArrayInvertedLists* set_array_invlist( IndexIVF* ivf, - std::vector>& ids) { + std::vector>& ids) { ArrayInvertedLists* ail = new ArrayInvertedLists(ivf->nlist, ivf->code_size); std::swap(ail->ids, ids); @@ -580,7 +649,7 @@ static IndexIVFPQ* read_ivfpq(IOReader* f, uint32_t h, int io_flags) { : nullptr; IndexIVFPQ* ivpq = ivfpqr ? ivfpqr : new IndexIVFPQ(); - std::vector> ids; + std::vector> ids; read_ivf_header(ivpq, f, legacy ? &ids : nullptr); READ1(ivpq->by_residual); READ1(ivpq->code_size); @@ -595,10 +664,14 @@ static IndexIVFPQ* read_ivfpq(IOReader* f, uint32_t h, int io_flags) { } if (ivpq->is_trained) { - // precomputed table not stored. It is cheaper to recompute it + // precomputed table not stored. It is cheaper to recompute it. + // precompute_table() may be disabled with a flag. ivpq->use_precomputed_table = 0; - if (ivpq->by_residual) - ivpq->precompute_table(); + if (ivpq->by_residual) { + if ((io_flags & IO_FLAG_SKIP_PRECOMPUTE_TABLE) == 0) { + ivpq->precompute_table(); + } + } if (ivfpqr) { read_ProductQuantizer(&ivfpqr->refine_pq, f); READVECTOR(ivfpqr->refine_codes); @@ -678,15 +751,18 @@ Index* read_index(IOReader* f, int io_flags) { READ1(idxp->encode_signs); READ1(idxp->polysemous_ht); } - // Old versoins of PQ all had metric_type set to INNER_PRODUCT + // Old versions of PQ all had metric_type set to INNER_PRODUCT // when they were in fact using L2. 
Therefore, we force metric type // to L2 when the old format is detected if (h == fourcc("IxPQ") || h == fourcc("IxPo")) { idxp->metric_type = METRIC_L2; } + + // the following "if" block is Knowhere-specific if (h == fourcc("IxPq")) { idxp->pq.compute_sdc_table (); } + idx = idxp; } else if (h == fourcc("IxRQ") || h == fourcc("IxRq")) { IndexResidualQuantizer* idxr = new IndexResidualQuantizer(); @@ -694,7 +770,7 @@ Index* read_index(IOReader* f, int io_flags) { if (h == fourcc("IxRQ")) { read_ResidualQuantizer_old(&idxr->rq, f); } else { - read_ResidualQuantizer(&idxr->rq, f); + read_ResidualQuantizer(&idxr->rq, f, io_flags); } READ1(idxr->code_size); READVECTOR(idxr->codes); @@ -706,16 +782,134 @@ Index* read_index(IOReader* f, int io_flags) { READ1(idxr->code_size); READVECTOR(idxr->codes); idx = idxr; + } else if (h == fourcc("IxPR")) { + auto idxpr = new IndexProductResidualQuantizer(); + read_index_header(idxpr, f); + read_ProductResidualQuantizer(&idxpr->prq, f, io_flags); + READ1(idxpr->code_size); + READVECTOR(idxpr->codes); + idx = idxpr; + } else if (h == fourcc("IxPL")) { + auto idxpl = new IndexProductLocalSearchQuantizer(); + read_index_header(idxpl, f); + read_ProductLocalSearchQuantizer(&idxpl->plsq, f); + READ1(idxpl->code_size); + READVECTOR(idxpl->codes); + idx = idxpl; } else if (h == fourcc("ImRQ")) { ResidualCoarseQuantizer* idxr = new ResidualCoarseQuantizer(); read_index_header(idxr, f); - read_ResidualQuantizer(&idxr->rq, f); + read_ResidualQuantizer(&idxr->rq, f, io_flags); READ1(idxr->beam_factor); + if (io_flags & IO_FLAG_SKIP_PRECOMPUTE_TABLE) { + // then we force the beam factor to -1 + // which skips the table precomputation. + idxr->beam_factor = -1; + } idxr->set_beam_factor(idxr->beam_factor); idx = idxr; + } else if ( + h == fourcc("ILfs") || h == fourcc("IRfs") || h == fourcc("IPRf") || + h == fourcc("IPLf")) { + bool is_LSQ = h == fourcc("ILfs"); + bool is_RQ = h == fourcc("IRfs"); + bool is_PLSQ = h == fourcc("IPLf"); + + IndexAdditiveQuantizerFastScan* idxaqfs; + if (is_LSQ) { + idxaqfs = new IndexLocalSearchQuantizerFastScan(); + } else if (is_RQ) { + idxaqfs = new IndexResidualQuantizerFastScan(); + } else if (is_PLSQ) { + idxaqfs = new IndexProductLocalSearchQuantizerFastScan(); + } else { + idxaqfs = new IndexProductResidualQuantizerFastScan(); + } + read_index_header(idxaqfs, f); + + if (is_LSQ) { + read_LocalSearchQuantizer((LocalSearchQuantizer*)idxaqfs->aq, f); + } else if (is_RQ) { + read_ResidualQuantizer( + (ResidualQuantizer*)idxaqfs->aq, f, io_flags); + } else if (is_PLSQ) { + read_ProductLocalSearchQuantizer( + (ProductLocalSearchQuantizer*)idxaqfs->aq, f); + } else { + read_ProductResidualQuantizer( + (ProductResidualQuantizer*)idxaqfs->aq, f, io_flags); + } + + READ1(idxaqfs->implem); + READ1(idxaqfs->bbs); + READ1(idxaqfs->qbs); + + READ1(idxaqfs->M); + READ1(idxaqfs->nbits); + READ1(idxaqfs->ksub); + READ1(idxaqfs->code_size); + READ1(idxaqfs->ntotal2); + READ1(idxaqfs->M2); + + READ1(idxaqfs->rescale_norm); + READ1(idxaqfs->norm_scale); + READ1(idxaqfs->max_train_points); + + READVECTOR(idxaqfs->codes); + idx = idxaqfs; + } else if ( + h == fourcc("IVLf") || h == fourcc("IVRf") || h == fourcc("NPLf") || + h == fourcc("NPRf")) { + bool is_LSQ = h == fourcc("IVLf"); + bool is_RQ = h == fourcc("IVRf"); + bool is_PLSQ = h == fourcc("NPLf"); + + IndexIVFAdditiveQuantizerFastScan* ivaqfs; + if (is_LSQ) { + ivaqfs = new IndexIVFLocalSearchQuantizerFastScan(); + } else if (is_RQ) { + ivaqfs = new IndexIVFResidualQuantizerFastScan(); + } 
else if (is_PLSQ) { + ivaqfs = new IndexIVFProductLocalSearchQuantizerFastScan(); + } else { + ivaqfs = new IndexIVFProductResidualQuantizerFastScan(); + } + read_ivf_header(ivaqfs, f); + + if (is_LSQ) { + read_LocalSearchQuantizer((LocalSearchQuantizer*)ivaqfs->aq, f); + } else if (is_RQ) { + read_ResidualQuantizer((ResidualQuantizer*)ivaqfs->aq, f, io_flags); + } else if (is_PLSQ) { + read_ProductLocalSearchQuantizer( + (ProductLocalSearchQuantizer*)ivaqfs->aq, f); + } else { + read_ProductResidualQuantizer( + (ProductResidualQuantizer*)ivaqfs->aq, f, io_flags); + } + + READ1(ivaqfs->by_residual); + READ1(ivaqfs->implem); + READ1(ivaqfs->bbs); + READ1(ivaqfs->qbs); + + READ1(ivaqfs->M); + READ1(ivaqfs->nbits); + READ1(ivaqfs->ksub); + READ1(ivaqfs->code_size); + READ1(ivaqfs->qbs2); + READ1(ivaqfs->M2); + + READ1(ivaqfs->rescale_norm); + READ1(ivaqfs->norm_scale); + READ1(ivaqfs->max_train_points); + + read_InvertedLists(ivaqfs, f, io_flags); + ivaqfs->init_code_packer(); + idx = ivaqfs; } else if (h == fourcc("IvFl") || h == fourcc("IvFL")) { // legacy IndexIVFFlat* ivfl = new IndexIVFFlat(); - std::vector> ids; + std::vector> ids; read_ivf_header(ivfl, f, &ids); ivfl->code_size = ivfl->d * sizeof(float); ArrayInvertedLists* ail = set_array_invlist(ivfl, ids); @@ -738,10 +932,10 @@ Index* read_index(IOReader* f, int io_flags) { read_ivf_header(ivfl, f); ivfl->code_size = ivfl->d * sizeof(float); { - std::vector tab; + std::vector tab; READVECTOR(tab); for (long i = 0; i < tab.size(); i += 2) { - std::pair pair(tab[i], tab[i + 1]); + std::pair pair(tab[i], tab[i + 1]); ivfl->instances.insert(pair); } } @@ -784,7 +978,7 @@ Index* read_index(IOReader* f, int io_flags) { idx = idxl; } else if (h == fourcc("IvSQ")) { // legacy IndexIVFScalarQuantizer* ivsc = new IndexIVFScalarQuantizer(); - std::vector> ids; + std::vector> ids; read_ivf_header(ivsc, f, &ids); read_ScalarQuantizer(&ivsc->sq, f); READ1(ivsc->code_size); @@ -804,20 +998,34 @@ Index* read_index(IOReader* f, int io_flags) { } read_InvertedLists(ivsc, f, io_flags); idx = ivsc; - } else if (h == fourcc("IwLS") || h == fourcc("IwRQ")) { + } else if ( + h == fourcc("IwLS") || h == fourcc("IwRQ") || h == fourcc("IwPL") || + h == fourcc("IwPR")) { bool is_LSQ = h == fourcc("IwLS"); + bool is_RQ = h == fourcc("IwRQ"); + bool is_PLSQ = h == fourcc("IwPL"); IndexIVFAdditiveQuantizer* iva; if (is_LSQ) { iva = new IndexIVFLocalSearchQuantizer(); - } else { + } else if (is_RQ) { iva = new IndexIVFResidualQuantizer(); + } else if (is_PLSQ) { + iva = new IndexIVFProductLocalSearchQuantizer(); + } else { + iva = new IndexIVFProductResidualQuantizer(); } read_ivf_header(iva, f); READ1(iva->code_size); if (is_LSQ) { read_LocalSearchQuantizer((LocalSearchQuantizer*)iva->aq, f); + } else if (is_RQ) { + read_ResidualQuantizer((ResidualQuantizer*)iva->aq, f, io_flags); + } else if (is_PLSQ) { + read_ProductLocalSearchQuantizer( + (ProductLocalSearchQuantizer*)iva->aq, f); } else { - read_ResidualQuantizer((ResidualQuantizer*)iva->aq, f); + read_ProductResidualQuantizer( + (ProductResidualQuantizer*)iva->aq, f, io_flags); } READ1(iva->by_residual); READ1(iva->use_precomputed_table); @@ -840,7 +1048,22 @@ Index* read_index(IOReader* f, int io_flags) { h == fourcc("IvPQ") || h == fourcc("IvQR") || h == fourcc("IwPQ") || h == fourcc("IwQR")) { idx = read_ivfpq(f, h, io_flags); - + } else if (h == fourcc("IwIQ")) { + auto* indep = new IndexIVFIndependentQuantizer(); + indep->own_fields = true; + read_index_header(indep, f); + indep->quantizer = 
read_index(f, io_flags); + bool has_vt; + READ1(has_vt); + if (has_vt) { + indep->vt = read_VectorTransform(f); + } + indep->index_ivf = dynamic_cast(read_index(f, io_flags)); + FAISS_THROW_IF_NOT(indep->index_ivf); + if (auto index_ivfpq = dynamic_cast(indep->index_ivf)) { + READ1(index_ivfpq->use_precomputed_table); + } + idx = indep; } else if (h == fourcc("IxPT")) { IndexPreTransform* ixpt = new IndexPreTransform(); ixpt->own_fields = true; @@ -935,8 +1158,15 @@ Index* read_index(IOReader* f, int io_flags) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } idx = idxhnsw; - } else if (h == fourcc("INSf")) { - IndexNSG* idxnsg = new IndexNSGFlat(); + } else if ( + h == fourcc("INSf") || h == fourcc("INSp") || h == fourcc("INSs")) { + IndexNSG* idxnsg; + if (h == fourcc("INSf")) + idxnsg = new IndexNSGFlat(); + if (h == fourcc("INSp")) + idxnsg = new IndexNSGPQ(); + if (h == fourcc("INSs")) + idxnsg = new IndexNSGSQ(); read_index_header(idxnsg, f); READ1(idxnsg->GK); READ1(idxnsg->build_type); @@ -948,6 +1178,13 @@ Index* read_index(IOReader* f, int io_flags) { idxnsg->storage = read_index(f, io_flags); idxnsg->own_fields = true; idx = idxnsg; + } else if (h == fourcc("INNf")) { + IndexNNDescent* idxnnd = new IndexNNDescentFlat(); + read_index_header(idxnnd, f); + read_NNDescent(&idxnnd->nndescent, f); + idxnnd->storage = read_index(f, io_flags); + idxnnd->own_fields = true; + idx = idxnnd; } else if (h == fourcc("IPfs")) { IndexPQFastScan* idxpqfs = new IndexPQFastScan(); read_index_header(idxpqfs, f); @@ -958,6 +1195,13 @@ Index* read_index(IOReader* f, int io_flags) { READ1(idxpqfs->ntotal2); READ1(idxpqfs->M2); READVECTOR(idxpqfs->codes); + + const auto& pq = idxpqfs->pq; + idxpqfs->M = pq.M; + idxpqfs->nbits = pq.nbits; + idxpqfs->ksub = (1 << pq.nbits); + idxpqfs->code_size = pq.code_size; + idx = idxpqfs; } else if (h == fourcc("IwPf")) { @@ -969,14 +1213,38 @@ Index* read_index(IOReader* f, int io_flags) { READ1(ivpq->M2); READ1(ivpq->implem); READ1(ivpq->qbs2); - READ1(ivpq->is_cosine_); - if (ivpq->is_cosine_) { + READ1(ivpq->is_cosine); + if (ivpq->is_cosine) { READVECTOR(ivpq->norms); } read_ProductQuantizer(&ivpq->pq, f); read_InvertedLists(ivpq, f, io_flags); ivpq->precompute_table(); + + const auto& pq = ivpq->pq; + ivpq->M = pq.M; + ivpq->nbits = pq.nbits; + ivpq->ksub = (1 << pq.nbits); + ivpq->code_size = pq.code_size; + ivpq->init_code_packer(); + idx = ivpq; + } else if (h == fourcc("IRMf")) { + IndexRowwiseMinMax* imm = new IndexRowwiseMinMax(); + read_index_header(imm, f); + + imm->index = read_index(f, io_flags); + imm->own_fields = true; + + idx = imm; + } else if (h == fourcc("IRMh")) { + IndexRowwiseMinMaxFP16* imm = new IndexRowwiseMinMaxFP16(); + read_index_header(imm, f); + + imm->index = read_index(f, io_flags); + imm->own_fields = true; + + idx = imm; } else { FAISS_THROW_FMT( "Index type 0x%08x (\"%s\") not recognized", @@ -1047,7 +1315,7 @@ static void read_index_binary_header(IndexBinary* idx, IOReader* f) { static void read_binary_ivf_header( IndexBinaryIVF* ivf, IOReader* f, - std::vector>* ids = nullptr) { + std::vector>* ids = nullptr) { read_index_binary_header(ivf, f); READ1(ivf->nlist); READ1(ivf->nprobe); diff --git a/thirdparty/faiss/faiss/impl/index_write.cpp b/thirdparty/faiss/faiss/impl/index_write.cpp index fb69bdd41..bac1bc59b 100644 --- a/thirdparty/faiss/faiss/impl/index_write.cpp +++ b/thirdparty/faiss/faiss/impl/index_write.cpp @@ -28,22 +28,27 @@ #include #include +#include #include #include #include #include +#include #include 
+#include #include #include #include #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -90,7 +95,7 @@ static void write_index_header(const Index* idx, IOWriter* f) { WRITE1(dummy8); uint32_t dummy32 = 0; WRITE1(dummy32); - Index::idx_t dummy = 0; + idx_t dummy = 0; WRITE1(dummy); WRITE1(idx->is_trained); @@ -180,9 +185,16 @@ static void write_AdditiveQuantizer(const AdditiveQuantizer* aq, IOWriter* f) { WRITE1(aq->norm_min); WRITE1(aq->norm_max); if (aq->search_type == AdditiveQuantizer::ST_norm_cqint8 || - aq->search_type == AdditiveQuantizer::ST_norm_cqint4) { + aq->search_type == AdditiveQuantizer::ST_norm_cqint4 || + aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || + aq->search_type == AdditiveQuantizer::ST_norm_rq2x4) { WRITEXBVECTOR(aq->qnorm.codes); } + + if (aq->search_type == AdditiveQuantizer::ST_norm_lsq2x4 || + aq->search_type == AdditiveQuantizer::ST_norm_rq2x4) { + WRITEVECTOR(aq->norm_tabs); + } } static void write_ResidualQuantizer(const ResidualQuantizer* rq, IOWriter* f) { @@ -208,6 +220,33 @@ static void write_LocalSearchQuantizer( WRITE1(lsq->update_codebooks_with_double); } +static void write_ProductAdditiveQuantizer( + const ProductAdditiveQuantizer* paq, + IOWriter* f) { + write_AdditiveQuantizer(paq, f); + WRITE1(paq->nsplits); +} + +static void write_ProductResidualQuantizer( + const ProductResidualQuantizer* prq, + IOWriter* f) { + write_ProductAdditiveQuantizer(prq, f); + for (const auto aq : prq->quantizers) { + auto rq = dynamic_cast(aq); + write_ResidualQuantizer(rq, f); + } +} + +static void write_ProductLocalSearchQuantizer( + const ProductLocalSearchQuantizer* plsq, + IOWriter* f) { + write_ProductAdditiveQuantizer(plsq, f); + for (const auto aq : plsq->quantizers) { + auto lsq = dynamic_cast(aq); + write_LocalSearchQuantizer(lsq, f); + } +} + static void write_ScalarQuantizer(const ScalarQuantizer* ivsc, IOWriter* f) { WRITE1(ivsc->qtype); WRITE1(ivsc->rangestat); @@ -457,11 +496,13 @@ static void write_NSG(const NSG* nsg, IOWriter* f) { FAISS_THROW_IF_NOT(K == nsg->R); FAISS_THROW_IF_NOT(true == graph->own_fields); + int size = 0; for (int i = 0; i < N; i++) { for (int j = 0; j < K; j++) { int id = graph->at(i, j); if (id != EMPTY_ID) { WRITE1(id); + size += 1; } else { break; } @@ -470,13 +511,27 @@ static void write_NSG(const NSG* nsg, IOWriter* f) { } } +static void write_NNDescent(const NNDescent* nnd, IOWriter* f) { + WRITE1(nnd->ntotal); + WRITE1(nnd->d); + WRITE1(nnd->K); + WRITE1(nnd->S); + WRITE1(nnd->R); + WRITE1(nnd->L); + WRITE1(nnd->iter); + WRITE1(nnd->search_L); + WRITE1(nnd->random_seed); + WRITE1(nnd->has_built); + + WRITEVECTOR(nnd->final_graph); +} + static void write_direct_map(const DirectMap* dm, IOWriter* f) { char maintain_direct_map = (char)dm->type; // for backwards compatibility with bool WRITE1(maintain_direct_map); WRITEVECTOR(dm->array); if (dm->type == DirectMap::Hashtable) { - using idx_t = Index::idx_t; std::vector> v; const std::unordered_map& map = dm->hashtable; v.resize(map.size()); @@ -497,6 +552,8 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) { write_index_header(ivf, f); WRITE1(ivf->nlist); WRITE1(ivf->nprobe); + // subclasses write by_residual (some of them support only one setting of + // by_residual). 
write_index(ivf->quantizer, f); write_direct_map(&ivf->direct_map, f); } @@ -552,6 +609,137 @@ void write_index(const Index* idx, IOWriter* f) { write_LocalSearchQuantizer(&idxr->lsq, f); WRITE1(idxr->code_size); WRITEVECTOR(idxr->codes); + } else if ( + const IndexProductResidualQuantizer* idxpr = + dynamic_cast(idx)) { + uint32_t h = fourcc("IxPR"); + WRITE1(h); + write_index_header(idx, f); + write_ProductResidualQuantizer(&idxpr->prq, f); + WRITE1(idxpr->code_size); + WRITEVECTOR(idxpr->codes); + } else if ( + const IndexProductLocalSearchQuantizer* idxpl = + dynamic_cast( + idx)) { + uint32_t h = fourcc("IxPL"); + WRITE1(h); + write_index_header(idx, f); + write_ProductLocalSearchQuantizer(&idxpl->plsq, f); + WRITE1(idxpl->code_size); + WRITEVECTOR(idxpl->codes); + } else if ( + auto* idxaqfs = + dynamic_cast(idx)) { + auto idxlsqfs = + dynamic_cast(idx); + auto idxrqfs = dynamic_cast(idx); + auto idxplsqfs = + dynamic_cast( + idx); + auto idxprqfs = + dynamic_cast(idx); + FAISS_THROW_IF_NOT(idxlsqfs || idxrqfs || idxplsqfs || idxprqfs); + + if (idxlsqfs) { + uint32_t h = fourcc("ILfs"); + WRITE1(h); + } else if (idxrqfs) { + uint32_t h = fourcc("IRfs"); + WRITE1(h); + } else if (idxplsqfs) { + uint32_t h = fourcc("IPLf"); + WRITE1(h); + } else if (idxprqfs) { + uint32_t h = fourcc("IPRf"); + WRITE1(h); + } + + write_index_header(idxaqfs, f); + + if (idxlsqfs) { + write_LocalSearchQuantizer(&idxlsqfs->lsq, f); + } else if (idxrqfs) { + write_ResidualQuantizer(&idxrqfs->rq, f); + } else if (idxplsqfs) { + write_ProductLocalSearchQuantizer(&idxplsqfs->plsq, f); + } else if (idxprqfs) { + write_ProductResidualQuantizer(&idxprqfs->prq, f); + } + WRITE1(idxaqfs->implem); + WRITE1(idxaqfs->bbs); + WRITE1(idxaqfs->qbs); + + WRITE1(idxaqfs->M); + WRITE1(idxaqfs->nbits); + WRITE1(idxaqfs->ksub); + WRITE1(idxaqfs->code_size); + WRITE1(idxaqfs->ntotal2); + WRITE1(idxaqfs->M2); + + WRITE1(idxaqfs->rescale_norm); + WRITE1(idxaqfs->norm_scale); + WRITE1(idxaqfs->max_train_points); + + WRITEVECTOR(idxaqfs->codes); + } else if ( + auto* ivaqfs = + dynamic_cast( + idx)) { + auto ivlsqfs = + dynamic_cast(idx); + auto ivrqfs = + dynamic_cast(idx); + auto ivplsqfs = dynamic_cast< + const IndexIVFProductLocalSearchQuantizerFastScan*>(idx); + auto ivprqfs = + dynamic_cast( + idx); + FAISS_THROW_IF_NOT(ivlsqfs || ivrqfs || ivplsqfs || ivprqfs); + + if (ivlsqfs) { + uint32_t h = fourcc("IVLf"); + WRITE1(h); + } else if (ivrqfs) { + uint32_t h = fourcc("IVRf"); + WRITE1(h); + } else if (ivplsqfs) { + uint32_t h = fourcc("NPLf"); // N means IV ... 
+ WRITE1(h); + } else { + uint32_t h = fourcc("NPRf"); + WRITE1(h); + } + + write_ivf_header(ivaqfs, f); + + if (ivlsqfs) { + write_LocalSearchQuantizer(&ivlsqfs->lsq, f); + } else if (ivrqfs) { + write_ResidualQuantizer(&ivrqfs->rq, f); + } else if (ivplsqfs) { + write_ProductLocalSearchQuantizer(&ivplsqfs->plsq, f); + } else { + write_ProductResidualQuantizer(&ivprqfs->prq, f); + } + + WRITE1(ivaqfs->by_residual); + WRITE1(ivaqfs->implem); + WRITE1(ivaqfs->bbs); + WRITE1(ivaqfs->qbs); + + WRITE1(ivaqfs->M); + WRITE1(ivaqfs->nbits); + WRITE1(ivaqfs->ksub); + WRITE1(ivaqfs->code_size); + WRITE1(ivaqfs->qbs2); + WRITE1(ivaqfs->M2); + + WRITE1(ivaqfs->rescale_norm); + WRITE1(ivaqfs->norm_scale); + WRITE1(ivaqfs->max_train_points); + + write_InvertedLists(ivaqfs->invlists, f); } else if ( const ResidualCoarseQuantizer* idxr = dynamic_cast(idx)) { @@ -598,7 +786,7 @@ void write_index(const Index* idx, IOWriter* f) { WRITE1(h); write_ivf_header(ivfl, f); { - std::vector tab(2 * ivfl->instances.size()); + std::vector tab(2 * ivfl->instances.size()); long i = 0; for (auto it = ivfl->instances.begin(); it != ivfl->instances.end(); ++it) { @@ -631,14 +819,33 @@ void write_index(const Index* idx, IOWriter* f) { write_InvertedLists(ivsc->invlists, f); } else if (auto iva = dynamic_cast(idx)) { bool is_LSQ = dynamic_cast(iva); - uint32_t h = fourcc(is_LSQ ? "IwLS" : "IwRQ"); + bool is_RQ = dynamic_cast(iva); + bool is_PLSQ = + dynamic_cast(iva); + uint32_t h; + if (is_LSQ) { + h = fourcc("IwLS"); + } else if (is_RQ) { + h = fourcc("IwRQ"); + } else if (is_PLSQ) { + h = fourcc("IwPL"); + } else { + h = fourcc("IwPR"); + } + WRITE1(h); write_ivf_header(iva, f); WRITE1(iva->code_size); if (is_LSQ) { write_LocalSearchQuantizer((LocalSearchQuantizer*)iva->aq, f); - } else { + } else if (is_RQ) { write_ResidualQuantizer((ResidualQuantizer*)iva->aq, f); + } else if (is_PLSQ) { + write_ProductLocalSearchQuantizer( + (ProductLocalSearchQuantizer*)iva->aq, f); + } else { + write_ProductResidualQuantizer( + (ProductResidualQuantizer*)iva->aq, f); } WRITE1(iva->by_residual); WRITE1(iva->use_precomputed_table); @@ -670,7 +877,22 @@ void write_index(const Index* idx, IOWriter* f) { WRITEVECTOR(ivfpqr->refine_codes); WRITE1(ivfpqr->k_factor); } - + } else if ( + auto* indep = + dynamic_cast(idx)) { + uint32_t h = fourcc("IwIQ"); + WRITE1(h); + write_index_header(indep, f); + write_index(indep->quantizer, f); + bool has_vt = indep->vt != nullptr; + WRITE1(has_vt); + if (has_vt) { + write_VectorTransform(indep->vt, f); + } + write_index(indep->index_ivf, f); + if (auto index_ivfpq = dynamic_cast(indep->index_ivf)) { + WRITE1(index_ivfpq->use_precomputed_table); + } } else if ( const IndexPreTransform* ixpt = dynamic_cast(idx)) { @@ -729,8 +951,10 @@ void write_index(const Index* idx, IOWriter* f) { write_HNSW(&idxhnsw->hnsw, f); write_index(idxhnsw->storage, f); } else if (const IndexNSG* idxnsg = dynamic_cast(idx)) { - uint32_t h = - dynamic_cast(idx) ? fourcc("INSf") : 0; + uint32_t h = dynamic_cast(idx) ? fourcc("INSf") + : dynamic_cast(idx) ? fourcc("INSp") + : dynamic_cast(idx) ? 
fourcc("INSs") + : 0; FAISS_THROW_IF_NOT(h != 0); WRITE1(h); write_index_header(idxnsg, f); @@ -742,6 +966,17 @@ void write_index(const Index* idx, IOWriter* f) { WRITE1(idxnsg->nndescent_iter); write_NSG(&idxnsg->nsg, f); write_index(idxnsg->storage, f); + } else if ( + const IndexNNDescent* idxnnd = + dynamic_cast(idx)) { + auto idxnndflat = dynamic_cast(idx); + FAISS_THROW_IF_NOT(idxnndflat != nullptr); + uint32_t h = fourcc("INNf"); + FAISS_THROW_IF_NOT(h != 0); + WRITE1(h); + write_index_header(idxnnd, f); + write_NNDescent(&idxnnd->nndescent, f); + write_index(idxnnd->storage, f); } else if ( const IndexPQFastScan* idxpqfs = dynamic_cast(idx)) { @@ -767,12 +1002,28 @@ void write_index(const Index* idx, IOWriter* f) { WRITE1(ivpq->M2); WRITE1(ivpq->implem); WRITE1(ivpq->qbs2); - WRITE1(ivpq->is_cosine_); - if (ivpq->is_cosine_) { + WRITE1(ivpq->is_cosine); + if (ivpq->is_cosine) { WRITEVECTOR(ivpq->norms); } write_ProductQuantizer(&ivpq->pq, f); write_InvertedLists(ivpq->invlists, f); + } else if ( + const IndexRowwiseMinMax* imm = + dynamic_cast(idx)) { + // IndexRowwiseMinmaxFloat + uint32_t h = fourcc("IRMf"); + WRITE1(h); + write_index_header(imm, f); + write_index(imm->index, f); + } else if ( + const IndexRowwiseMinMaxFP16* imm = + dynamic_cast(idx)) { + // IndexRowwiseMinmaxHalf + uint32_t h = fourcc("IRMh"); + WRITE1(h); + write_index_header(imm, f); + write_index(imm->index, f); } else { FAISS_THROW_MSG("don't know how to serialize this type of index"); } @@ -868,7 +1119,7 @@ static void write_binary_multi_hash_map( size_t ntotal, IOWriter* f) { int id_bits = 0; - while ((ntotal > ((Index::idx_t)1 << id_bits))) { + while ((ntotal > ((idx_t)1 << id_bits))) { id_bits++; } WRITE1(id_bits); diff --git a/thirdparty/faiss/faiss/impl/kmeans1d.cpp b/thirdparty/faiss/faiss/impl/kmeans1d.cpp index 0a56c2292..faa4fad84 100644 --- a/thirdparty/faiss/faiss/impl/kmeans1d.cpp +++ b/thirdparty/faiss/faiss/impl/kmeans1d.cpp @@ -20,7 +20,6 @@ namespace faiss { -using idx_t = Index::idx_t; using LookUpFunc = std::function; void reduce( @@ -278,13 +277,15 @@ double kmeans1d(const float* x, size_t n, size_t nclusters, float* centroids) { ****************************************************/ // for imbalance factor - double tot = 0.0, uf = 0.0; + double tot = 0.0; + double uf = 0.0; idx_t end = n; for (idx_t k = nclusters - 1; k >= 0; k--) { - idx_t start = T.at(k, end - 1); - float sum = std::accumulate(&arr[start], &arr[end], 0.0f); - idx_t size = end - start; + const idx_t start = T.at(k, end - 1); + const float sum = + std::accumulate(arr.data() + start, arr.data() + end, 0.0f); + const idx_t size = end - start; FAISS_THROW_IF_NOT_FMT( size > 0, "Cluster %d: size %d", int(k), int(size)); centroids[k] = sum / size; diff --git a/thirdparty/faiss/faiss/impl/kmeans1d.h b/thirdparty/faiss/faiss/impl/kmeans1d.h index 0e8a1994f..f9a132668 100644 --- a/thirdparty/faiss/faiss/impl/kmeans1d.h +++ b/thirdparty/faiss/faiss/impl/kmeans1d.h @@ -22,10 +22,10 @@ namespace faiss { * @param argmins argmin of each row */ void smawk( - const Index::idx_t nrows, - const Index::idx_t ncols, + const idx_t nrows, + const idx_t ncols, const float* x, - Index::idx_t* argmins); + idx_t* argmins); /** Exact 1D K-Means by dynamic programming * diff --git a/thirdparty/faiss/faiss/impl/lattice_Zn.cpp b/thirdparty/faiss/faiss/impl/lattice_Zn.cpp index 7448a6936..5163b12f8 100644 --- a/thirdparty/faiss/faiss/impl/lattice_Zn.cpp +++ b/thirdparty/faiss/faiss/impl/lattice_Zn.cpp @@ -19,10 +19,11 @@ #include #include -#include 
#include #include +#include "simd/hook.h" + namespace faiss { /******************************************** @@ -456,7 +457,7 @@ void ZnSphereCodec::decode(uint64_t code, float* c) const { int nnz = 0; for (int i = 0; i < dim; i++) { if (c[i] != 0) { - if (signs & (1UL << nnz)) { + if (signs & (uint64_t(1) << nnz)) { c[i] = -c[i]; } nnz++; @@ -637,7 +638,7 @@ void ZnSphereCodecRec::decode(uint64_t code, float* c) const { } } -// if not use_rec, instanciate an arbitrary harmless znc_rec +// if not use_rec, instantiate an arbitrary harmless znc_rec ZnSphereCodecAlt::ZnSphereCodecAlt(int dim, int r2) : ZnSphereCodec(dim, r2), use_rec((dim & (dim - 1)) == 0), diff --git a/thirdparty/faiss/faiss/impl/platform_macros.h b/thirdparty/faiss/faiss/impl/platform_macros.h index 5c43ab62e..3315d0405 100644 --- a/thirdparty/faiss/faiss/impl/platform_macros.h +++ b/thirdparty/faiss/faiss/impl/platform_macros.h @@ -7,6 +7,10 @@ #pragma once +// basic int types and size_t +#include +#include + #ifdef _MSC_VER /******************************************************* @@ -19,6 +23,10 @@ #define FAISS_API __declspec(dllimport) #endif // FAISS_MAIN_LIB +#ifdef _MSC_VER +#define strtok_r strtok_s +#endif // _MSC_VER + #define __PRETTY_FUNCTION__ __FUNCSIG__ #define posix_memalign(p, a, s) \ @@ -74,14 +82,6 @@ inline int __builtin_clzll(uint64_t x) { #define __F16C__ 1 #endif -#elif __MINGW64__ -// MSYS -#define FAISS_API -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ?0 :errno) -#define posix_memalign_free _aligned_free -#define ALIGNED(x) __attribute__ ((aligned(x))) - - #else /******************************************************* * Linux and OSX @@ -92,6 +92,63 @@ inline int __builtin_clzll(uint64_t x) { // aligned should be *in front* of the declaration, for compatibility with // windows +#ifdef SWIG +#define ALIGNED(x) +#else #define ALIGNED(x) __attribute__((aligned(x))) +#endif -#endif // _MSC_VER +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define FAISS_DEPRECATED(msg) __attribute__((deprecated(msg))) +#else +#define FAISS_DEPRECATED(msg) +#endif // GCC or Clang + +// Localized enablement of imprecise floating point operations +// You need to use all 3 macros to cover all compilers. +#if defined(_MSC_VER) +#define FAISS_PRAGMA_IMPRECISE_LOOP +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + __pragma(float_control(precise, off, push)) +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END __pragma(float_control(pop)) +#elif defined(__clang__) +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize(enable) interleave(enable)") + +// clang-format off + +// the following ifdef is needed, because old versions of clang (prior to 14) +// do not generate FMAs on x86 unless this pragma is used. On the other hand, +// ARM does not support the following pragma flag. +// TODO: find out how to enable FMAs on clang 10 and earlier. +#if defined(__x86_64__) && (defined(__clang_major__) && (__clang_major__ > 10)) +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") +#else +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END +#endif +#elif defined(__GNUC__) +// Unfortunately, GCC does not provide a pragma for detecting it. +// So, we have to stick to GNUC, which is defined by MANY compilers. +// This is why clang/icc needs to be checked first. 
+ +// todo: add __INTEL_COMPILER check for the classic ICC +// todo: add __INTEL_LLVM_COMPILER for ICX + +#define FAISS_PRAGMA_IMPRECISE_LOOP +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("GCC push_options") \ + _Pragma("GCC optimize (\"unroll-loops,associative-math,no-signed-zeros\")") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END \ + _Pragma("GCC pop_options") +#else +#define FAISS_PRAGMA_IMPRECISE_LOOP +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END +#endif + +// clang-format on diff --git a/thirdparty/faiss/faiss/impl/pq4_fast_scan.cpp b/thirdparty/faiss/faiss/impl/pq4_fast_scan.cpp index 3244c7719..222b92aad 100644 --- a/thirdparty/faiss/faiss/impl/pq4_fast_scan.cpp +++ b/thirdparty/faiss/faiss/impl/pq4_fast_scan.cpp @@ -87,7 +87,7 @@ void pq4_pack_codes_range( size_t i0, size_t i1, size_t bbs, - size_t M2, + size_t nsq, uint8_t* blocks) { const uint8_t perm0[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; @@ -97,9 +97,9 @@ void pq4_pack_codes_range( size_t block1 = ((i1 - 1) / bbs) + 1; for (size_t b = block0; b < block1; b++) { - uint8_t* codes2 = blocks + b * bbs * M2 / 2; + uint8_t* codes2 = blocks + b * bbs * nsq / 2; int64_t i_base = b * bbs - i0; - for (int sq = 0; sq < M2; sq += 2) { + for (int sq = 0; sq < nsq; sq += 2) { for (size_t i = 0; i < bbs; i += 32) { std::array c, c0, c1; get_matrix_column( @@ -121,30 +121,114 @@ void pq4_pack_codes_range( } } +namespace { + +// get the specific address of the vector inside a block +// shift is used for determine the if the saved in bits 0..3 (false) or +// bits 4..7 (true) +size_t get_vector_specific_address( + size_t bbs, + size_t vector_id, + size_t sq, + bool& shift) { + // get the vector_id inside the block + vector_id = vector_id % bbs; + shift = vector_id > 15; + vector_id = vector_id & 15; + + // get the address of the vector in sq + size_t address; + if (vector_id < 8) { + address = vector_id << 1; + } else { + address = ((vector_id - 8) << 1) + 1; + } + if (sq & 1) { + address += 16; + } + return (sq >> 1) * bbs + address; +} + +} // anonymous namespace + uint8_t pq4_get_packed_element( const uint8_t* data, size_t bbs, size_t nsq, - size_t i, + size_t vector_id, size_t sq) { // move to correct bbs-sized block - data += (i / bbs * (nsq / 2) + sq / 2) * bbs; - sq = sq & 1; - i = i % bbs; + // number of blocks * block size + data += (vector_id / bbs) * (((nsq + 1) / 2) * bbs); + bool shift; + size_t address = get_vector_specific_address(bbs, vector_id, sq, shift); + if (shift) { + return data[address] >> 4; + } else { + return data[address] & 15; + } +} - // another step - data += (i / 32) * 32; - i = i % 32; +void pq4_set_packed_element( + uint8_t* data, + uint8_t code, + size_t bbs, + size_t nsq, + size_t vector_id, + size_t sq) { + // move to correct bbs-sized block + // number of blocks * block size + data += (vector_id / bbs) * (((nsq + 1) / 2) * bbs); + bool shift; + size_t address = get_vector_specific_address(bbs, vector_id, sq, shift); + if (shift) { + data[address] = (code << 4) | (data[address] & 15); + } else { + data[address] = code | (data[address] & ~15); + } +} + +/*************************************************************** + * CodePackerPQ4 implementation + ***************************************************************/ + +CodePackerPQ4::CodePackerPQ4(size_t nsq, size_t bbs) { + this->nsq = nsq; + nvec = bbs; + code_size = (nsq * 4 + 7) / 8; + block_size = ((nsq + 1) / 2) * bbs; +} - if (sq == 1) { - data += 16; +void 
CodePackerPQ4::pack_1( + const uint8_t* flat_code, + size_t offset, + uint8_t* block) const { + size_t bbs = nvec; + if (offset >= nvec) { + block += (offset / nvec) * block_size; + offset = offset % nvec; } - const uint8_t iperm0[16] = { - 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - if (i < 16) { - return data[iperm0[i]] & 15; - } else { - return data[iperm0[i - 16]] >> 4; + for (size_t i = 0; i < code_size; i++) { + uint8_t code = flat_code[i]; + pq4_set_packed_element(block, code & 15, bbs, nsq, offset, 2 * i); + pq4_set_packed_element(block, code >> 4, bbs, nsq, offset, 2 * i + 1); + } +} + +void CodePackerPQ4::unpack_1( + const uint8_t* block, + size_t offset, + uint8_t* flat_code) const { + size_t bbs = nvec; + if (offset >= nvec) { + block += (offset / nvec) * block_size; + offset = offset % nvec; + } + for (size_t i = 0; i < code_size; i++) { + uint8_t code0, code1; + code0 = pq4_get_packed_element(block, bbs, nsq, offset, 2 * i); + code1 = pq4_get_packed_element(block, bbs, nsq, offset, 2 * i + 1); + flat_code[i] = code0 | (code1 << 4); } } diff --git a/thirdparty/faiss/faiss/impl/pq4_fast_scan.h b/thirdparty/faiss/faiss/impl/pq4_fast_scan.h index e99507a59..2e6931f8d 100644 --- a/thirdparty/faiss/faiss/impl/pq4_fast_scan.h +++ b/thirdparty/faiss/faiss/impl/pq4_fast_scan.h @@ -9,8 +9,9 @@ #include #include -#include -using knowhere::BitsetView; + +#include + /** PQ4 SIMD packing and accumulation functions * * The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors @@ -18,7 +19,7 @@ using knowhere::BitsetView; * otherwise register spilling becomes too large. * * The implementation of these functions is spread over 3 cpp files to reduce - * parallel compile times. Templates are instanciated explicitly. + * parallel compile times. Templates are instantiated explicitly. */ namespace faiss { @@ -27,10 +28,10 @@ namespace faiss { * The unused bytes are set to 0. * * @param codes input codes, size (ntotal, ceil(M / 2)) - * @param nototal number of input codes + * @param ntotal number of input codes * @param nb output number of codes (ntotal rounded up to a multiple of * bbs) - * @param M2 number of sub-quantizers (=M rounded up to a muliple of 2) + * @param nsq number of sub-quantizers (=M rounded up to a muliple of 2) * @param bbs size of database blocks (multiple of 32) * @param blocks output array, size nb * nsq / 2. 
*/ @@ -40,7 +41,7 @@ void pq4_pack_codes( size_t M, size_t nb, size_t bbs, - size_t M2, + size_t nsq, uint8_t* blocks); /** Same as pack_codes but write in a given range of the output, @@ -57,21 +58,46 @@ void pq4_pack_codes_range( size_t i0, size_t i1, size_t bbs, - size_t M2, + size_t nsq, uint8_t* blocks); /** get a single element from a packed codes table * - * @param i vector id + * @param vector_id vector id * @param sq subquantizer (< nsq) */ uint8_t pq4_get_packed_element( const uint8_t* data, size_t bbs, size_t nsq, - size_t i, + size_t vector_id, + size_t sq); + +/** set a single element "code" into a packed codes table + * + * @param vector_id vector id + * @param sq subquantizer (< nsq) + */ +void pq4_set_packed_element( + uint8_t* data, + uint8_t code, + size_t bbs, + size_t nsq, + size_t vector_id, size_t sq); +/** CodePacker API for the PQ4 fast-scan */ +struct CodePackerPQ4 : CodePacker { + size_t nsq; + + CodePackerPQ4(size_t nsq, size_t bbs); + + void pack_1(const uint8_t* flat_code, size_t offset, uint8_t* block) + const final; + void unpack_1(const uint8_t* block, size_t offset, uint8_t* flat_code) + const final; +}; + /** Pack Look-up table for consumption by the kernel. * * @param nq number of queries @@ -89,8 +115,9 @@ void pq4_pack_LUT(int nq, int nsq, const uint8_t* src, uint8_t* dest); * @param nsq number of sub-quantizers (muliple of 2) * @param codes packed codes array * @param LUT packed look-up table + * @param scaler scaler to scale the encoded norm */ -template +template void pq4_accumulate_loop( int nq, size_t nb, @@ -98,7 +125,8 @@ void pq4_accumulate_loop( int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res); + ResultHandler& res, + const Scaler& scaler); /* qbs versions, supported only for bbs=32. * @@ -142,20 +170,22 @@ int pq4_pack_LUT_qbs_q_map( /** Run accumulation loop. 
* - * @param qbs 4-bit encded number of queries + * @param qbs 4-bit encoded number of queries * @param nb number of database codes (mutliple of bbs) * @param nsq number of sub-quantizers * @param codes encoded database vectors (packed) * @param LUT look-up table (packed) * @param res call-back for the resutls + * @param scaler scaler to scale the encoded norm */ -template +template void pq4_accumulate_loop_qbs( int qbs, size_t nb, int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res); + ResultHandler& res, + const Scaler& scaler); } // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_1.cpp b/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_1.cpp index b835bb648..672c7d6db 100644 --- a/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_1.cpp +++ b/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_1.cpp @@ -8,6 +8,7 @@ #include #include +#include #include namespace faiss { @@ -26,12 +27,13 @@ namespace { * writes results in a ResultHandler */ -template +template void kernel_accumulate_block( int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { // distance accumulators simd16uint16 accu[NQ][BB][4]; @@ -44,7 +46,7 @@ void kernel_accumulate_block( } } - for (int sq = 0; sq < nsq; sq += 2) { + for (int sq = 0; sq < nsq - scaler.nscale; sq += 2) { simd32uint8 lut_cache[NQ]; for (int q = 0; q < NQ; q++) { lut_cache[q] = simd32uint8(LUT); @@ -72,6 +74,35 @@ void kernel_accumulate_block( } } + for (int sq = 0; sq < scaler.nscale; sq += 2) { + simd32uint8 lut_cache[NQ]; + for (int q = 0; q < NQ; q++) { + lut_cache[q] = simd32uint8(LUT); + LUT += 32; + } + + for (int b = 0; b < BB; b++) { + simd32uint8 c = simd32uint8(codes); + codes += 32; + simd32uint8 mask(15); + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + simd32uint8 lut = lut_cache[q]; + + simd32uint8 res0 = scaler.lookup(lut, clo); + accu[q][b][0] += scaler.scale_lo(res0); // handle vectors 0..7 + accu[q][b][1] += scaler.scale_hi(res0); // handle vectors 8..15 + + simd32uint8 res1 = scaler.lookup(lut, chi); + accu[q][b][2] += scaler.scale_lo(res1); // handle vectors 16..23 + accu[q][b][3] += + scaler.scale_hi(res1); // handle vectors 24..31 + } + } + } + for (int q = 0; q < NQ; q++) { for (int b = 0; b < BB; b++) { accu[q][b][0] -= accu[q][b][1] << 8; @@ -85,17 +116,18 @@ void kernel_accumulate_block( } } -template +template void accumulate_fixed_blocks( size_t nb, int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { constexpr int bbs = 32 * BB; for (int64_t j0 = 0; j0 < nb; j0 += bbs) { FixedStorageHandler res2; - kernel_accumulate_block(nsq, codes, LUT, res2); + kernel_accumulate_block(nsq, codes, LUT, res2, scaler); res.set_block_origin(0, j0); res2.to_other_handler(res); codes += bbs * nsq / 2; @@ -104,7 +136,7 @@ void accumulate_fixed_blocks( } // anonymous namespace -template +template void pq4_accumulate_loop( int nq, size_t nb, @@ -112,15 +144,16 @@ void pq4_accumulate_loop( int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { FAISS_THROW_IF_NOT(is_aligned_pointer(codes)); FAISS_THROW_IF_NOT(is_aligned_pointer(LUT)); FAISS_THROW_IF_NOT(bbs % 32 == 0); FAISS_THROW_IF_NOT(nb % bbs == 0); -#define DISPATCH(NQ, BB) \ - case NQ * 1000 + BB: \ - accumulate_fixed_blocks(nb, nsq, codes, LUT, res); \ +#define 
DISPATCH(NQ, BB) \ + case NQ * 1000 + BB: \ + accumulate_fixed_blocks(nb, nsq, codes, LUT, res, scaler); \ break switch (nq * 1000 + bbs / 32) { @@ -141,20 +174,28 @@ void pq4_accumulate_loop( // explicit template instantiations -#define INSTANTIATE_ACCUMULATE(TH, C, with_id_map) \ - template void pq4_accumulate_loop>( \ - int, \ - size_t, \ - int, \ - int, \ - const uint8_t*, \ - const uint8_t*, \ - TH&); - -#define INSTANTIATE_3(C, with_id_map) \ - INSTANTIATE_ACCUMULATE(SingleResultHandler, C, with_id_map) \ - INSTANTIATE_ACCUMULATE(HeapHandler, C, with_id_map) \ - INSTANTIATE_ACCUMULATE(ReservoirHandler, C, with_id_map) +#define INSTANTIATE_ACCUMULATE(TH, C, with_id_map, S) \ + template void pq4_accumulate_loop, S>( \ + int, \ + size_t, \ + int, \ + int, \ + const uint8_t*, \ + const uint8_t*, \ + TH&, \ + const S&); + +using DS = DummyScaler; +using NS = NormTableScaler; + +#define INSTANTIATE_3(C, with_id_map) \ + INSTANTIATE_ACCUMULATE(SingleResultHandler, C, with_id_map, DS) \ + INSTANTIATE_ACCUMULATE(HeapHandler, C, with_id_map, DS) \ + INSTANTIATE_ACCUMULATE(ReservoirHandler, C, with_id_map, DS) \ + \ + INSTANTIATE_ACCUMULATE(SingleResultHandler, C, with_id_map, NS) \ + INSTANTIATE_ACCUMULATE(HeapHandler, C, with_id_map, NS) \ + INSTANTIATE_ACCUMULATE(ReservoirHandler, C, with_id_map, NS) using Csi = CMax; INSTANTIATE_3(Csi, false); diff --git a/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp b/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp index 01df039a4..8121c7b90 100644 --- a/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp +++ b/thirdparty/faiss/faiss/impl/pq4_fast_scan_search_qbs.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -27,15 +28,17 @@ namespace { * writes results in a ResultHandler */ -template +template void kernel_accumulate_block( int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { // dummy alloc to keep the windows compiler happy constexpr int NQA = NQ > 0 ? NQ : 1; // distance accumulators + // layout: accu[q][b]: distance accumulator for vectors 8*b..8*b+7 simd16uint16 accu[NQA][4]; for (int q = 0; q < NQ; q++) { @@ -45,7 +48,7 @@ void kernel_accumulate_block( } // _mm_prefetch(codes + 768, 0); - for (int sq = 0; sq < nsq; sq += 2) { + for (int sq = 0; sq < nsq - scaler.nscale; sq += 2) { // prefetch simd32uint8 c(codes); codes += 32; @@ -71,6 +74,31 @@ void kernel_accumulate_block( } } + for (int sq = 0; sq < scaler.nscale; sq += 2) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... 
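+ // so the 32 packed bytes are reinterpreted as 16-bit lanes, shifted right
+ // by 4 and masked with 0xf: chi ends up with the upper 4-bit code of every
+ // byte and clo with the lower one.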
+ simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..7 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 8..15 + + simd32uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 16..23 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 24..31 + } + } + for (int q = 0; q < NQ; q++) { accu[q][0] -= accu[q][1] << 8; simd16uint16 dis0 = combine2x2(accu[q][0], accu[q][1]); @@ -81,86 +109,87 @@ void kernel_accumulate_block( } // handle at most 4 blocks of queries -template +template void accumulate_q_4step( size_t ntotal2, int nsq, const uint8_t* codes, const uint8_t* LUT0, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { constexpr int Q1 = QBS & 15; constexpr int Q2 = (QBS >> 4) & 15; constexpr int Q3 = (QBS >> 8) & 15; constexpr int Q4 = (QBS >> 12) & 15; constexpr int SQ = Q1 + Q2 + Q3 + Q4; - for (int64_t j0 = 0; j0 < ntotal2; j0 += 32) { + for (int64_t j0 = 0; j0 < ntotal2; j0 += 32, codes += 32 * nsq / 2) { res.set_block_origin(0, j0); // skip computing distances if all vectors inside a block are filtered out - bool skip_flag = false; - if (!res.bitset.empty()) { // we have filter here - skip_flag = true; + if (res.sel != nullptr) { // we have filter here + bool skip_flag = true; for (int64_t jj = 0; jj < std::min(32, ntotal2 - j0); jj++) { auto real_idx = res.adjust_id(0, jj); - if (!res.bitset.test(real_idx)) { // id is not filtered out, can not skip computing + if (res.sel->is_member(real_idx)) { // id is not filtered out, can not skip computing skip_flag = false; break; } } - } - if (skip_flag) { - codes += 32 * nsq / 2; - continue; + + if (skip_flag) { + continue; + } } - FixedStorageHandler res2; + FixedStorageHandler res2; const uint8_t* LUT = LUT0; - kernel_accumulate_block(nsq, codes, LUT, res2); + kernel_accumulate_block(nsq, codes, LUT, res2, scaler); LUT += Q1 * nsq * 16; if (Q2 > 0) { res2.set_block_origin(Q1, 0); - kernel_accumulate_block(nsq, codes, LUT, res2); + kernel_accumulate_block(nsq, codes, LUT, res2, scaler); LUT += Q2 * nsq * 16; } if (Q3 > 0) { res2.set_block_origin(Q1 + Q2, 0); - kernel_accumulate_block(nsq, codes, LUT, res2); + kernel_accumulate_block(nsq, codes, LUT, res2, scaler); LUT += Q3 * nsq * 16; } if (Q4 > 0) { res2.set_block_origin(Q1 + Q2 + Q3, 0); - kernel_accumulate_block(nsq, codes, LUT, res2); + kernel_accumulate_block(nsq, codes, LUT, res2, scaler); } res2.to_other_handler(res); - codes += 32 * nsq / 2; } } -template +template void kernel_accumulate_block_loop( size_t ntotal2, int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { for (int64_t j0 = 0; j0 < ntotal2; j0 += 32) { res.set_block_origin(0, j0); kernel_accumulate_block( - nsq, codes + j0 * nsq / 2, LUT, res); + nsq, codes + j0 * nsq / 2, LUT, res, scaler); } } // non-template version of accumulate kernel -- dispatches dynamically -template +template void accumulate( int nq, size_t ntotal2, int nsq, const uint8_t* codes, const uint8_t* LUT, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { assert(nsq % 2 == 0); assert(is_aligned_pointer(codes)); assert(is_aligned_pointer(LUT)); @@ -168,7 +197,7 @@ void accumulate( #define DISPATCH(NQ) \ case NQ: \ kernel_accumulate_block_loop( \ - 
ntotal2, nsq, codes, LUT, res); \ + ntotal2, nsq, codes, LUT, res, scaler); \ return switch (nq) { @@ -177,30 +206,31 @@ void accumulate( DISPATCH(3); DISPATCH(4); } - FAISS_THROW_FMT("accumulate nq=%d not instanciated", nq); + FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq); #undef DISPATCH } } // namespace -template +template void pq4_accumulate_loop_qbs( int qbs, size_t ntotal2, int nsq, const uint8_t* codes, const uint8_t* LUT0, - ResultHandler& res) { + ResultHandler& res, + const Scaler& scaler) { assert(nsq % 2 == 0); assert(is_aligned_pointer(codes)); assert(is_aligned_pointer(LUT0)); // try out optimized versions switch (qbs) { -#define DISPATCH(QBS) \ - case QBS: \ - accumulate_q_4step(ntotal2, nsq, codes, LUT0, res); \ +#define DISPATCH(QBS) \ + case QBS: \ + accumulate_q_4step(ntotal2, nsq, codes, LUT0, res, scaler); \ return; DISPATCH(0x3333); // 12 DISPATCH(0x2333); // 11 @@ -238,9 +268,10 @@ void pq4_accumulate_loop_qbs( int nq = qi & 15; qi >>= 4; res.set_block_origin(i0, j0); -#define DISPATCH(NQ) \ - case NQ: \ - kernel_accumulate_block(nsq, codes, LUT, res); \ +#define DISPATCH(NQ) \ + case NQ: \ + kernel_accumulate_block( \ + nsq, codes, LUT, res, scaler); \ break switch (nq) { DISPATCH(1); @@ -249,7 +280,7 @@ void pq4_accumulate_loop_qbs( DISPATCH(4); #undef DISPATCH default: - FAISS_THROW_FMT("accumulate nq=%d not instanciated", nq); + FAISS_THROW_FMT("accumulate nq=%d not instantiated", nq); } i0 += nq; LUT += nq * nsq * 16; @@ -260,9 +291,23 @@ void pq4_accumulate_loop_qbs( // explicit template instantiations -#define INSTANTIATE_ACCUMULATE_Q(RH) \ - template void pq4_accumulate_loop_qbs( \ - int, size_t, int, const uint8_t*, const uint8_t*, RH&); +#define INSTANTIATE_ACCUMULATE_Q(RH) \ + template void pq4_accumulate_loop_qbs( \ + int, \ + size_t, \ + int, \ + const uint8_t*, \ + const uint8_t*, \ + RH&, \ + const DummyScaler&); \ + template void pq4_accumulate_loop_qbs( \ + int, \ + size_t, \ + int, \ + const uint8_t*, \ + const uint8_t*, \ + RH&, \ + const NormTableScaler&); using Csi = CMax; INSTANTIATE_ACCUMULATE_Q(SingleResultHandler) @@ -284,24 +329,24 @@ using Cfl = CMax; using HHCsl = HeapHandler; using RHCsl = ReservoirHandler; using SHCsl = SingleResultHandler; - -using CRSfl = CMax; -using RSHCsl = RangeSearchResultHandler; INSTANTIATE_ACCUMULATE_Q(HHCsl) INSTANTIATE_ACCUMULATE_Q(RHCsl) INSTANTIATE_ACCUMULATE_Q(SHCsl) + +using CRSfl = CMax; +using RSHCsl = RangeSearchResultHandler; INSTANTIATE_ACCUMULATE_Q(RSHCsl) using Cfl2 = CMin; using HHCsl2 = HeapHandler; using RHCsl2 = ReservoirHandler; using SHCsl2 = SingleResultHandler; - -using CRSfl2 = CMin; -using RSHCsl2 = RangeSearchResultHandler; INSTANTIATE_ACCUMULATE_Q(HHCsl2) INSTANTIATE_ACCUMULATE_Q(RHCsl2) INSTANTIATE_ACCUMULATE_Q(SHCsl2) + +using CRSfl2 = CMin; +using RSHCsl2 = RangeSearchResultHandler; INSTANTIATE_ACCUMULATE_Q(RSHCsl2) /*************************************************************** @@ -328,7 +373,8 @@ void accumulate_to_mem( uint16_t* accu) { FAISS_THROW_IF_NOT(ntotal2 % 32 == 0); StoreResultHandler handler(accu, ntotal2); - accumulate(nq, ntotal2, nsq, codes, LUT, handler); + DummyScaler scaler; + accumulate(nq, ntotal2, nsq, codes, LUT, handler, scaler); } int pq4_preferred_qbs(int n) { diff --git a/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp b/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp new file mode 100644 index 000000000..94567eea7 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.cpp @@ -0,0 +1,962 
@@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "simd/hook.h" + +extern "C" { + +// general matrix multiplication +int sgemm_( + const char* transa, + const char* transb, + FINTEGER* m, + FINTEGER* n, + FINTEGER* k, + const float* alpha, + const float* a, + FINTEGER* lda, + const float* b, + FINTEGER* ldb, + float* beta, + float* c, + FINTEGER* ldc); +} + +namespace faiss { + +/******************************************************************** + * Basic routines + ********************************************************************/ + +namespace { + +template +void accum_and_store_tab( + const size_t m_offset, + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * m_offset + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. +#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + // write the result + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].storeu(output + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + output[b * K + kk] = reg; + } +} + +template +void accum_and_add_tab( + const size_t m_offset, + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * m_offset + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. 
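// -------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] Scalar reference for what the
// accum_and_store_tab / accum_and_add_tab helpers compute: for one beam
// entry, sum the M rows of codebook_cross_norms selected by that entry's
// codes into a K-element buffer. The SIMD blocks in these helpers unroll
// exactly this loop in chunks of 8 * NK floats. All names below are
// illustrative only.
void accum_tab_scalar_reference(
        size_t M,                          // number of codebooks to sum
        const float* codebook_cross_norms, // rows of length ldc
        const uint64_t* codebook_offsets,  // per-codebook row offset
        const int32_t* codes_i,            // codes of this beam entry, size M
        size_t ldc,                        // row stride, >= K
        size_t K,                          // codebook size
        float* out) {                      // K accumulators
    for (size_t k = 0; k < K; k++) {
        float acc = 0.0f;
        for (size_t m = 0; m < M; m++) {
            const float* row = codebook_cross_norms +
                    (codebook_offsets[m] + size_t(codes_i[m])) * ldc;
            acc += row[k];
        }
        out[k] += acc; // the "store" variant assigns, the "add" variant adds
    }
}
// -------------------------------------------------------------------------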
+#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + // write the result + for (size_t ik = 0; ik < NK; ik++) { + simd8float32 existing(output + kk + ik * 8); + existing += regs[ik]; + existing.storeu(output + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + output[b * K + kk] += reg; + } +} + +template +void accum_and_finalize_tab( + const float* const __restrict codebook_cross_norms, + const uint64_t* const __restrict codebook_offsets, + const int32_t* const __restrict codes_i, + const size_t b, + const size_t ldc, + const size_t K, + const float* const __restrict distances_i, + const float* const __restrict cd_common, + float* const __restrict output) { + // load pointers into registers + const float* cbs[M]; + for (size_t ij = 0; ij < M; ij++) { + const size_t code = static_cast(codes_i[b * M + ij]); + cbs[ij] = &codebook_cross_norms[(codebook_offsets[ij] + code) * ldc]; + } + + // do accumulation in registers using SIMD. + // It is possible that compiler may be smart enough so that + // this manual SIMD unrolling might be unneeded. +#if defined(__AVX2__) || defined(__aarch64__) + const size_t K8 = (K / (8 * NK)) * (8 * NK); + + // process in chunks of size (8 * NK) floats + for (size_t kk = 0; kk < K8; kk += 8 * NK) { + simd8float32 regs[NK]; + for (size_t ik = 0; ik < NK; ik++) { + regs[ik].loadu(cbs[0] + kk + ik * 8); + } + + for (size_t ij = 1; ij < M; ij++) { + for (size_t ik = 0; ik < NK; ik++) { + regs[ik] += simd8float32(cbs[ij] + kk + ik * 8); + } + } + + simd8float32 two(2.0f); + for (size_t ik = 0; ik < NK; ik++) { + // cent_distances[b * K + k] = distances_i[b] + cd_common[k] + // + 2 * dp[k]; + + simd8float32 common_v(cd_common + kk + ik * 8); + common_v = fmadd(two, regs[ik], common_v); + + common_v += simd8float32(distances_i[b]); + common_v.storeu(output + b * K + kk + ik * 8); + } + } +#else + const size_t K8 = 0; +#endif + + // process leftovers + for (size_t kk = K8; kk < K; kk++) { + float reg = cbs[0][kk]; + for (size_t ij = 1; ij < M; ij++) { + reg += cbs[ij][kk]; + } + + output[b * K + kk] = distances_i[b] + cd_common[kk] + 2 * reg; + } +} + +} // anonymous namespace + +/******************************************************************** + * Single encoding step + ********************************************************************/ + +void beam_search_encode_step( + size_t d, + size_t K, + const float* cent, /// size (K, d) + size_t n, + size_t beam_size, + const float* residuals, /// size (n, beam_size, d) + size_t m, + const int32_t* codes, /// size (n, beam_size, m) + size_t new_beam_size, + int32_t* new_codes, /// size (n, new_beam_size, m + 1) + float* new_residuals, /// size (n, new_beam_size, d) + float* new_distances, /// size (n, new_beam_size) + Index* assign_index, + ApproxTopK_mode_t approx_topk_mode) { + // we have to fill in the whole output matrix + FAISS_THROW_IF_NOT(new_beam_size <= beam_size * K); + + std::vector cent_distances; + std::vector cent_ids; + + if (assign_index) { + // search beam_size 
distances per query + FAISS_THROW_IF_NOT(assign_index->d == d); + cent_distances.resize(n * beam_size * new_beam_size); + cent_ids.resize(n * beam_size * new_beam_size); + if (assign_index->ntotal != 0) { + // then we assume the codebooks are already added to the index + FAISS_THROW_IF_NOT(assign_index->ntotal == K); + } else { + assign_index->add(K, cent); + } + + // printf("beam_search_encode_step -- mem usage %zd\n", + // get_mem_usage_kb()); + assign_index->search( + n * beam_size, + residuals, + new_beam_size, + cent_distances.data(), + cent_ids.data()); + } else { + // do one big distance computation + cent_distances.resize(n * beam_size * K); + pairwise_L2sqr( + d, n * beam_size, residuals, K, cent, cent_distances.data()); + } + InterruptCallback::check(); + +#pragma omp parallel for if (n > 100) + for (int64_t i = 0; i < n; i++) { + const int32_t* codes_i = codes + i * m * beam_size; + int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; + const float* residuals_i = residuals + i * d * beam_size; + float* new_residuals_i = new_residuals + i * d * new_beam_size; + + float* new_distances_i = new_distances + i * new_beam_size; + using C = CMax; + + if (assign_index) { + const float* cent_distances_i = + cent_distances.data() + i * beam_size * new_beam_size; + const idx_t* cent_ids_i = + cent_ids.data() + i * beam_size * new_beam_size; + + // here we could be a tad more efficient by merging sorted arrays + for (int i = 0; i < new_beam_size; i++) { + new_distances_i[i] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * new_beam_size); + heap_reorder(new_beam_size, new_distances_i, perm.data()); + + for (int j = 0; j < new_beam_size; j++) { + int js = perm[j] / new_beam_size; + int ls = cent_ids_i[perm[j]]; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + fvec_sub( + d, + residuals_i + js * d, + cent + ls * d, + new_residuals_i); + new_residuals_i += d; + } + + } else { + const float* cent_distances_i = + cent_distances.data() + i * beam_size * K; + // then we have to select the best results + for (int i = 0; i < new_beam_size; i++) { + new_distances_i[i] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + HeapWithBuckets::bs_addn( \ + beam_size, \ + K, \ + cent_distances_i, \ + new_beam_size, \ + new_distances_i, \ + perm.data()); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * K); + } + heap_reorder(new_beam_size, new_distances_i, perm.data()); + +#undef HANDLE_APPROX + + for (int j = 0; j < new_beam_size; j++) { + int js = perm[j] / K; + int ls = perm[j] % K; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + fvec_sub( + d, + residuals_i + js * d, + cent + ls * d, + new_residuals_i); + new_residuals_i += d; + } + } + } +} + +// exposed in the faiss namespace +void beam_search_encode_step_tab( + size_t K, + size_t n, + size_t beam_size, // input sizes + const float* codebook_cross_norms, // size K * ldc + size_t ldc, + const uint64_t* codebook_offsets, // m + const float* query_cp, // size n * ldqc 
+ size_t ldqc, // >= K + const float* cent_norms_i, // size K + size_t m, + const int32_t* codes, // n * beam_size * m + const float* distances, // n * beam_size + size_t new_beam_size, + int32_t* new_codes, // n * new_beam_size * (m + 1) + float* new_distances, // n * new_beam_size + ApproxTopK_mode_t approx_topk_mode) // +{ + FAISS_THROW_IF_NOT(ldc >= K); + +#pragma omp parallel for if (n > 100) schedule(dynamic) + for (int64_t i = 0; i < n; i++) { + std::vector cent_distances(beam_size * K); + std::vector cd_common(K); + + const int32_t* codes_i = codes + i * m * beam_size; + const float* query_cp_i = query_cp + i * ldqc; + const float* distances_i = distances + i * beam_size; + + for (size_t k = 0; k < K; k++) { + cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k]; + } + + bool use_baseline_implementation = false; + + // This is the baseline implementation. Its primary flaw is + // that it writes far too much data to the temporary buffer + // called dp. + // + // This baseline code is kept intentionally because it makes it easy to + // see exactly what the optimized version improves. + // + if (use_baseline_implementation) { + for (size_t b = 0; b < beam_size; b++) { + std::vector dp(K); + + for (size_t m1 = 0; m1 < m; m1++) { + size_t c = codes_i[b * m + m1]; + const float* cb = + &codebook_cross_norms + [(codebook_offsets[m1] + c) * ldc]; + fvec_add(K, cb, dp.data(), dp.data()); + } + + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k] + 2 * dp[k]; + } + } + + } else { + // An optimized implementation that avoids using a temporary buffer + // and does the accumulation in registers. + + // Compute a sum of NK AQ codes. +#define ACCUM_AND_FINALIZE_TAB(NK) \ + case NK: \ + for (size_t b = 0; b < beam_size; b++) { \ + accum_and_finalize_tab( \ + codebook_cross_norms, \ + codebook_offsets, \ + codes_i, \ + b, \ + ldc, \ + K, \ + distances_i, \ + cd_common.data(), \ + cent_distances.data()); \ + } \ + break; + + // this version contains many switch-case scenarios, but + // they won't hurt the branch predictor. + switch (m) { + case 0: + // trivial case + for (size_t b = 0; b < beam_size; b++) { + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k]; + } + } + break; + + ACCUM_AND_FINALIZE_TAB(1) + ACCUM_AND_FINALIZE_TAB(2) + ACCUM_AND_FINALIZE_TAB(3) + ACCUM_AND_FINALIZE_TAB(4) + ACCUM_AND_FINALIZE_TAB(5) + ACCUM_AND_FINALIZE_TAB(6) + ACCUM_AND_FINALIZE_TAB(7) + + default: { + // m >= 8 case. + + // A temporary buffer has to be used due to the lack of + // registers. But we'll try to accumulate up to 8 AQ codes + // in registers and issue a single write operation to the + // buffer, while the baseline does no accumulation. So, the + // number of write operations to the temporary buffer is + // reduced 8x. + + // allocate a temporary buffer + std::vector dp(K); + + for (size_t b = 0; b < beam_size; b++) { + // Initialize it. Compute a sum of the first 8 AQ codes, + // because m >= 8.
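// [Editorial note, not part of the patch] What the chunked accumulation
// below assembles, spelled out once: with q the query, y_b the partial
// reconstruction of beam entry b, and c_k the k-th candidate codeword,
//
//   || q - (y_b + c_k) ||^2
//       = || q - y_b ||^2 + ( || c_k ||^2 - 2 * <q, c_k> ) + 2 * <y_b, c_k>
//       = distances_i[b]  + cd_common[k]                   + 2 * dp[k]
//
// where cd_common[k] = cent_norms_i[k] - 2 * query_cp_i[k] was filled above,
// and dp[k] = <y_b, c_k> is the sum of the selected codebook_cross_norms rows
// over the m previous codebooks. For example, m = 13 is accumulated as one
// chunk of 8 codebooks (accum_and_store_tab) followed by one chunk of 5
// (accum_and_add_tab).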
+ accum_and_store_tab<8, 4>( + m, + codebook_cross_norms, + codebook_offsets, + codes_i, + b, + ldc, + K, + dp.data()); + +#define ACCUM_AND_ADD_TAB(NK) \ + case NK: \ + accum_and_add_tab( \ + m, \ + codebook_cross_norms, \ + codebook_offsets + im, \ + codes_i + im, \ + b, \ + ldc, \ + K, \ + dp.data()); \ + break; + + // accumulate up to 8 additional AQ codes into + // a temporary buffer + for (size_t im = 8; im < ((m + 7) / 8) * 8; im += 8) { + size_t m_left = m - im; + if (m_left > 8) { + m_left = 8; + } + + switch (m_left) { + ACCUM_AND_ADD_TAB(1) + ACCUM_AND_ADD_TAB(2) + ACCUM_AND_ADD_TAB(3) + ACCUM_AND_ADD_TAB(4) + ACCUM_AND_ADD_TAB(5) + ACCUM_AND_ADD_TAB(6) + ACCUM_AND_ADD_TAB(7) + ACCUM_AND_ADD_TAB(8) + } + } + + // done. finalize the result + for (size_t k = 0; k < K; k++) { + cent_distances[b * K + k] = + distances_i[b] + cd_common[k] + 2 * dp[k]; + } + } + } + } + + // the optimized implementation ends here + } + using C = CMax; + int32_t* new_codes_i = new_codes + i * (m + 1) * new_beam_size; + float* new_distances_i = new_distances + i * new_beam_size; + + const float* cent_distances_i = cent_distances.data(); + + // then we have to select the best results + for (int i = 0; i < new_beam_size; i++) { + new_distances_i[i] = C::neutral(); + } + std::vector perm(new_beam_size, -1); + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + HeapWithBuckets::bs_addn( \ + beam_size, \ + K, \ + cent_distances_i, \ + new_beam_size, \ + new_distances_i, \ + perm.data()); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: + heap_addn( + new_beam_size, + new_distances_i, + perm.data(), + cent_distances_i, + nullptr, + beam_size * K); + break; + } + + heap_reorder(new_beam_size, new_distances_i, perm.data()); + +#undef HANDLE_APPROX + + for (int j = 0; j < new_beam_size; j++) { + int js = perm[j] / K; + int ls = perm[j] % K; + if (m > 0) { + memcpy(new_codes_i, codes_i + js * m, sizeof(*codes) * m); + } + new_codes_i[m] = ls; + new_codes_i += m + 1; + } + } +} + +/******************************************************************** + * Multiple encoding steps + ********************************************************************/ + +namespace rq_encode_steps { + +void refine_beam_mp( + const ResidualQuantizer& rq, + size_t n, + size_t beam_size, + const float* x, + int out_beam_size, + int32_t* out_codes, + float* out_residuals, + float* out_distances, + RefineBeamMemoryPool& pool) { + int cur_beam_size = beam_size; + + double t0 = getmillisecs(); + + // find the max_beam_size + int max_beam_size = 0; + { + int tmp_beam_size = cur_beam_size; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); + tmp_beam_size = new_beam_size; + + if (max_beam_size < new_beam_size) { + max_beam_size = new_beam_size; + } + } + } + + // preallocate buffers + pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); + pool.new_residuals.resize(n * max_beam_size * rq.d); + + pool.codes.resize(n * max_beam_size * (rq.M + 1)); + pool.distances.resize(n * max_beam_size); + pool.residuals.resize(n * rq.d * max_beam_size); + + for (size_t i = 0; i < n * rq.d * beam_size; i++) { + pool.residuals[i] = x[i]; + } + + // set up pointers to buffers + int32_t* __restrict codes_ptr = pool.codes.data(); + float* __restrict residuals_ptr = pool.residuals.data(); + + int32_t* __restrict new_codes_ptr = 
pool.new_codes.data(); + float* __restrict new_residuals_ptr = pool.new_residuals.data(); + + // index + std::unique_ptr assign_index; + if (rq.assign_index_factory) { + assign_index.reset((*rq.assign_index_factory)(rq.d)); + } else { + assign_index.reset(new IndexFlatL2(rq.d)); + } + + // main loop + size_t codes_size = 0; + size_t distances_size = 0; + size_t residuals_size = 0; + + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + + const float* __restrict codebooks_m = + rq.codebooks.data() + rq.codebook_offsets[m] * rq.d; + + const int new_beam_size = std::min(cur_beam_size * K, out_beam_size); + + codes_size = n * new_beam_size * (m + 1); + residuals_size = n * new_beam_size * rq.d; + distances_size = n * new_beam_size; + + beam_search_encode_step( + rq.d, + K, + codebooks_m, + n, + cur_beam_size, + residuals_ptr, + m, + codes_ptr, + new_beam_size, + new_codes_ptr, + new_residuals_ptr, + pool.distances.data(), + assign_index.get(), + rq.approx_topk_mode); + + assign_index->reset(); + + std::swap(codes_ptr, new_codes_ptr); + std::swap(residuals_ptr, new_residuals_ptr); + + cur_beam_size = new_beam_size; + + if (rq.verbose) { + float sum_distances = 0; + for (int j = 0; j < distances_size; j++) { + sum_distances += pool.distances[j]; + } + + printf("[%.3f s] encode stage %d, %d bits, " + "total error %g, beam_size %d\n", + (getmillisecs() - t0) / 1000, + m, + int(rq.nbits[m]), + sum_distances, + cur_beam_size); + } + } + + if (out_codes) { + memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); + } + if (out_residuals) { + memcpy(out_residuals, + residuals_ptr, + residuals_size * sizeof(*residuals_ptr)); + } + if (out_distances) { + memcpy(out_distances, + pool.distances.data(), + distances_size * sizeof(pool.distances[0])); + } +} + +void refine_beam_LUT_mp( + const ResidualQuantizer& rq, + size_t n, + const float* query_norms, // size n + const float* query_cp, // + int out_beam_size, + int32_t* out_codes, + float* out_distances, + RefineBeamLUTMemoryPool& pool) { + int beam_size = 1; + + double t0 = getmillisecs(); + + // find the max_beam_size + int max_beam_size = 0; + { + int tmp_beam_size = beam_size; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + int new_beam_size = std::min(tmp_beam_size * K, out_beam_size); + tmp_beam_size = new_beam_size; + + if (max_beam_size < new_beam_size) { + max_beam_size = new_beam_size; + } + } + } + + // preallocate buffers + pool.new_codes.resize(n * max_beam_size * (rq.M + 1)); + pool.new_distances.resize(n * max_beam_size); + + pool.codes.resize(n * max_beam_size * (rq.M + 1)); + pool.distances.resize(n * max_beam_size); + + for (size_t i = 0; i < n; i++) { + pool.distances[i] = query_norms[i]; + } + + // set up pointers to buffers + int32_t* __restrict new_codes_ptr = pool.new_codes.data(); + float* __restrict new_distances_ptr = pool.new_distances.data(); + + int32_t* __restrict codes_ptr = pool.codes.data(); + float* __restrict distances_ptr = pool.distances.data(); + + // main loop + size_t codes_size = 0; + size_t distances_size = 0; + size_t cross_ofs = 0; + for (int m = 0; m < rq.M; m++) { + int K = 1 << rq.nbits[m]; + + // it is guaranteed that (new_beam_size <= max_beam_size) + int new_beam_size = std::min(beam_size * K, out_beam_size); + + codes_size = n * new_beam_size * (m + 1); + distances_size = n * new_beam_size; + FAISS_THROW_IF_NOT( + cross_ofs + rq.codebook_offsets[m] * K <= + rq.codebook_cross_products.size()); + beam_search_encode_step_tab( + K, + n, + beam_size, + 
rq.codebook_cross_products.data() + cross_ofs, + K, + rq.codebook_offsets.data(), + query_cp + rq.codebook_offsets[m], + rq.total_codebook_size, + rq.cent_norms.data() + rq.codebook_offsets[m], + m, + codes_ptr, + distances_ptr, + new_beam_size, + new_codes_ptr, + new_distances_ptr, + rq.approx_topk_mode); + cross_ofs += rq.codebook_offsets[m] * K; + std::swap(codes_ptr, new_codes_ptr); + std::swap(distances_ptr, new_distances_ptr); + + beam_size = new_beam_size; + + if (rq.verbose) { + float sum_distances = 0; + for (int j = 0; j < distances_size; j++) { + sum_distances += distances_ptr[j]; + } + printf("[%.3f s] encode stage %d, %d bits, " + "total error %g, beam_size %d\n", + (getmillisecs() - t0) / 1000, + m, + int(rq.nbits[m]), + sum_distances, + beam_size); + } + } + if (out_codes) { + memcpy(out_codes, codes_ptr, codes_size * sizeof(*codes_ptr)); + } + if (out_distances) { + memcpy(out_distances, + distances_ptr, + distances_size * sizeof(*distances_ptr)); + } +} + +// this is for use_beam_LUT == 0 +void compute_codes_add_centroids_mp_lut0( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT0MemoryPool& pool) { + pool.codes.resize(rq.max_beam_size * rq.M * n); + pool.distances.resize(rq.max_beam_size * n); + + pool.residuals.resize(rq.max_beam_size * n * rq.d); + + refine_beam_mp( + rq, + n, + 1, + x, + rq.max_beam_size, + pool.codes.data(), + pool.residuals.data(), + pool.distances.data(), + pool.refine_beam_pool); + + if (rq.search_type == ResidualQuantizer::ST_norm_float || + rq.search_type == ResidualQuantizer::ST_norm_qint8 || + rq.search_type == ResidualQuantizer::ST_norm_qint4) { + pool.norms.resize(n); + // recover the norms of reconstruction as + // || original_vector - residual ||^2 + for (size_t i = 0; i < n; i++) { + pool.norms[i] = fvec_L2sqr( + x + i * rq.d, + pool.residuals.data() + i * rq.max_beam_size * rq.d, + rq.d); + } + } + + // pack only the first code of the beam + // (hence the ld_codes=M * max_beam_size) + rq.pack_codes( + n, + pool.codes.data(), + codes_out, + rq.M * rq.max_beam_size, + (pool.norms.size() > 0) ? 
pool.norms.data() : nullptr, + centroids); +} + +// use_beam_LUT == 1 +void compute_codes_add_centroids_mp_lut1( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT1MemoryPool& pool) { + // + pool.codes.resize(rq.max_beam_size * rq.M * n); + pool.distances.resize(rq.max_beam_size * n); + + FAISS_THROW_IF_NOT_MSG( + rq.M == 1 || rq.codebook_cross_products.size() > 0, + "call compute_codebook_tables first"); + + pool.query_norms.resize(n); + fvec_norms_L2sqr(pool.query_norms.data(), x, rq.d, n); + + pool.query_cp.resize(n * rq.total_codebook_size); + { + FINTEGER ti = rq.total_codebook_size, di = rq.d, ni = n; + float zero = 0, one = 1; + sgemm_("Transposed", + "Not transposed", + &ti, + &ni, + &di, + &one, + rq.codebooks.data(), + &di, + x, + &di, + &zero, + pool.query_cp.data(), + &ti); + } + + refine_beam_LUT_mp( + rq, + n, + pool.query_norms.data(), + pool.query_cp.data(), + rq.max_beam_size, + pool.codes.data(), + pool.distances.data(), + pool.refine_beam_lut_pool); + + // pack only the first code of the beam + // (hence the ld_codes=M * max_beam_size) + rq.pack_codes( + n, + pool.codes.data(), + codes_out, + rq.M * rq.max_beam_size, + nullptr, + centroids); +} + +} // namespace rq_encode_steps + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.h b/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.h new file mode 100644 index 000000000..70ce4b847 --- /dev/null +++ b/thirdparty/faiss/faiss/impl/residual_quantizer_encode_steps.h @@ -0,0 +1,176 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace faiss { + +/******************************************************************** + * Single step of encoding + ********************************************************************/ + +/** Encode a residual by sampling from a centroid table. + * + * This is a single encoding step of the residual quantizer. + * It allows low-level access to the encoding function, exposed mainly for unit + * tests.
+ * + * @param n number of vectors to handle + * @param residuals vectors to encode, size (n, beam_size, d) + * @param cent centroids, size (K, d) + * @param beam_size input beam size + * @param m size of the codes for the previous encoding steps + * @param codes code array for the previous steps of the beam (n, + * beam_size, m) + * @param new_beam_size output beam size (should be <= K * beam_size) + * @param new_codes output codes, size (n, new_beam_size, m + 1) + * @param new_residuals output residuals, size (n, new_beam_size, d) + * @param new_distances output distances, size (n, new_beam_size) + * @param assign_index if non-NULL, will be used to perform assignment + */ +void beam_search_encode_step( + size_t d, + size_t K, + const float* cent, + size_t n, + size_t beam_size, + const float* residuals, + size_t m, + const int32_t* codes, + size_t new_beam_size, + int32_t* new_codes, + float* new_residuals, + float* new_distances, + Index* assign_index = nullptr, + ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); + +/** Encode a set of vectors using their dot products with the codebooks + * + * @param K number of vectors in the codebook + * @param n nb of vectors to encode + * @param beam_size input beam size + * @param codebook_cross_norms inner product of this codebook with the m + * previously encoded codebooks + * @param codebook_offsets offsets into codebook_cross_norms for each + * previous codebook + * @param query_cp dot products of query vectors with ??? + * @param cent_norms_i norms of centroids + */ +void beam_search_encode_step_tab( + size_t K, + size_t n, + size_t beam_size, // input sizes + const float* codebook_cross_norms, // size K * ldc + size_t ldc, // >= K + const uint64_t* codebook_offsets, // m + const float* query_cp, // size n * ldqc + size_t ldqc, // >= K + const float* cent_norms_i, // size K + size_t m, + const int32_t* codes, // n * beam_size * m + const float* distances, // n * beam_size + size_t new_beam_size, + int32_t* new_codes, // n * new_beam_size * (m + 1) + float* new_distances, // n * new_beam_size + ApproxTopK_mode_t approx_topk = ApproxTopK_mode_t::EXACT_TOPK); + +/******************************************************************** + * Multiple encoding steps + * + * The following functions take buffer objects that they use as temp + * memory (allocated within the functions). The buffers are intended + * to be re-used over batches of points to encode.
+ ********************************************************************/ + +struct ResidualQuantizer; + +namespace rq_encode_steps { + +// Preallocated memory chunk for refine_beam_mp() call +struct RefineBeamMemoryPool { + std::vector new_codes; + std::vector new_residuals; + + std::vector residuals; + std::vector codes; + std::vector distances; +}; + +void refine_beam_mp( + const ResidualQuantizer& rq, + size_t n, + size_t beam_size, + const float* x, + int out_beam_size, + int32_t* out_codes, + float* out_residuals, + float* out_distances, + RefineBeamMemoryPool& pool); + +// Preallocated memory chunk for refine_beam_LUT_mp() call +struct RefineBeamLUTMemoryPool { + std::vector new_codes; + std::vector new_distances; + + std::vector codes; + std::vector distances; +}; + +void refine_beam_LUT_mp( + const ResidualQuantizer& rq, + size_t n, + const float* query_norms, // size n + const float* query_cp, // + int out_beam_size, + int32_t* out_codes, + float* out_distances, + RefineBeamLUTMemoryPool& pool); + +// this is for use_beam_LUT == 0 in compute_codes_add_centroids_mp_lut0() call +struct ComputeCodesAddCentroidsLUT0MemoryPool { + std::vector codes; + std::vector norms; + std::vector distances; + std::vector residuals; + RefineBeamMemoryPool refine_beam_pool; +}; + +void compute_codes_add_centroids_mp_lut0( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT0MemoryPool& pool); + +// this is for use_beam_LUT == 1 in compute_codes_add_centroids_mp_lut1() call +struct ComputeCodesAddCentroidsLUT1MemoryPool { + std::vector codes; + std::vector distances; + std::vector query_norms; + std::vector query_cp; + std::vector residuals; + RefineBeamLUTMemoryPool refine_beam_lut_pool; +}; + +void compute_codes_add_centroids_mp_lut1( + const ResidualQuantizer& rq, + const float* x, + uint8_t* codes_out, + size_t n, + const float* centroids, + ComputeCodesAddCentroidsLUT1MemoryPool& pool); + +} // namespace rq_encode_steps + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/impl/simd_result_handlers.h b/thirdparty/faiss/faiss/impl/simd_result_handlers.h index 6a26dbd00..0c034b415 100644 --- a/thirdparty/faiss/faiss/impl/simd_result_handlers.h +++ b/thirdparty/faiss/faiss/impl/simd_result_handlers.h @@ -15,10 +15,10 @@ #include #include +#include #include #include #include -#include /** This file contains callbacks for kernels that compute distances. 
* @@ -105,15 +105,17 @@ struct SIMDResultHandler { int64_t i0 = 0; // query origin int64_t j0 = 0; // db origin size_t ntotal; // ignore excess elements after ntotal - const BitsetView bitset; + const IDSelector* sel; /// these fields are used mainly for the IVF variants (with_id_map=true) const TI* id_map; // map offset in invlist to vector id const int* q_map; // map q to global query const uint16_t* dbias; // table of biases to add to each query - explicit SIMDResultHandler(size_t ntotal, const BitsetView b = nullptr) - : ntotal(ntotal), bitset(b), id_map(nullptr), q_map(nullptr), dbias(nullptr) {} + explicit SIMDResultHandler(size_t ntotal, const IDSelector* sel = nullptr) + : ntotal(ntotal), id_map(nullptr), q_map(nullptr), dbias(nullptr) { + this->sel = sel; + } void set_block_origin(size_t i0, size_t j0) { this->i0 = i0; @@ -209,8 +211,8 @@ struct SingleResultHandler : SIMDResultHandler { }; std::vector results; - SingleResultHandler(size_t nq, size_t ntotal, const BitsetView b = nullptr) - : SIMDResultHandler(ntotal, b), results(nq) { + SingleResultHandler(size_t nq, size_t ntotal, const IDSelector* sel = nullptr) + : SIMDResultHandler(ntotal, sel), results(nq) { for (int i = 0; i < nq; i++) { Result res = {C::neutral(), -1}; results[i] = res; @@ -234,18 +236,34 @@ struct SingleResultHandler : SIMDResultHandler { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - auto real_idx = this->adjust_id(b, j); - lt_mask -= 1 << j; - if (this->bitset.empty() || !this->bitset.test(real_idx)) { + if (this->sel != nullptr) { + // todo aguzhva: additional cost for adjust_id + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + if (C::cmp(res.val, dis)) { + res.val = dis; + res.id = real_idx; + } + } + } + } + else { + // todo aguzhva: compute adjust_id only whenever is needed + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; T dis = d32tab[j]; if (C::cmp(res.val, dis)) { res.val = dis; - res.id = real_idx; + res.id = this->adjust_id(b, j); } - } + } } } @@ -284,8 +302,8 @@ struct HeapHandler : SIMDResultHandler { TI* heap_ids_tab, size_t k, size_t ntotal, - const BitsetView b = nullptr) - : SIMDResultHandler(ntotal, b), + const IDSelector* sel = nullptr) + : SIMDResultHandler(ntotal, sel), nq(nq), heap_dis_tab(heap_dis_tab), heap_ids_tab(heap_ids_tab), @@ -321,16 +339,37 @@ struct HeapHandler : SIMDResultHandler { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - auto real_idx = this->adjust_id(b, j); - lt_mask -= 1 << j; - if (this->bitset.empty() || !this->bitset.test(real_idx)) { + if (this->sel != nullptr) { + // todo aguzhva: additional cost for adjust_id + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + if (C::cmp(heap_dis[0], dis)) { + // todo aguzhva: add heap_replace_top + // todo aguzhva: faiss does not have it? 
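// -------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] The handlers in this file now
// filter candidates through a faiss::IDSelector instead of a BitsetView: a
// candidate id is kept iff sel->is_member(id) returns true. A minimal
// selector backed by a plain bitmap could look like the struct below; it is
// shown only to illustrate the interface these handlers rely on (faiss 1.7.4
// also ships ready-made selectors such as IDSelectorRange and
// IDSelectorBitmap in faiss/impl/IDSelector.h). To use it, pass its address
// as the trailing constructor argument of SingleResultHandler, HeapHandler,
// ReservoirHandler or RangeSearchResultHandler.
struct ExampleBitmapSelector : faiss::IDSelector {
    const uint8_t* bits; // one bit per id: bit (id & 7) of byte (id >> 3)
    size_t n;            // number of ids covered by the bitmap
    ExampleBitmapSelector(const uint8_t* bits_in, size_t n_in)
            : bits(bits_in), n(n_in) {}
    bool is_member(faiss::idx_t id) const override {
        return id >= 0 && size_t(id) < n && ((bits[id >> 3] >> (id & 7)) & 1);
    }
};
// -------------------------------------------------------------------------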
+ heap_pop(k, heap_dis, heap_ids); + heap_push(k, heap_dis, heap_ids, dis, real_idx); + } + } + } + } + else { + // todo aguzhva: compute adjust_id only whenever is needed + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; T dis = d32tab[j]; if (C::cmp(heap_dis[0], dis)) { + // todo aguzhva: add heap_replace_top + // todo aguzhva: faiss does not have it? + int64_t idx = this->adjust_id(b, j); heap_pop(k, heap_dis, heap_ids); - heap_push(k, heap_dis, heap_ids, dis, real_idx); + heap_push(k, heap_dis, heap_ids, dis, idx); } } } @@ -455,8 +494,8 @@ struct ReservoirHandler : SIMDResultHandler { uint64_t times[4]; - ReservoirHandler(size_t nq, size_t ntotal, size_t n, size_t capacity_in, const BitsetView b = nullptr) - : SIMDResultHandler(ntotal, b), + ReservoirHandler(size_t nq, size_t ntotal, size_t n, size_t capacity_in, const IDSelector* sel = nullptr) + : SIMDResultHandler(ntotal, sel), capacity((capacity_in + 15) & ~15), all_ids(nq * capacity), all_vals(nq * capacity) { @@ -490,16 +529,30 @@ struct ReservoirHandler : SIMDResultHandler { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - auto real_idx = this->adjust_id(b, j); - lt_mask -= 1 << j; - if (this->bitset.empty() || !this->bitset.test(real_idx)) { + if (this->sel != nullptr) { + // todo aguzhva: additional cost for adjust_id + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + res.add(dis, real_idx); + } + } + } + else { + // todo aguzhva: compute adjust_id only whenever is needed + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; T dis = d32tab[j]; - res.add(dis, real_idx); + res.add(dis, this->adjust_id(b, j)); } } + times[1] += get_cy() - t1; } @@ -569,8 +622,8 @@ struct RangeSearchResultHandler : SIMDResultHandler { RangeSearchResult* res, float radius, size_t ntotal, - const BitsetView b = nullptr) - : SIMDResultHandler(ntotal, b), + const IDSelector* sel = nullptr) + : SIMDResultHandler(ntotal, sel), pres(res), radius(radius), normalizers(nullptr) { @@ -596,12 +649,34 @@ struct RangeSearchResultHandler : SIMDResultHandler { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - auto real_idx = this->adjust_id(b, j); - lt_mask -= 1 << j; - if (this->bitset.empty() || !this->bitset.test(real_idx)) { + // + if (this->sel != nullptr) { + // todo aguzhva: additional cost for adjust_id + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + uint16_t dis = d32tab[j]; + float real_dis = dis; + if (normalizers) { + real_dis = (1.0 / normalizers[2 * q]) * real_dis + + normalizers[2 * q + 1]; + } + if (C::cmp(radius, real_dis)) { + ++in_range_num; + qres.add(real_dis, real_idx); + } + } + } + } + else { + // todo aguzhva: compute adjust_id only whenever is needed + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; uint16_t dis = d32tab[j]; float real_dis = dis; if (normalizers) { @@ -610,6 +685,7 @@ struct RangeSearchResultHandler : SIMDResultHandler { } if (C::cmp(radius, real_dis)) { ++in_range_num; + auto real_idx = this->adjust_id(b, j); qres.add(real_dis, real_idx); } } diff 
--git a/thirdparty/faiss/faiss/index_factory.cpp b/thirdparty/faiss/faiss/index_factory.cpp index 4de92bf8b..9f24217a4 100644 --- a/thirdparty/faiss/faiss/index_factory.cpp +++ b/thirdparty/faiss/faiss/index_factory.cpp @@ -26,10 +26,12 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -42,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -134,11 +137,11 @@ int mres_to_int(const std::ssub_match& mr, int deflt = -1, int begin = 0) { return std::stoi(mr.str().substr(begin)); } -std::map sq_types = { - {"SQ8", QuantizerType::QT_8bit}, - {"SQ4", QuantizerType::QT_4bit}, - {"SQ6", QuantizerType::QT_6bit}, - {"SQfp16", QuantizerType::QT_fp16}, +std::map sq_types = { + {"SQ8", ScalarQuantizer::QT_8bit}, + {"SQ4", ScalarQuantizer::QT_4bit}, + {"SQ6", ScalarQuantizer::QT_6bit}, + {"SQfp16", ScalarQuantizer::QT_fp16}, }; const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16)"; @@ -149,11 +152,15 @@ std::map aq_search_type = { {"_Nqint4", AdditiveQuantizer::ST_norm_qint4}, {"_Ncqint8", AdditiveQuantizer::ST_norm_cqint8}, {"_Ncqint4", AdditiveQuantizer::ST_norm_cqint4}, + {"_Nlsq2x4", AdditiveQuantizer::ST_norm_lsq2x4}, + {"_Nrq2x4", AdditiveQuantizer::ST_norm_rq2x4}, }; const std::string aq_def_pattern = "[0-9]+x[0-9]+(_[0-9]+x[0-9]+)*"; const std::string aq_norm_pattern = - "(|_Nnone|_Nfloat|_Nqint8|_Nqint4|_Ncqint8|_Ncqint4)"; + "(|_Nnone|_Nfloat|_Nqint8|_Nqint4|_Ncqint8|_Ncqint4|_Nlsq2x4|_Nrq2x4)"; + +const std::string paq_def_pattern = "([0-9]+)x([0-9]+)x([0-9]+)"; AdditiveQuantizer::Search_type_t aq_parse_search_type( std::string stok, @@ -169,7 +176,7 @@ AdditiveQuantizer::Search_type_t aq_parse_search_type( std::vector aq_parse_nbits(std::string stok) { std::vector nbits; std::smatch sm; - while (std::regex_search(stok, sm, std::regex("([0-9]+)x([0-9]+)"))) { + while (std::regex_search(stok, sm, std::regex("[^q]([0-9]+)x([0-9]+)"))) { int M = std::stoi(sm[1].str()); int nbit = std::stoi(sm[2].str()); nbits.resize(nbits.size() + M, nbit); @@ -341,6 +348,53 @@ IndexIVF* parse_IndexIVF( } return index_ivf; } + if (match("(PRQ|PLSQ)" + paq_def_pattern + aq_norm_pattern)) { + int nsplits = mres_to_int(sm[2]); + int Msub = mres_to_int(sm[3]); + int nbit = mres_to_int(sm[4]); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), mt); + IndexIVF* index_ivf; + if (sm[1].str() == "PRQ") { + index_ivf = new IndexIVFProductResidualQuantizer( + get_q(), d, nlist, nsplits, Msub, nbit, mt, st); + } else { + index_ivf = new IndexIVFProductLocalSearchQuantizer( + get_q(), d, nlist, nsplits, Msub, nbit, mt, st); + } + return index_ivf; + } + if (match("(RQ|LSQ)([0-9]+)x4fs(r?)(_[0-9]+)?" + aq_norm_pattern)) { + int M = std::stoi(sm[2].str()); + int bbs = mres_to_int(sm[4], 32, 1); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), mt); + IndexIVFAdditiveQuantizerFastScan* index_ivf; + if (sm[1].str() == "RQ") { + index_ivf = new IndexIVFResidualQuantizerFastScan( + get_q(), d, nlist, M, 4, mt, st, bbs); + } else { + index_ivf = new IndexIVFLocalSearchQuantizerFastScan( + get_q(), d, nlist, M, 4, mt, st, bbs); + } + index_ivf->by_residual = (sm[3].str() == "r"); + return index_ivf; + } + if (match("(PRQ|PLSQ)([0-9]+)x([0-9]+)x4fs(r?)(_[0-9]+)?" 
+ + aq_norm_pattern)) { + int nsplits = std::stoi(sm[2].str()); + int Msub = std::stoi(sm[3].str()); + int bbs = mres_to_int(sm[5], 32, 1); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), mt); + IndexIVFAdditiveQuantizerFastScan* index_ivf; + if (sm[1].str() == "PRQ") { + index_ivf = new IndexIVFProductResidualQuantizerFastScan( + get_q(), d, nlist, nsplits, Msub, 4, mt, st, bbs); + } else { + index_ivf = new IndexIVFProductLocalSearchQuantizerFastScan( + get_q(), d, nlist, nsplits, Msub, 4, mt, st, bbs); + } + index_ivf->by_residual = (sm[4].str() == "r"); + return index_ivf; + } if (match("(ITQ|PCA|PCAR)([0-9]+)?,SH([-0-9.e]+)?([gcm])?")) { int outdim = mres_to_int(sm[2], d); // is also the number of bits std::unique_ptr vt; @@ -388,11 +442,13 @@ IndexHNSW* parse_IndexHNSW( if (match("Flat|")) { return new IndexHNSWFlat(d, hnsw_M, mt); } - if (match("PQ([0-9]+)(np)?")) { + + if (match("PQ([0-9]+)(x[0-9]+)?(np)?")) { int M = std::stoi(sm[1].str()); - IndexHNSWPQ* ipq = new IndexHNSWPQ(d, M, hnsw_M); + int nbit = mres_to_int(sm[2], 8, 1); + IndexHNSWPQ* ipq = new IndexHNSWPQ(d, M, hnsw_M, nbit); dynamic_cast(ipq->storage)->do_polysemous_training = - sm[2].str() != "np"; + sm[3].str() != "np"; return ipq; } if (match(sq_pattern)) { @@ -421,6 +477,38 @@ IndexHNSW* parse_IndexHNSW( return nullptr; } +/*************************************************************** + * Parse IndexNSG + */ + +IndexNSG* parse_IndexNSG( + const std::string code_string, + int d, + MetricType mt, + int nsg_R) { + std::smatch sm; + auto match = [&sm, &code_string](const std::string& pattern) { + return re_match(code_string, pattern, sm); + }; + + if (match("Flat|")) { + return new IndexNSGFlat(d, nsg_R, mt); + } + if (match("PQ([0-9]+)(x[0-9]+)?(np)?")) { + int M = std::stoi(sm[1].str()); + int nbit = mres_to_int(sm[2], 8, 1); + IndexNSGPQ* ipq = new IndexNSGPQ(d, M, nsg_R, nbit); + dynamic_cast(ipq->storage)->do_polysemous_training = + sm[3].str() != "np"; + return ipq; + } + if (match(sq_pattern)) { + return new IndexNSGSQ(d, sq_types[sm[1].str()], nsg_R, mt); + } + + return nullptr; +} + /*************************************************************** * Parse basic indexes */ @@ -454,11 +542,6 @@ Index* parse_other_indexes( return new IndexLattice(d, M, nbit, r2); } - // IndexNSGFlat - if (match("NSG([0-9]+)(,Flat)?")) { - return new IndexNSGFlat(d, std::stoi(sm[1].str()), metric); - } - // IndexScalarQuantizer if (match(sq_pattern)) { return new IndexScalarQuantizer(d, sq_types[description], metric); @@ -505,6 +588,60 @@ Index* parse_other_indexes( return new IndexLocalSearchQuantizer(d, M, nbit, metric, st); } + // IndexProductResidualQuantizer + if (match("PRQ" + paq_def_pattern + aq_norm_pattern)) { + int nsplits = mres_to_int(sm[1]); + int Msub = mres_to_int(sm[2]); + int nbit = mres_to_int(sm[3]); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), metric); + return new IndexProductResidualQuantizer( + d, nsplits, Msub, nbit, metric, st); + } + + // IndexProductLocalSearchQuantizer + if (match("PLSQ" + paq_def_pattern + aq_norm_pattern)) { + int nsplits = mres_to_int(sm[1]); + int Msub = mres_to_int(sm[2]); + int nbit = mres_to_int(sm[3]); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), metric); + return new IndexProductLocalSearchQuantizer( + d, nsplits, Msub, nbit, metric, st); + } + + // IndexAdditiveQuantizerFastScan + // RQ{M}x4fs_{bbs}_{search_type} + pattern = "(LSQ|RQ)([0-9]+)x4fs(_[0-9]+)?" 
+ aq_norm_pattern; + if (match(pattern)) { + int M = std::stoi(sm[2].str()); + int bbs = mres_to_int(sm[3], 32, 1); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), metric); + + if (sm[1].str() == "RQ") { + return new IndexResidualQuantizerFastScan(d, M, 4, metric, st, bbs); + } else if (sm[1].str() == "LSQ") { + return new IndexLocalSearchQuantizerFastScan( + d, M, 4, metric, st, bbs); + } + } + + // IndexProductAdditiveQuantizerFastScan + // PRQ{nsplits}x{Msub}x4fs_{bbs}_{search_type} + pattern = "(PLSQ|PRQ)([0-9]+)x([0-9]+)x4fs(_[0-9]+)?" + aq_norm_pattern; + if (match(pattern)) { + int nsplits = std::stoi(sm[2].str()); + int Msub = std::stoi(sm[3].str()); + int bbs = mres_to_int(sm[4], 32, 1); + auto st = aq_parse_search_type(sm[sm.size() - 1].str(), metric); + + if (sm[1].str() == "PRQ") { + return new IndexProductResidualQuantizerFastScan( + d, nsplits, Msub, 4, metric, st, bbs); + } else if (sm[1].str() == "PLSQ") { + return new IndexProductLocalSearchQuantizerFastScan( + d, nsplits, Msub, 4, metric, st, bbs); + } + } + return nullptr; } @@ -531,19 +668,19 @@ std::unique_ptr index_factory_sub( re_match(description, "(.+),Refine\\((.+)\\)", sm)) { std::unique_ptr filter_index = index_factory_sub(d, sm[1].str(), metric); - std::unique_ptr refine_index; + IndexRefine* index_rf = nullptr; if (sm.size() == 3) { // Refine - refine_index = index_factory_sub(d, sm[2].str(), metric); + std::unique_ptr refine_index = + index_factory_sub(d, sm[2].str(), metric); + index_rf = new IndexRefine( + filter_index.release(), refine_index.release()); + index_rf->own_refine_index = true; } else { // RFlat - refine_index.reset(new IndexFlat(d, metric)); + index_rf = new IndexRefineFlat(filter_index.release(), nullptr); } - IndexRefine* index_rf = - new IndexRefine(filter_index.get(), refine_index.get()); + FAISS_ASSERT(index_rf != nullptr); index_rf->own_fields = true; - filter_index.release(); - refine_index.release(); - index_rf->own_refine_index = true; return std::unique_ptr(index_rf); } @@ -604,6 +741,14 @@ std::unique_ptr index_factory_sub( // IndexIDMap -- it turns out is was used both as a prefix and a suffix, so // support both + if (re_match(description, "(.+),IDMap2", sm) || + re_match(description, "IDMap2,(.+)", sm)) { + IndexIDMap2* idmap2 = new IndexIDMap2( + index_factory_sub(d, sm[1].str(), metric).release()); + idmap2->own_fields = true; + return std::unique_ptr(idmap2); + } + if (re_match(description, "(.+),IDMap", sm) || re_match(description, "IDMap,(.+)", sm)) { IndexIDMap* idmap = new IndexIDMap( @@ -642,6 +787,53 @@ std::unique_ptr index_factory_sub( return std::unique_ptr(index); } + // NSG variants (it was unclear in the old version that the separator was a + // "," so we support both "_" and ",") + if (re_match(description, "NSG([0-9]*)([,_].*)?", sm)) { + int nsg_R = mres_to_int(sm[1], 32); + // We also accept empty code string (synonym of Flat) + std::string code_string = + sm[2].length() > 0 ? 
sm[2].str().substr(1) : ""; + if (verbose) { + printf("parsing NSG string %s code_string=%s nsg_R=%d\n", + description.c_str(), + code_string.c_str(), + nsg_R); + } + + IndexNSG* index = parse_IndexNSG(code_string, d, metric, nsg_R); + FAISS_THROW_IF_NOT_FMT( + index, + "could not parse NSG code description %s in %s", + code_string.c_str(), + description.c_str()); + return std::unique_ptr(index); + } + + // IndexRowwiseMinMax, fp32 version + if (description.compare(0, 7, "MinMax,") == 0) { + size_t comma = description.find(","); + std::string sub_index_string = description.substr(comma + 1); + auto sub_index = index_factory_sub(d, sub_index_string, metric); + + auto index = new IndexRowwiseMinMax(sub_index.release()); + index->own_fields = true; + + return std::unique_ptr(index); + } + + // IndexRowwiseMinMax, fp16 version + if (description.compare(0, 11, "MinMaxFP16,") == 0) { + size_t comma = description.find(","); + std::string sub_index_string = description.substr(comma + 1); + auto sub_index = index_factory_sub(d, sub_index_string, metric); + + auto index = new IndexRowwiseMinMaxFP16(sub_index.release()); + index->own_fields = true; + + return std::unique_ptr(index); + } + // IndexIVF { size_t nlist; diff --git a/thirdparty/faiss/faiss/index_io.h b/thirdparty/faiss/faiss/index_io.h index 712c90470..634b3066b 100644 --- a/thirdparty/faiss/faiss/index_io.h +++ b/thirdparty/faiss/faiss/index_io.h @@ -52,6 +52,8 @@ const int IO_FLAG_READ_ONLY = 2; const int IO_FLAG_ONDISK_SAME_DIR = 4; // don't load IVF data to RAM, only list sizes const int IO_FLAG_SKIP_IVF_DATA = 8; +// don't initialize precomputed table after loading +const int IO_FLAG_SKIP_PRECOMPUTE_TABLE = 16; // load index data with vectors' norms const int IO_FLAG_WITH_NORM = 1 << 8; // try to memmap data (useful to load an ArrayInvertedLists as an @@ -67,7 +69,10 @@ IndexBinary* read_index_binary(FILE* f, int io_flags = 0); IndexBinary* read_index_binary(IOReader* reader, int io_flags = 0); void write_VectorTransform(const VectorTransform* vt, const char* fname); +void write_VectorTransform(const VectorTransform* vt, IOWriter* f); + VectorTransform* read_VectorTransform(const char* fname); +VectorTransform* read_VectorTransform(IOReader* f); ProductQuantizer* read_ProductQuantizer(const char* fname); ProductQuantizer* read_ProductQuantizer(IOReader* reader); diff --git a/thirdparty/faiss/faiss/invlists/BlockInvertedLists.cpp b/thirdparty/faiss/faiss/invlists/BlockInvertedLists.cpp index d590ea378..a148e0e01 100644 --- a/thirdparty/faiss/faiss/invlists/BlockInvertedLists.cpp +++ b/thirdparty/faiss/faiss/invlists/BlockInvertedLists.cpp @@ -7,6 +7,7 @@ #include +#include #include #include @@ -25,10 +26,17 @@ BlockInvertedLists::BlockInvertedLists( codes.resize(nlist); } +BlockInvertedLists::BlockInvertedLists(size_t nlist, const CodePacker* packer) + : InvertedLists(nlist, InvertedLists::INVALID_CODE_SIZE), + n_per_block(packer->nvec), + block_size(packer->block_size), + packer(packer) { + ids.resize(nlist); + codes.resize(nlist); +} + BlockInvertedLists::BlockInvertedLists() - : InvertedLists(0, InvertedLists::INVALID_CODE_SIZE), - n_per_block(0), - block_size(0) {} + : InvertedLists(0, InvertedLists::INVALID_CODE_SIZE) {} size_t BlockInvertedLists::add_entries( size_t list_no, @@ -36,19 +44,26 @@ size_t BlockInvertedLists::add_entries( const idx_t* ids_in, const uint8_t* code, const float* code_norm) { - if (n_entry == 0) + if (n_entry == 0) { return 0; + } FAISS_THROW_IF_NOT(list_no < nlist); size_t o = ids[list_no].size(); - 
FAISS_THROW_IF_NOT( - o == 0); // not clear how we should handle subsequent adds ids[list_no].resize(o + n_entry); memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); - - // copy whole blocks - size_t n_block = (n_entry + n_per_block - 1) / n_per_block; + size_t n_block = (o + n_entry + n_per_block - 1) / n_per_block; codes[list_no].resize(n_block * block_size); - memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + if (o % block_size == 0) { + // copy whole blocks + memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + } else { + FAISS_THROW_IF_NOT_MSG(packer, "missing code packer"); + std::vector buffer(packer->code_size); + for (size_t i = 0; i < n_entry; i++) { + packer->unpack_1(code, i, buffer.data()); + packer->pack_1(buffer.data(), i + o, codes[list_no].data()); + } + } return o; } @@ -62,7 +77,7 @@ const uint8_t* BlockInvertedLists::get_codes(size_t list_no) const { return codes[list_no].get(); } -const InvertedLists::idx_t* BlockInvertedLists::get_ids(size_t list_no) const { +const idx_t* BlockInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); } @@ -96,7 +111,9 @@ void BlockInvertedLists::update_entries( */ } -BlockInvertedLists::~BlockInvertedLists() {} +BlockInvertedLists::~BlockInvertedLists() { + delete packer; +} /************************************************** * IO hook implementation diff --git a/thirdparty/faiss/faiss/invlists/BlockInvertedLists.h b/thirdparty/faiss/faiss/invlists/BlockInvertedLists.h index 66a277dfc..3744ddf82 100644 --- a/thirdparty/faiss/faiss/invlists/BlockInvertedLists.h +++ b/thirdparty/faiss/faiss/invlists/BlockInvertedLists.h @@ -14,6 +14,8 @@ namespace faiss { +struct CodePacker; + /** Inverted Lists that are organized by blocks. * * Different from the regular inverted lists, the codes are organized by blocks @@ -28,13 +30,17 @@ namespace faiss { * data. 
*/ struct BlockInvertedLists : InvertedLists { - size_t n_per_block; // nb of vectors stored per block - size_t block_size; // nb bytes per block + size_t n_per_block = 0; // nb of vectors stored per block + size_t block_size = 0; // nb bytes per block + + // required to interpret the content of the blocks (owned by this) + const CodePacker* packer = nullptr; std::vector> codes; std::vector> ids; BlockInvertedLists(size_t nlist, size_t vec_per_block, size_t block_size); + BlockInvertedLists(size_t nlist, const CodePacker* packer); BlockInvertedLists(); diff --git a/thirdparty/faiss/faiss/invlists/DirectMap.cpp b/thirdparty/faiss/faiss/invlists/DirectMap.cpp index 82912a555..e99127abf 100644 --- a/thirdparty/faiss/faiss/invlists/DirectMap.cpp +++ b/thirdparty/faiss/faiss/invlists/DirectMap.cpp @@ -79,7 +79,7 @@ void DirectMap::clear() { hashtable.clear(); } -DirectMap::idx_t DirectMap::get(idx_t key) const { +idx_t DirectMap::get(idx_t key) const { if (type == Array) { FAISS_THROW_IF_NOT_MSG(key >= 0 && key < array.size(), "invalid key"); idx_t lo = array[key]; diff --git a/thirdparty/faiss/faiss/invlists/DirectMap.h b/thirdparty/faiss/faiss/invlists/DirectMap.h index 4aba137ac..2d63d98f3 100644 --- a/thirdparty/faiss/faiss/invlists/DirectMap.h +++ b/thirdparty/faiss/faiss/invlists/DirectMap.h @@ -10,6 +10,7 @@ #ifndef FAISS_DIRECT_MAP_H #define FAISS_DIRECT_MAP_H +#include #include #include #include "faiss/impl/FaissAssert.h" @@ -35,8 +36,6 @@ inline uint64_t lo_offset(uint64_t lo) { * Direct map: a way to map back from ids to inverted lists */ struct DirectMap { - typedef Index::idx_t idx_t; - enum Type { NoMap = 0, // default Array = 1, // sequential ids (only for add, no add_with_ids) @@ -125,8 +124,6 @@ struct DirectMap { /// Thread-safe way of updating the direct_map struct DirectMapAdd { - typedef Index::idx_t idx_t; - using Type = DirectMap::Type; DirectMap& direct_map; diff --git a/thirdparty/faiss/faiss/invlists/InvertedLists.cpp b/thirdparty/faiss/faiss/invlists/InvertedLists.cpp index e3cd101a0..149c9c4a9 100644 --- a/thirdparty/faiss/faiss/invlists/InvertedLists.cpp +++ b/thirdparty/faiss/faiss/invlists/InvertedLists.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -67,19 +68,30 @@ PageLockMemory::PageLockMemory(PageLockMemory &&other) { namespace faiss { +InvertedListsIterator::~InvertedListsIterator() {} + /***************************************** * InvertedLists implementation ******************************************/ InvertedLists::InvertedLists(size_t nlist, size_t code_size) - : nlist(nlist), code_size(code_size) {} + : nlist(nlist), code_size(code_size), use_iterator(false) {} InvertedLists::~InvertedLists() {} -InvertedLists::idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) - const { +bool InvertedLists::is_empty(size_t list_no) const { + return use_iterator + ? 
!std::unique_ptr(get_iterator(list_no)) + ->is_available() + : list_size(list_no) == 0; +} + +idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) const { assert(offset < list_size(list_no)); - return get_ids(list_no)[offset]; + const idx_t* ids = get_ids(list_no); + idx_t id = ids[offset]; + release_ids(list_no, ids); + return id; } void InvertedLists::release_codes(size_t, const uint8_t*) const {} @@ -112,7 +124,7 @@ const uint8_t* InvertedLists::get_codes(size_t list_no, size_t offset) return get_codes(list_no) + offset * code_size; } -const Index::idx_t* InvertedLists::get_ids(size_t list_no, size_t offset) const { +const idx_t* InvertedLists::get_ids(size_t list_no, size_t offset) const { return get_ids(list_no); } @@ -155,17 +167,17 @@ void InvertedLists::reset() { } } +InvertedListsIterator* InvertedLists::get_iterator(size_t /*list_no*/) const { + FAISS_THROW_MSG("get_iterator is not supported"); +} + void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) { #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { size_t list_size = oivf->list_size(i); ScopedIds ids(oivf, i); if (add_id == 0) { - add_entries( - i, - list_size, - ids.get(), - ScopedCodes(oivf, i).get()); + add_entries(i, list_size, ids.get(), ScopedCodes(oivf, i).get()); } else { std::vector new_ids(list_size); @@ -173,15 +185,104 @@ void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) { new_ids[j] = ids[j] + add_id; } add_entries( - i, - list_size, - new_ids.data(), - ScopedCodes(oivf, i).get()); + i, list_size, new_ids.data(), ScopedCodes(oivf, i).get()); } oivf->resize(i, 0); } } +size_t InvertedLists::copy_subset_to( + InvertedLists& oivf, + subset_type_t subset_type, + idx_t a1, + idx_t a2) const { + FAISS_THROW_IF_NOT(nlist == oivf.nlist); + FAISS_THROW_IF_NOT(code_size == oivf.code_size); + FAISS_THROW_IF_NOT_FMT( + subset_type >= 0 && subset_type <= 4, + "subset type %d not implemented", + subset_type); + size_t accu_n = 0; + size_t accu_a1 = 0; + size_t accu_a2 = 0; + size_t n_added = 0; + + size_t ntotal = 0; + if (subset_type == 2) { + ntotal = compute_ntotal(); + } + + for (idx_t list_no = 0; list_no < nlist; list_no++) { + size_t n = list_size(list_no); + ScopedIds ids_in(this, list_no); + + if (subset_type == SUBSET_TYPE_ID_RANGE) { + for (idx_t i = 0; i < n; i++) { + idx_t id = ids_in[i]; + if (a1 <= id && id < a2) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + n_added++; + } + } + } else if (subset_type == SUBSET_TYPE_ID_MOD) { + for (idx_t i = 0; i < n; i++) { + idx_t id = ids_in[i]; + if (id % a1 == a2) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + n_added++; + } + } + } else if (subset_type == SUBSET_TYPE_ELEMENT_RANGE) { + // see what is allocated to a1 and to a2 + size_t next_accu_n = accu_n + n; + size_t next_accu_a1 = next_accu_n * a1 / ntotal; + size_t i1 = next_accu_a1 - accu_a1; + size_t next_accu_a2 = next_accu_n * a2 / ntotal; + size_t i2 = next_accu_a2 - accu_a2; + + for (idx_t i = i1; i < i2; i++) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + } + + n_added += i2 - i1; + accu_a1 = next_accu_a1; + accu_a2 = next_accu_a2; + } else if (subset_type == SUBSET_TYPE_INVLIST_FRACTION) { + size_t i1 = n * a2 / a1; + size_t i2 = n * (a2 + 1) / a1; + + for (idx_t i = i1; i < i2; i++) { + oivf.add_entry( + list_no, + get_single_id(list_no, i), + ScopedCodes(this, list_no, i).get()); + } + + 
n_added += i2 - i1; + } else if (subset_type == SUBSET_TYPE_INVLIST) { + if (list_no >= a1 && list_no < a2) { + oivf.add_entries( + list_no, + n, + ScopedIds(this, list_no).get(), + ScopedCodes(this, list_no).get()); + n_added += n; + } + } + accu_n += n; + } + return n_added; +} + double InvertedLists::imbalance_factor() const { std::vector hist(nlist); @@ -204,7 +305,9 @@ void InvertedLists::print_stats() const { } for (size_t i = 0; i < sizes.size(); i++) { if (sizes[i]) { - printf("list size in < %d: %d instances\n", 1 << i, sizes[i]); + printf("list size in < %zu: %d instances\n", + static_cast(1) << i, + sizes[i]); } } } @@ -224,8 +327,8 @@ size_t InvertedLists::compute_ntotal() const { ArrayInvertedLists::ArrayInvertedLists( size_t nlist, size_t code_size, - bool with_norm) - : with_norm(with_norm), InvertedLists(nlist, code_size) { + bool _with_norm) + : with_norm(_with_norm), InvertedLists(nlist, code_size) { ids.resize(nlist); codes.resize(nlist); if (with_norm) { @@ -237,7 +340,7 @@ size_t ArrayInvertedLists::add_entries( size_t list_no, size_t n_entry, const idx_t* ids_in, - const uint8_t* code_in, + const uint8_t* code, const float* code_norms_in) { if (n_entry == 0) return 0; @@ -246,7 +349,7 @@ size_t ArrayInvertedLists::add_entries( ids[list_no].resize(o + n_entry); memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); codes[list_no].resize((o + n_entry) * code_size); - memcpy(&codes[list_no][o * code_size], code_in, code_size * n_entry); + memcpy(&codes[list_no][o * code_size], code, code_size * n_entry); if (with_norm) { code_norms[list_no].resize(o + n_entry); memcpy(&code_norms[list_no][o], code_norms_in, sizeof(float) * n_entry); @@ -264,7 +367,7 @@ const uint8_t* ArrayInvertedLists::get_codes(size_t list_no) const { return codes[list_no].data(); } -const InvertedLists::idx_t* ArrayInvertedLists::get_ids(size_t list_no) const { +const idx_t* ArrayInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); } @@ -341,6 +444,21 @@ InvertedLists* ArrayInvertedLists::to_readonly() { return new ReadOnlyArrayInvertedLists(*this); } +void ArrayInvertedLists::permute_invlists(const idx_t* map) { + // todo aguzhva: permute norms as well? 
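+    // note: map maps new_id to old_id (see the declaration in InvertedLists.h),
+    // so the list placed at position i below is the one previously stored at map[i]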
+ std::vector> new_codes(nlist); + std::vector> new_ids(nlist); + + for (size_t i = 0; i < nlist; i++) { + size_t o = map[i]; + FAISS_THROW_IF_NOT(o < nlist); + std::swap(new_codes[i], codes[o]); + std::swap(new_ids[i], ids[o]); + } + std::swap(codes, new_codes); + std::swap(ids, new_ids); +} + ArrayInvertedLists::~ArrayInvertedLists() {} ConcurrentArrayInvertedLists::ConcurrentArrayInvertedLists( @@ -401,7 +519,7 @@ const uint8_t* ConcurrentArrayInvertedLists::get_codes(size_t list_no) const { FAISS_THROW_MSG("not implemented get_codes for non-continuous storage"); } -const InvertedLists::idx_t* ConcurrentArrayInvertedLists::get_ids(size_t list_no) const { +const idx_t* ConcurrentArrayInvertedLists::get_ids(size_t list_no) const { FAISS_THROW_MSG("not implemented get_ids for non-continuous storage"); } size_t ConcurrentArrayInvertedLists::add_entries( @@ -616,7 +734,7 @@ const uint8_t* ConcurrentArrayInvertedLists::get_codes( return reinterpret_cast(&(codes[list_no][segment_no][segment_off])); } -const InvertedLists::idx_t* ConcurrentArrayInvertedLists::get_ids( +const idx_t* ConcurrentArrayInvertedLists::get_ids( size_t list_no, size_t offset) const { assert(list_no < nlist); @@ -633,7 +751,7 @@ const uint8_t* ConcurrentArrayInvertedLists::get_single_code( return get_codes(list_no, offset); } -InvertedLists::idx_t ConcurrentArrayInvertedLists::get_single_id(size_t list_no, size_t offset) +idx_t ConcurrentArrayInvertedLists::get_single_id(size_t list_no, size_t offset) const { auto *pItem = get_ids(list_no, offset); return *pItem; @@ -820,7 +938,7 @@ const uint8_t* ReadOnlyArrayInvertedLists::get_codes( #endif } -const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_ids( +const idx_t* ReadOnlyArrayInvertedLists::get_ids( size_t list_no) const { FAISS_ASSERT(list_no < nlist && valid); #ifdef USE_GPU @@ -831,7 +949,7 @@ const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_ids( #endif } -const InvertedLists::idx_t* ReadOnlyArrayInvertedLists::get_all_ids() const { +const idx_t* ReadOnlyArrayInvertedLists::get_all_ids() const { FAISS_ASSERT(valid); #ifdef USE_GPU return (idx_t*)(pin_readonly_ids->data); @@ -868,7 +986,7 @@ size_t ReadOnlyInvertedLists::add_entries( size_t, const idx_t*, const uint8_t*, - const float* code_norm) { + const float*) { FAISS_THROW_MSG("not implemented"); } @@ -945,7 +1063,7 @@ void HStackInvertedLists::release_codes(size_t, const uint8_t* codes) const { delete[] codes; } -const Index::idx_t* HStackInvertedLists::get_ids(size_t list_no) const { +const idx_t* HStackInvertedLists::get_ids(size_t list_no) const { idx_t *ids = new idx_t[list_size(list_no)], *c = ids; for (int i = 0; i < ils.size(); i++) { @@ -959,8 +1077,7 @@ const Index::idx_t* HStackInvertedLists::get_ids(size_t list_no) const { return ids; } -Index::idx_t HStackInvertedLists::get_single_id(size_t list_no, size_t offset) - const { +idx_t HStackInvertedLists::get_single_id(size_t list_no, size_t offset) const { for (int i = 0; i < ils.size(); i++) { const InvertedLists* il = ils[i]; size_t sz = il->list_size(list_no); @@ -990,8 +1107,6 @@ void HStackInvertedLists::prefetch_lists(const idx_t* list_nos, int nlist) namespace { -using idx_t = InvertedLists::idx_t; - idx_t translate_list_no(const SliceInvertedLists* sil, idx_t list_no) { FAISS_THROW_IF_NOT(list_no >= 0 && list_no < sil->nlist); return list_no + sil->i0; @@ -1027,12 +1142,11 @@ void SliceInvertedLists::release_codes(size_t list_no, const uint8_t* codes) return il->release_codes(translate_list_no(this, list_no), codes); } -const 
Index::idx_t* SliceInvertedLists::get_ids(size_t list_no) const { +const idx_t* SliceInvertedLists::get_ids(size_t list_no) const { return il->get_ids(translate_list_no(this, list_no)); } -Index::idx_t SliceInvertedLists::get_single_id(size_t list_no, size_t offset) - const { +idx_t SliceInvertedLists::get_single_id(size_t list_no, size_t offset) const { return il->get_single_id(translate_list_no(this, list_no), offset); } @@ -1058,8 +1172,6 @@ void SliceInvertedLists::prefetch_lists(const idx_t* list_nos, int nlist) namespace { -using idx_t = InvertedLists::idx_t; - // find the invlist this number belongs to int translate_list_no(const VStackInvertedLists* vil, idx_t list_no) { FAISS_THROW_IF_NOT(list_no >= 0 && list_no < vil->nlist); @@ -1127,14 +1239,13 @@ void VStackInvertedLists::release_codes(size_t list_no, const uint8_t* codes) return ils[i]->release_codes(list_no, codes); } -const Index::idx_t* VStackInvertedLists::get_ids(size_t list_no) const { +const idx_t* VStackInvertedLists::get_ids(size_t list_no) const { int i = translate_list_no(this, list_no); list_no -= cumsz[i]; return ils[i]->get_ids(list_no); } -Index::idx_t VStackInvertedLists::get_single_id(size_t list_no, size_t offset) - const { +idx_t VStackInvertedLists::get_single_id(size_t list_no, size_t offset) const { int i = translate_list_no(this, list_no); list_no -= cumsz[i]; return ils[i]->get_single_id(list_no, offset); diff --git a/thirdparty/faiss/faiss/invlists/InvertedLists.h b/thirdparty/faiss/faiss/invlists/InvertedLists.h index 54b98d628..f87e02ebc 100644 --- a/thirdparty/faiss/faiss/invlists/InvertedLists.h +++ b/thirdparty/faiss/faiss/invlists/InvertedLists.h @@ -15,12 +15,14 @@ * the interface. */ -#include -#include #include -#include +#include #include -#include +#include +#include +#include + +#include namespace faiss { @@ -48,6 +50,13 @@ using PageLockMemoryPtr = std::shared_ptr; namespace faiss { +struct InvertedListsIterator { + virtual ~InvertedListsIterator(); + virtual bool is_available() const = 0; + virtual void next() = 0; + virtual std::pair get_id_and_codes() = 0; +}; + /** Table of inverted lists * multithreading rules: * - concurrent read accesses are allowed @@ -56,13 +65,14 @@ namespace faiss { * are allowed */ struct InvertedLists { - typedef Index::idx_t idx_t; - size_t nlist; ///< number of possible key values size_t code_size; ///< code size per vector in bytes + bool use_iterator; InvertedLists(size_t nlist, size_t code_size); + virtual ~InvertedLists(); + /// used for BlockInvertedLists, where the codes are packed into groups /// and the individual code size is meaningless static const size_t INVALID_CODE_SIZE = static_cast(-1); @@ -70,6 +80,9 @@ struct InvertedLists { /************************* * Read only functions */ + // check if the list is empty + bool is_empty(size_t list_no) const; + /// get the size of a list virtual size_t list_size(size_t list_no) const = 0; @@ -82,6 +95,9 @@ struct InvertedLists { // get the segment minimal number of a list (continuous storage can be regarded as 1-segment storage) virtual size_t get_segment_offset(size_t list_no, size_t segment_no) const; + /// get iterable for lists that use_iterator + virtual InvertedListsIterator* get_iterator(size_t list_no) const; + /** get the codes for an inverted list * must be released by release_codes * @@ -141,8 +157,8 @@ struct InvertedLists { /// add one entry to an inverted list virtual size_t add_entry( - size_t list_no, - idx_t theid, + size_t list_no, + idx_t theid, const uint8_t* code, const float* 
code_norm = nullptr); @@ -174,10 +190,36 @@ struct InvertedLists { virtual bool is_readonly() const; + /************************* + * high level functions */ + /// move all entries from oivf (empty on output) void merge_from(InvertedLists* oivf, size_t add_id); - virtual ~InvertedLists(); + // how to copy a subset of elements from the inverted lists + // This depends on two integers, a1 and a2. + enum subset_type_t : int { + // depends on IDs + SUBSET_TYPE_ID_RANGE = 0, // copies ids in [a1, a2) + SUBSET_TYPE_ID_MOD = 1, // copies ids if id % a1 == a2 + // depends on order within invlists + SUBSET_TYPE_ELEMENT_RANGE = + 2, // copies fractions of invlists so that a1 elements are left + // before and a2 after + SUBSET_TYPE_INVLIST_FRACTION = + 3, // take fraction a2 out of a1 from each invlist, 0 <= a2 < a1 + // copy only inverted lists a1:a2 + SUBSET_TYPE_INVLIST = 4 + }; + + /** copy a subset of the entries index to the other index + * @return number of entries copied + */ + size_t copy_subset_to( + InvertedLists& other, + subset_type_t subset_type, + idx_t a1, + idx_t a2) const; /************************* * statistics */ @@ -324,6 +366,9 @@ struct ArrayInvertedLists : InvertedLists { InvertedLists* to_readonly() override; + /// permute the inverted lists, map maps new_id to old_id + void permute_invlists(const idx_t* map); + ~ArrayInvertedLists() override; }; @@ -334,11 +379,11 @@ struct ConcurrentArrayInvertedLists : InvertedLists { Segment(size_t segment_size, size_t code_size) : segment_size_(segment_size), code_size_(code_size) { data_.reserve(segment_size_ * code_size_); } - T& operator[](Index::idx_t idx) { + T& operator[](idx_t idx) { assert(idx < segment_size_); return data_[idx * code_size_]; } - const T& operator[](Index::idx_t idx) const { + const T& operator[](idx_t idx) const { assert(idx < segment_size_); return data_[idx * code_size_]; } diff --git a/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.cpp b/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.cpp index 58fe0cc09..cae56ed32 100644 --- a/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.cpp +++ b/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.cpp @@ -154,7 +154,7 @@ struct OnDiskInvertedLists::OngoingPrefetch { const OnDiskInvertedLists* od = pf->od; od->locks->lock_1(list_no); size_t n = od->list_size(list_no); - const Index::idx_t* idx = od->get_ids(list_no); + const idx_t* idx = od->get_ids(list_no); const uint8_t* codes = od->get_codes(list_no); int cs = 0; for (size_t i = 0; i < n; i++) { @@ -278,6 +278,8 @@ void OnDiskInvertedLists::do_mmap() { uint8_t* ptro = (uint8_t*)mmap(nullptr, totsize, prot, MAP_SHARED, fileno(f), 0); + fclose(f); + FAISS_THROW_IF_NOT_FMT( ptro != MAP_FAILED, "could not mmap %s: %s", @@ -285,7 +287,6 @@ void OnDiskInvertedLists::do_mmap() { strerror(errno)); madvise(ptro, totsize, MADV_RANDOM); ptr = ptro; - fclose(f); } void OnDiskInvertedLists::update_totsize(size_t new_size) { @@ -389,7 +390,7 @@ const uint8_t* OnDiskInvertedLists::get_codes(size_t list_no) const { return ptr + lists[list_no].offset; } -const Index::idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const { +const idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const { if (lists[list_no].offset == INVALID_OFFSET) { return nullptr; } @@ -525,7 +526,7 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { it++; } - size_t inf = 1ULL << 60; + size_t inf = ((size_t)1) << 60; size_t end_prev = inf; if (it != slots.begin()) { @@ -534,7 +535,7 @@ void OnDiskInvertedLists::free_slot(size_t 
offset, size_t capacity) { end_prev = prev->offset + prev->capacity; } - size_t begin_next = 1LL << 60; + size_t begin_next = ((size_t)1) << 60; if (it != slots.end()) { begin_next = it->offset; } @@ -783,7 +784,7 @@ InvertedLists* OnDiskInvertedListsIOHook::read_ArrayInvertedLists( OnDiskInvertedLists::List& l = ails->lists[i]; l.size = l.capacity = sizes[i]; l.offset = o; - o += l.size * (sizeof(OnDiskInvertedLists::idx_t) + ails->code_size); + o += l.size * (sizeof(idx_t) + ails->code_size); } // resume normal reading of file fseek(fdesc, o, SEEK_SET); diff --git a/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.h b/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.h index fa42d4e33..bf27c0b79 100644 --- a/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.h +++ b/thirdparty/faiss/faiss/invlists/OnDiskInvertedLists.h @@ -31,7 +31,7 @@ struct OnDiskOneList { /** On-disk storage of inverted lists. * - * The data is stored in a mmapped chunk of memory (base ptointer ptr, + * The data is stored in a mmapped chunk of memory (base pointer ptr, * size totsize). Each list is a range of memory that contains (object * List) that contains: * diff --git a/thirdparty/faiss/faiss/utils/AlignedTable.h b/thirdparty/faiss/faiss/utils/AlignedTable.h index 9ef4cdf2f..05adb1c0d 100644 --- a/thirdparty/faiss/faiss/utils/AlignedTable.h +++ b/thirdparty/faiss/faiss/utils/AlignedTable.h @@ -98,7 +98,9 @@ struct AlignedTableTightAlloc { AlignedTableTightAlloc& operator=( const AlignedTableTightAlloc& other) { resize(other.numel); - memcpy(ptr, other.ptr, sizeof(T) * numel); + if (numel > 0) { + memcpy(ptr, other.ptr, sizeof(T) * numel); + } return *this; } diff --git a/thirdparty/faiss/faiss/utils/Heap.cpp b/thirdparty/faiss/faiss/utils/Heap.cpp index 75c9bdaea..1907a0b1c 100644 --- a/thirdparty/faiss/faiss/utils/Heap.cpp +++ b/thirdparty/faiss/faiss/utils/Heap.cpp @@ -9,6 +9,7 @@ /* Function for soft heap */ +#include #include namespace faiss { @@ -32,7 +33,7 @@ void HeapArray::addn(size_t nj, const T* vin, TI j0, size_t i0, int64_t ni) { if (ni == -1) ni = nh; assert(i0 >= 0 && i0 + ni <= nh); -#pragma omp parallel for +#pragma omp parallel for if (ni * nj > 100000) for (int64_t i = i0; i < i0 + ni; i++) { T* __restrict simi = get_val(i); TI* __restrict idxi = get_ids(i); @@ -62,7 +63,7 @@ void HeapArray::addn_with_ids( if (ni == -1) ni = nh; assert(i0 >= 0 && i0 + ni <= nh); -#pragma omp parallel for +#pragma omp parallel for if (ni * nj > 100000) for (int64_t i = i0; i < i0 + ni; i++) { T* __restrict simi = get_val(i); TI* __restrict idxi = get_ids(i); @@ -78,9 +79,38 @@ void HeapArray::addn_with_ids( } } +template +void HeapArray::addn_query_subset_with_ids( + size_t nsubset, + const TI* subset, + size_t nj, + const T* vin, + const TI* id_in, + int64_t id_stride) { + FAISS_THROW_IF_NOT_MSG(id_in, "anonymous ids not supported"); + if (id_stride < 0) { + id_stride = nj; + } +#pragma omp parallel for if (nsubset * nj > 100000) + for (int64_t si = 0; si < nsubset; si++) { + TI i = subset[si]; + T* __restrict simi = get_val(i); + TI* __restrict idxi = get_ids(i); + const T* ip_line = vin + si * nj; + const TI* id_line = id_in + si * id_stride; + + for (size_t j = 0; j < nj; j++) { + T ip = ip_line[j]; + if (C::cmp(simi[0], ip)) { + heap_replace_top(k, simi, idxi, ip, id_line[j]); + } + } + } +} + template void HeapArray::per_line_extrema(T* out_val, TI* out_ids) const { -#pragma omp parallel for +#pragma omp parallel for if (nh * k > 100000) for (int64_t j = 0; j < nh; j++) { int64_t imin = -1; typename 
C::T xval = C::Crev::neutral(); @@ -106,7 +136,115 @@ void HeapArray::per_line_extrema(T* out_val, TI* out_ids) const { template struct HeapArray>; template struct HeapArray>; +template struct HeapArray>; +template struct HeapArray>; template struct HeapArray>; template struct HeapArray>; +/********************************************************** + * merge knn search results + **********************************************************/ + +/** Merge result tables from several shards. The per-shard results are assumed + * to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k + * element heap because we want the best (ie. lowest for L2) result to be on + * top, not the worst. + * + * @param all_distances size (nshard, n, k) + * @param all_labels size (nshard, n, k) + * @param distances output distances, size (n, k) + * @param labels output labels, size (n, k) + */ +template +void merge_knn_results( + size_t n, + size_t k, + typename C::TI nshard, + const typename C::T* all_distances, + const idx_t* all_labels, + typename C::T* distances, + idx_t* labels) { + using distance_t = typename C::T; + if (k == 0) { + return; + } + long stride = n * k; +#pragma omp parallel if (n * nshard * k > 100000) + { + std::vector buf(2 * nshard); + // index in each shard's result list + int* pointer = buf.data(); + // (shard_ids, heap_vals): heap that indexes + // shard -> current distance for this shard + int* shard_ids = pointer + nshard; + std::vector buf2(nshard); + distance_t* heap_vals = buf2.data(); +#pragma omp for + for (long i = 0; i < n; i++) { + // the heap maps values to the shard where they are + // produced. + const distance_t* D_in = all_distances + i * k; + const idx_t* I_in = all_labels + i * k; + int heap_size = 0; + + // push the first element of each shard (if not -1) + for (long s = 0; s < nshard; s++) { + pointer[s] = 0; + if (I_in[stride * s] >= 0) { + heap_push( + ++heap_size, + heap_vals, + shard_ids, + D_in[stride * s], + s); + } + } + + distance_t* D = distances + i * k; + idx_t* I = labels + i * k; + + int j; + for (j = 0; j < k && heap_size > 0; j++) { + // pop element from best shard + int s = shard_ids[0]; // top of heap + int& p = pointer[s]; + D[j] = heap_vals[0]; + I[j] = I_in[stride * s + p]; + + // pop from shard, advance pointer for this shard + heap_pop(heap_size--, heap_vals, shard_ids); + p++; + if (p < k && I_in[stride * s + p] >= 0) { + heap_push( + ++heap_size, + heap_vals, + shard_ids, + D_in[stride * s + p], + s); + } + } + for (; j < k; j++) { + I[j] = -1; + D[j] = C::Crev::neutral(); + } + } + } +} + +// explicit instanciations +#define INSTANTIATE(C, distance_t) \ + template void merge_knn_results>( \ + size_t, \ + size_t, \ + int, \ + const distance_t*, \ + const int64_t*, \ + distance_t*, \ + int64_t*); + +INSTANTIATE(CMin, float); +INSTANTIATE(CMax, float); +INSTANTIATE(CMin, int32_t); +INSTANTIATE(CMax, int32_t); + } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/Heap.h b/thirdparty/faiss/faiss/utils/Heap.h index fc3d0ad51..cdb714f4d 100644 --- a/thirdparty/faiss/faiss/utils/Heap.h +++ b/thirdparty/faiss/faiss/utils/Heap.h @@ -47,21 +47,25 @@ inline void heap_pop(size_t k, typename C::T* bh_val, typename C::TI* bh_ids) { bh_val--; /* Use 1-based indexing for easier node->child translation */ bh_ids--; typename C::T val = bh_val[k]; + typename C::TI id = bh_ids[k]; size_t i = 1, i1, i2; while (1) { i1 = i << 1; i2 = i1 + 1; if (i1 > k) break; - if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) { - if (C::cmp(val, 
bh_val[i1])) + if ((i2 == k + 1) || + C::cmp2(bh_val[i1], bh_val[i2], bh_ids[i1], bh_ids[i2])) { + if (C::cmp2(val, bh_val[i1], id, bh_ids[i1])) { break; + } bh_val[i] = bh_val[i1]; bh_ids[i] = bh_ids[i1]; i = i1; } else { - if (C::cmp(val, bh_val[i2])) + if (C::cmp2(val, bh_val[i2], id, bh_ids[i2])) { break; + } bh_val[i] = bh_val[i2]; bh_ids[i] = bh_ids[i2]; i = i2; @@ -80,24 +84,28 @@ inline void heap_push( typename C::T* bh_val, typename C::TI* bh_ids, typename C::T val, - typename C::TI ids) { + typename C::TI id) { bh_val--; /* Use 1-based indexing for easier node->child translation */ bh_ids--; size_t i = k, i_father; while (i > 1) { i_father = i >> 1; - if (!C::cmp(val, bh_val[i_father])) /* the heap structure is ok */ + if (!C::cmp2(val, bh_val[i_father], id, bh_ids[i_father])) { + /* the heap structure is ok */ break; + } bh_val[i] = bh_val[i_father]; bh_ids[i] = bh_ids[i_father]; i = i_father; } bh_val[i] = val; - bh_ids[i] = ids; + bh_ids[i] = id; } -/** Replace the top element from the heap defined by bh_val[0..k-1] and - * bh_ids[0..k-1]. +/** + * Replaces the top element from the heap defined by bh_val[0..k-1] and + * bh_ids[0..k-1], and for identical bh_val[] values also sorts by bh_ids[] + * values. */ template inline void heap_replace_top( @@ -105,31 +113,39 @@ inline void heap_replace_top( typename C::T* bh_val, typename C::TI* bh_ids, typename C::T val, - typename C::TI ids) { + typename C::TI id) { bh_val--; /* Use 1-based indexing for easier node->child translation */ bh_ids--; size_t i = 1, i1, i2; while (1) { i1 = i << 1; i2 = i1 + 1; - if (i1 > k) + if (i1 > k) { break; - if (i2 == k + 1 || C::cmp(bh_val[i1], bh_val[i2])) { - if (C::cmp(val, bh_val[i1])) + } + + // Note that C::cmp2() is a bool function answering + // `(a1 > b1) || ((a1 == b1) && (a2 > b2))` for max + // heap and same with the `<` sign for min heap. + if ((i2 == k + 1) || + C::cmp2(bh_val[i1], bh_val[i2], bh_ids[i1], bh_ids[i2])) { + if (C::cmp2(val, bh_val[i1], id, bh_ids[i1])) { break; + } bh_val[i] = bh_val[i1]; bh_ids[i] = bh_ids[i1]; i = i1; } else { - if (C::cmp(val, bh_val[i2])) + if (C::cmp2(val, bh_val[i2], id, bh_ids[i2])) { break; + } bh_val[i] = bh_val[i2]; bh_ids[i] = bh_ids[i2]; i = i2; } } bh_val[i] = val; - bh_ids[i] = ids; + bh_ids[i] = id; } /* Partial instanciation for heaps with TI = int64_t */ @@ -294,7 +310,7 @@ inline void maxheap_addn( * Heap finalization (reorder elements) *******************************************************************/ -/* This function maps a binary heap into an sorted structure. +/* This function maps a binary heap into a sorted structure. 
It returns the number */ template inline size_t heap_reorder( @@ -397,6 +413,19 @@ struct HeapArray { size_t i0 = 0, int64_t ni = -1); + /** same as addn_with_ids, but for just a subset of queries + * + * @param nsubset number of query entries to update + * @param subset indexes of queries to update, in 0..nh-1, size nsubset + */ + void addn_query_subset_with_ids( + size_t nsubset, + const TI* subset, + size_t nj, + const T* vin, + const TI* id_in = nullptr, + int64_t id_stride = 0); + /// reorder all the heaps void reorder(); @@ -415,7 +444,7 @@ typedef HeapArray> int_minheap_array_t; typedef HeapArray> float_maxheap_array_t; typedef HeapArray> int_maxheap_array_t; -// The heap templates are instanciated explicitly in Heap.cpp +// The heap templates are instantiated explicitly in Heap.cpp /********************************************************************* * Indirect heaps: instead of having @@ -476,6 +505,27 @@ inline void indirect_heap_push( bh_ids[i] = id; } +/** Merge result tables from several shards. The per-shard results are assumed + * to be sorted. Note that the C comparator is reversed w.r.t. the usual top-k + * element heap because we want the best (ie. lowest for L2) result to be on + * top, not the worst. Also, it needs to hold an index of a shard id (ie. + * usually int32 is more than enough). + * + * @param all_distances size (nshard, n, k) + * @param all_labels size (nshard, n, k) + * @param distances output distances, size (n, k) + * @param labels output labels, size (n, k) + */ +template +void merge_knn_results( + size_t n, + size_t k, + typename C::TI nshard, + const typename C::T* all_distances, + const idx_t* all_labels, + typename C::T* distances, + idx_t* labels); + } // namespace faiss #endif /* FAISS_Heap_h */ diff --git a/thirdparty/faiss/faiss/utils/approx_topk/approx_topk.h b/thirdparty/faiss/faiss/utils/approx_topk/approx_topk.h new file mode 100644 index 000000000..f5af1ffea --- /dev/null +++ b/thirdparty/faiss/faiss/utils/approx_topk/approx_topk.h @@ -0,0 +1,84 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file contains an implementation of approximate top-k search +// using heap. It was initially created for a beam search. +// +// The core idea is the following. +// Say we need to find beam_size indices with the minimal distance +// values. It is done via heap (priority_queue) using the following +// pseudocode: +// +// def baseline(): +// distances = np.empty([beam_size * n], dtype=float) +// indices = np.empty([beam_size * n], dtype=int) +// +// heap = Heap(max_heap_size=beam_size) +// +// for i in range(0, beam_size * n): +// heap.push(distances[i], indices[i]) +// +// Basically, this is what heap_addn() function from utils/Heap.h does. +// +// The following scheme can be used for approximate beam search. +// Say, we need to find elements with min distance. +// Basically, we split n elements of every beam into NBUCKETS buckets +// and track the index with the minimal distance for every bucket. +// This can be effectively SIMD-ed and significantly lowers the number +// of operations, but yields approximate results for beam_size >= 2. +// +// def approximate_v1(): +// distances = np.empty([beam_size * n], dtype=float) +// indices = np.empty([beam_size * n], dtype=int) +// +// heap = Heap(max_heap_size=beam_size) +// +// for beam in range(0, beam_size): +// # The value of 32 is just an example. 
+// # The value may be varied: the larger the value is, +// # the slower and the more precise vs baseline beam search is +// NBUCKETS = 32 +// +// local_min_distances = [HUGE_VALF] * NBUCKETS +// local_min_indices = [0] * NBUCKETS +// +// for i in range(0, n / NBUCKETS): +// for j in range(0, NBUCKETS): +// idx = beam * n + i * NBUCKETS + j +// if distances[idx] < local_min_distances[j]: +// local_min_distances[i] = distances[idx] +// local_min_indices[i] = indices[idx] +// +// for j in range(0, NBUCKETS): +// heap.push(local_min_distances[j], local_min_indices[j]) +// +// The accuracy can be improved by tracking min-2 elements for every +// bucket. Such a min-2 implementation with NBUCKETS buckets provides +// better accuracy than top-1 implementation with 2 * NBUCKETS buckets. +// Min-3 is also doable. One can use min-N approach, but I'm not sure +// whether min-4 and above are practical, because of the lack of SIMD +// registers (unless AVX-512 version is used). +// +// C++ template for top-N implementation is provided. The code +// assumes that indices[idx] == idx. One can write a code that lifts +// such an assumption easily. +// +// Currently, the code that tracks elements with min distances is implemented +// (Max Heap). Min Heap option can be added easily. + +#pragma once + +#include + +// the list of available modes is in the following file +#include + +#ifdef __AVX2__ +#include +#else +#include +#endif diff --git a/thirdparty/faiss/faiss/utils/approx_topk/avx2-inl.h b/thirdparty/faiss/faiss/utils/approx_topk/avx2-inl.h new file mode 100644 index 000000000..09bae6965 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/approx_topk/avx2-inl.h @@ -0,0 +1,196 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#include +#include + +namespace faiss { + +template +struct HeapWithBuckets { + // this case was not implemented yet. +}; + +template +struct HeapWithBuckets, NBUCKETS, N> { + static constexpr uint32_t NBUCKETS_8 = NBUCKETS / 8; + static_assert( + (NBUCKETS) > 0 && ((NBUCKETS % 8) == 0), + "Number of buckets needs to be 8, 16, 24, ..."); + + static void addn( + // number of elements + const uint32_t n, + // distances. It is assumed to have n elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n) range + int32_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, distances, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // distances. It is assumed to have (n_per_beam * beam_size) + // elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int32_t* const __restrict bh_ids) { + // // Basically, the function runs beam_size iterations. + // // Every iteration NBUCKETS * N elements are added to a regular heap. + // // So, maximum number of added elements is beam_size * NBUCKETS * N. + // // This number is expected to be less or equal than k. 
+ // FAISS_THROW_IF_NOT_FMT( + // beam_size * NBUCKETS * N >= k, + // "Cannot pick %d elements, only %d. " + // "Check the function and template arguments values.", + // k, + // beam_size * NBUCKETS * N); + + using C = CMax; + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + __m256 min_distances_i[NBUCKETS_8][N]; + __m256i min_indices_i[NBUCKETS_8][N]; + + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + for (uint32_t p = 0; p < N; p++) { + min_distances_i[j][p] = + _mm256_set1_ps(std::numeric_limits::max()); + min_indices_i[j][p] = + _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + } + } + + __m256i current_indices = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + __m256i indices_delta = _mm256_set1_epi32(NBUCKETS); + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + const __m256 distances_reg = _mm256_loadu_ps( + distances + j * 8 + ip + n_per_beam * beam_index); + + // loop. Compiler should get rid of unneeded ops + __m256 distance_candidate = distances_reg; + __m256i indices_candidate = current_indices; + + for (uint32_t p = 0; p < N; p++) { + const __m256 comparison = _mm256_cmp_ps( + min_distances_i[j][p], + distance_candidate, + _CMP_LE_OS); + + // // blend seems to be slower that min + // const __m256 min_distances_new = _mm256_blendv_ps( + // distance_candidate, + // min_distances_i[j][p], + // comparison); + const __m256 min_distances_new = _mm256_min_ps( + distance_candidate, min_distances_i[j][p]); + const __m256i min_indices_new = + _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(indices_candidate), + _mm256_castsi256_ps( + min_indices_i[j][p]), + comparison)); + + // // blend seems to be slower that min + // const __m256 max_distances_new = _mm256_blendv_ps( + // min_distances_i[j][p], + // distance_candidate, + // comparison); + const __m256 max_distances_new = _mm256_max_ps( + min_distances_i[j][p], distances_reg); + const __m256i max_indices_new = + _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps( + min_indices_i[j][p]), + _mm256_castsi256_ps(indices_candidate), + comparison)); + + distance_candidate = max_distances_new; + indices_candidate = max_indices_new; + + min_distances_i[j][p] = min_distances_new; + min_indices_i[j][p] = min_indices_new; + } + } + + current_indices = + _mm256_add_epi32(current_indices, indices_delta); + } + + // fix the indices + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + const __m256i offset = + _mm256_set1_epi32(n_per_beam * beam_index + j * 8); + for (uint32_t p = 0; p < N; p++) { + min_indices_i[j][p] = + _mm256_add_epi32(min_indices_i[j][p], offset); + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + int32_t min_indices_scalar[8]; + float min_distances_scalar[8]; + + _mm256_storeu_si256( + (__m256i*)min_indices_scalar, min_indices_i[j][p]); + _mm256_storeu_ps( + min_distances_scalar, min_distances_i[j][p]); + + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + for (size_t j8 = 0; j8 < 8; j8++) { + const auto value = min_distances_scalar[j8]; + const auto index = min_indices_scalar[j8]; + if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + heap_replace_top( + k, bh_val, bh_ids, value, index); + } + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const 
int32_t index = ip + n_per_beam * beam_index; + const float value = distances[index]; + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/approx_topk/generic.h b/thirdparty/faiss/faiss/utils/approx_topk/generic.h new file mode 100644 index 000000000..59a8dc8dc --- /dev/null +++ b/thirdparty/faiss/faiss/utils/approx_topk/generic.h @@ -0,0 +1,138 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace faiss { + +// This is the implementation of the idea and it is very slow, +// because a compiler is unable to vectorize it properly. + +template +struct HeapWithBuckets { + // this case was not implemented yet. +}; + +template +struct HeapWithBuckets, NBUCKETS, N> { + static void addn( + // number of elements + const uint32_t n, + // distances. It is assumed to have n elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n) range + int32_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, distances, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // distances. It is assumed to have (n_per_beam * beam_size) + // elements. + const float* const __restrict distances, + // number of best elements to keep + const uint32_t k, + // output distances + float* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int32_t* const __restrict bh_ids) { + // // Basically, the function runs beam_size iterations. + // // Every iteration NBUCKETS * N elements are added to a regular heap. + // // So, maximum number of added elements is beam_size * NBUCKETS * N. + // // This number is expected to be less or equal than k. + // FAISS_THROW_IF_NOT_FMT( + // beam_size * NBUCKETS * N >= k, + // "Cannot pick %d elements, only %d. 
" + // "Check the function and template arguments values.", + // k, + // beam_size * NBUCKETS * N); + + using C = CMax; + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + float min_distances_i[N][NBUCKETS]; + int min_indices_i[N][NBUCKETS]; + + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS; j++) { + min_distances_i[p][j] = std::numeric_limits::max(); + min_indices_i[p][j] = 0; + } + } + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS; j++) { + const int index = j + ip + n_per_beam * beam_index; + const float distance = distances[index]; + + int index_candidate = index; + float distance_candidate = distance; + + for (uint32_t p = 0; p < N; p++) { + if (distance_candidate < min_distances_i[p][j]) { + std::swap( + distance_candidate, min_distances_i[p][j]); + std::swap(index_candidate, min_indices_i[p][j]); + } + } + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS; j++) { + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + + if (C::cmp2(bh_val[0], + min_distances_i[p][j], + bh_ids[0], + min_indices_i[p][j])) { + heap_replace_top( + k, + bh_val, + bh_ids, + min_distances_i[p][j], + min_indices_i[p][j]); + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const int32_t index = ip + n_per_beam * beam_index; + const float value = distances[index]; + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/approx_topk/mode.h b/thirdparty/faiss/faiss/utils/approx_topk/mode.h new file mode 100644 index 000000000..5701b1c7b --- /dev/null +++ b/thirdparty/faiss/faiss/utils/approx_topk/mode.h @@ -0,0 +1,34 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +/// Represents the mode of use of approximate top-k computations +/// that allows to trade accuracy vs speed. So, every options +/// besides EXACT_TOPK increases the speed. +/// +/// B represents the number of buckets. +/// D is the number of min-k elements to track within every bucket. +/// +/// Default option is EXACT_TOPK. +/// APPROX_TOPK_BUCKETS_B16_D2 is worth starting from, if you'd like +/// to experiment a bit. +/// +/// It seems that only the limited number of combinations are +/// meaningful, because of the limited supply of SIMD registers. +/// Also, certain combinations, such as B32_D1 and B16_D1, were concluded +/// to be not very precise in benchmarks, so ones were not introduced. +/// +/// TODO: Consider d-ary SIMD heap. + +enum ApproxTopK_mode_t : int { + EXACT_TOPK = 0, + APPROX_TOPK_BUCKETS_B32_D2 = 1, + APPROX_TOPK_BUCKETS_B8_D3 = 2, + APPROX_TOPK_BUCKETS_B16_D2 = 3, + APPROX_TOPK_BUCKETS_B8_D2 = 4, +}; diff --git a/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h b/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h new file mode 100644 index 000000000..91c51f2bc --- /dev/null +++ b/thirdparty/faiss/faiss/utils/approx_topk_hamming/approx_topk_hamming.h @@ -0,0 +1,367 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace faiss { + +// HeapWithBucketsForHamming32 uses simd8uint32 under the hood. + +template +struct HeapWithBucketsForHamming32 { + // this case was not implemented yet. +}; + +template +struct HeapWithBucketsForHamming32< + CMax, + NBUCKETS, + N, + HammingComputerT> { + static constexpr uint32_t NBUCKETS_8 = NBUCKETS / 8; + static_assert( + (NBUCKETS) > 0 && ((NBUCKETS % 8) == 0), + "Number of buckets needs to be 8, 16, 24, ..."); + + static void addn( + // number of elements + const uint32_t n, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used with hc + const uint8_t* const __restrict binaryVectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n) range + int64_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used against hc + const uint8_t* const __restrict binary_vectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int64_t* const __restrict bh_ids) { + // + using C = CMax; + + // Hamming code size + const size_t code_size = hc.get_code_size(); + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + simd8uint32 min_distances_i[NBUCKETS_8][N]; + simd8uint32 min_indices_i[NBUCKETS_8][N]; + + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + for (uint32_t p = 0; p < N; p++) { + min_distances_i[j][p] = + simd8uint32(std::numeric_limits::max()); + min_indices_i[j][p] = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); + } + } + + simd8uint32 current_indices(0, 1, 2, 3, 4, 5, 6, 7); + const simd8uint32 indices_delta(NBUCKETS); + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + uint32_t hamming_distances[8]; + for (size_t j8 = 0; j8 < 8; j8++) { + hamming_distances[j8] = hc.compute( + binary_vectors + + (j8 + j * 8 + ip + n_per_beam * beam_index) * + code_size); + } + + // loop. 
Compiler should get rid of unneeded ops + simd8uint32 distance_candidate; + distance_candidate.loadu(hamming_distances); + simd8uint32 indices_candidate = current_indices; + + for (uint32_t p = 0; p < N; p++) { + simd8uint32 min_distances_new; + simd8uint32 min_indices_new; + simd8uint32 max_distances_new; + simd8uint32 max_indices_new; + + faiss::cmplt_min_max_fast( + distance_candidate, + indices_candidate, + min_distances_i[j][p], + min_indices_i[j][p], + min_distances_new, + min_indices_new, + max_distances_new, + max_indices_new); + + distance_candidate = max_distances_new; + indices_candidate = max_indices_new; + + min_distances_i[j][p] = min_distances_new; + min_indices_i[j][p] = min_indices_new; + } + } + + current_indices += indices_delta; + } + + // fix the indices + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + const simd8uint32 offset(n_per_beam * beam_index + j * 8); + for (uint32_t p = 0; p < N; p++) { + min_indices_i[j][p] += offset; + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS_8; j++) { + uint32_t min_indices_scalar[8]; + uint32_t min_distances_scalar[8]; + + min_indices_i[j][p].storeu(min_indices_scalar); + min_distances_i[j][p].storeu(min_distances_scalar); + + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + for (size_t j8 = 0; j8 < 8; j8++) { + const auto value = min_distances_scalar[j8]; + const auto index = min_indices_scalar[j8]; + + if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + heap_replace_top( + k, bh_val, bh_ids, value, index); + } + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const auto index = ip + n_per_beam * beam_index; + const auto value = + hc.compute(binary_vectors + (index)*code_size); + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +// HeapWithBucketsForHamming16 uses simd16uint16 under the hood. +// Less registers needed in total, so higher values of NBUCKETS/N can be used, +// but somewhat slower. +// No more than 32K elements currently, but it can be reorganized a bit +// to be limited to 32K elements per beam. + +template +struct HeapWithBucketsForHamming16 { + // this case was not implemented yet. 
+}; + +template +struct HeapWithBucketsForHamming16< + CMax, + NBUCKETS, + N, + HammingComputerT> { + static constexpr uint32_t NBUCKETS_16 = NBUCKETS / 16; + static_assert( + (NBUCKETS) > 0 && ((NBUCKETS % 16) == 0), + "Number of buckets needs to be 16, 32, 48..."); + + static void addn( + // number of elements + const uint32_t n, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used with hc + const uint8_t* const __restrict binaryVectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n) range + int64_t* const __restrict bh_ids) { + // forward a call to bs_addn with 1 beam + bs_addn(1, n, hc, binaryVectors, k, bh_val, bh_ids); + } + + static void bs_addn( + // beam_size parameter of Beam Search algorithm + const uint32_t beam_size, + // number of elements per beam + const uint32_t n_per_beam, + // Hamming computer + const HammingComputerT& hc, + // n elements that can be used against hc + const uint8_t* const __restrict binary_vectors, + // number of best elements to keep + const uint32_t k, + // output distances + int* const __restrict bh_val, + // output indices, each being within [0, n_per_beam * beam_size) + // range + int64_t* const __restrict bh_ids) { + // + using C = CMax; + + // Hamming code size + const size_t code_size = hc.get_code_size(); + + // main loop + for (uint32_t beam_index = 0; beam_index < beam_size; beam_index++) { + simd16uint16 min_distances_i[NBUCKETS_16][N]; + simd16uint16 min_indices_i[NBUCKETS_16][N]; + + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + for (uint32_t p = 0; p < N; p++) { + min_distances_i[j][p] = + simd16uint16(std::numeric_limits::max()); + min_indices_i[j][p] = simd16uint16( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + 11, + 12, + 13, + 14, + 15); + } + } + + simd16uint16 current_indices( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const simd16uint16 indices_delta((uint16_t)NBUCKETS); + + const uint32_t nb = (n_per_beam / NBUCKETS) * NBUCKETS; + + // put the data into buckets + for (uint32_t ip = 0; ip < nb; ip += NBUCKETS) { + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + uint16_t hamming_distances[16]; + for (size_t j16 = 0; j16 < 16; j16++) { + hamming_distances[j16] = hc.compute( + binary_vectors + + (j16 + j * 16 + ip + n_per_beam * beam_index) * + code_size); + } + + // loop. 
Compiler should get rid of unneeded ops + simd16uint16 distance_candidate; + distance_candidate.loadu(hamming_distances); + simd16uint16 indices_candidate = current_indices; + + for (uint32_t p = 0; p < N; p++) { + simd16uint16 min_distances_new; + simd16uint16 min_indices_new; + simd16uint16 max_distances_new; + simd16uint16 max_indices_new; + + faiss::cmplt_min_max_fast( + distance_candidate, + indices_candidate, + min_distances_i[j][p], + min_indices_i[j][p], + min_distances_new, + min_indices_new, + max_distances_new, + max_indices_new); + + distance_candidate = max_distances_new; + indices_candidate = max_indices_new; + + min_distances_i[j][p] = min_distances_new; + min_indices_i[j][p] = min_indices_new; + } + } + + current_indices += indices_delta; + } + + // fix the indices + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + const simd16uint16 offset( + (uint16_t)(n_per_beam * beam_index + j * 16)); + for (uint32_t p = 0; p < N; p++) { + min_indices_i[j][p] += offset; + } + } + + // merge every bucket into the regular heap + for (uint32_t p = 0; p < N; p++) { + for (uint32_t j = 0; j < NBUCKETS_16; j++) { + uint16_t min_indices_scalar[16]; + uint16_t min_distances_scalar[16]; + + min_indices_i[j][p].storeu(min_indices_scalar); + min_distances_i[j][p].storeu(min_distances_scalar); + + // this exact way is needed to maintain the order as if the + // input elements were pushed to the heap sequentially + for (size_t j16 = 0; j16 < 16; j16++) { + const auto value = min_distances_scalar[j16]; + const auto index = min_indices_scalar[j16]; + + if (C::cmp2(bh_val[0], value, bh_ids[0], index)) { + heap_replace_top( + k, bh_val, bh_ids, value, index); + } + } + } + } + + // process leftovers + for (uint32_t ip = nb; ip < n_per_beam; ip++) { + const auto index = ip + n_per_beam * beam_index; + const auto value = + hc.compute(binary_vectors + (index)*code_size); + + if (C::cmp(bh_val[0], value)) { + heap_replace_top(k, bh_val, bh_ids, value, index); + } + } + } + } +}; + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/binary_distances.cpp b/thirdparty/faiss/faiss/utils/binary_distances.cpp index 767dc95d3..dddedff4a 100644 --- a/thirdparty/faiss/faiss/utils/binary_distances.cpp +++ b/thirdparty/faiss/faiss/utils/binary_distances.cpp @@ -16,14 +16,19 @@ #include +#include #include #include #include #include #include +#include + namespace faiss { +extern uint8_t lookup8bit[256]; + #define fast_loop_imp(fun_u64, fun_u8) \ auto a = reinterpret_cast(data1); \ auto b = reinterpret_cast(data2); \ @@ -83,6 +88,7 @@ namespace faiss { } int popcnt(const uint8_t* data, const size_t code_size) { + // todo aguzhva: improve this code, maybe reuse the code from hamming.h auto data1 = data, data2 = data; // for the macro fast_loop_imp #define fun_u64 accu += popcount64(a[i]) #define fun_u8(i) accu += lookup8bit[a[i]] @@ -97,6 +103,7 @@ int xor_popcnt( const uint8_t* data1, const uint8_t* data2, const size_t code_size) { + // todo aguzhva: improve this code, maybe reuse the code from hamming.h #define fun_u64 accu += popcount64(a[i] ^ b[i]); #define fun_u8(i) accu += lookup8bit[a[i] ^ b[i]]; int accu = 0; @@ -110,6 +117,7 @@ int or_popcnt( const uint8_t* data1, const uint8_t* data2, const size_t code_size) { + // todo aguzhva: improve this code, maybe reuse the code from hamming.h #define fun_u64 accu += popcount64(a[i] | b[i]) #define fun_u8(i) accu += lookup8bit[a[i] | b[i]] int accu = 0; @@ -123,6 +131,7 @@ int and_popcnt( const uint8_t* data1, const uint8_t* data2, const size_t code_size) { + // 
todo aguzhva: improve this code, maybe reuse the code from hamming.h #define fun_u64 accu += popcount64(a[i] & b[i]) #define fun_u8(i) accu += lookup8bit[a[i] & b[i]] int accu = 0; @@ -138,6 +147,7 @@ bool is_subset( const uint8_t* data1, const uint8_t* data2, const size_t code_size) { + // todo aguzhva: improve this code, maybe reuse the code from hamming.h #define fun_u64 \ if ((a[i] & b[i]) != a[i]) \ return false @@ -154,6 +164,7 @@ float bvec_jaccard( const uint8_t* data1, const uint8_t* data2, const size_t code_size) { + // todo aguzhva: improve this code, maybe reuse the code from hamming.h #define fun_u64 \ accu_num += popcount64(a[i] & b[i]); \ accu_den += popcount64(a[i] | b[i]) @@ -179,7 +190,7 @@ void binary_knn_mc( size_t k, float* distances, int64_t* labels, - const BitsetView bitset) { + const IDSelector* sel) { int thread_max_num = omp_get_max_threads(); size_t l3_size = get_l3_size(); @@ -201,7 +212,7 @@ void binary_knn_mc( #pragma omp parallel for for (size_t j = 0; j < n2; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { int thread_no = omp_get_thread_num(); const uint8_t* bs2_ = bs2 + j * bytes_per_code; @@ -262,7 +273,7 @@ void binary_knn_mc( T hc(bs1 + i * bytes_per_code, bytes_per_code); const uint8_t* bs2_ = bs2 + j0 * bytes_per_code; for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { if (hc.compute(bs2_)) { dis[num_i] = 0; lab[num_i] = j; @@ -298,14 +309,14 @@ void binary_knn_mc( size_t ncodes, float* distances, int64_t* labels, - const BitsetView bitset) { + const IDSelector* sel) { switch (metric_type) { case METRIC_Substructure: switch (ncodes) { #define binary_knn_mc_Substructure(ncodes) \ case ncodes: \ binary_knn_mc>( \ - ncodes, a, b, na, nb, k, distances, labels, bitset); \ + ncodes, a, b, na, nb, k, distances, labels, sel); \ break; binary_knn_mc_Substructure(8); binary_knn_mc_Substructure(16); @@ -317,7 +328,7 @@ void binary_knn_mc( #undef binary_knn_mc_Substructure default: binary_knn_mc>( - ncodes, a, b, na, nb, k, distances, labels, bitset); + ncodes, a, b, na, nb, k, distances, labels, sel); break; } break; @@ -327,7 +338,7 @@ void binary_knn_mc( #define binary_knn_mc_Superstructure(ncodes) \ case ncodes: \ binary_knn_mc>( \ - ncodes, a, b, na, nb, k, distances, labels, bitset); \ + ncodes, a, b, na, nb, k, distances, labels, sel); \ break; binary_knn_mc_Superstructure(8); binary_knn_mc_Superstructure(16); @@ -339,7 +350,7 @@ void binary_knn_mc( #undef binary_knn_mc_Superstructure default: binary_knn_mc>( - ncodes, a, b, na, nb, k, distances, labels, bitset); + ncodes, a, b, na, nb, k, distances, labels, sel); break; } break; @@ -356,7 +367,7 @@ void binary_knn_hc( const uint8_t* bs1, const uint8_t* bs2, size_t n2, - const BitsetView bitset) { + const IDSelector* sel) { typedef typename C::T T; size_t k = ha->k; @@ -389,7 +400,7 @@ void binary_knn_hc( #pragma omp parallel for for (size_t j = 0; j < n2; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { int thread_no = omp_get_thread_num(); const uint8_t* bs2_ = bs2 + j * bytes_per_code; @@ -449,7 +460,7 @@ void binary_knn_hc( T* __restrict bh_val_ = ha->val + i * k; int64_t* __restrict bh_ids_ = ha->ids + i * k; for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { dis = hc.compute(bs2_); if (C::cmp(bh_val_[0], dis)) { faiss::heap_replace_top( @@ -471,7 +482,7 @@ 
void binary_knn_hc( const uint8_t* b, size_t nb, size_t ncodes, - const BitsetView bitset) { + const IDSelector* sel) { switch (metric_type) { case METRIC_Jaccard: { { @@ -479,7 +490,7 @@ void binary_knn_hc( #define binary_knn_hc_jaccard(ncodes) \ case ncodes: \ binary_knn_hc( \ - ncodes, ha, a, b, nb, bitset); \ + ncodes, ha, a, b, nb, sel); \ break; binary_knn_hc_jaccard(8); binary_knn_hc_jaccard(16); @@ -491,7 +502,7 @@ void binary_knn_hc( #undef binary_knn_hc_jaccard default: binary_knn_hc( - ncodes, ha, a, b, nb, bitset); + ncodes, ha, a, b, nb, sel); break; } } @@ -504,7 +515,7 @@ void binary_knn_hc( #define binary_knn_hc_hamming(ncodes) \ case ncodes: \ binary_knn_hc( \ - ncodes, ha, a, b, nb, bitset); \ + ncodes, ha, a, b, nb, sel); \ break; binary_knn_hc_hamming(4); binary_knn_hc_hamming(8); @@ -515,7 +526,7 @@ void binary_knn_hc( #undef binary_knn_hc_hamming default: binary_knn_hc( - ncodes, ha, a, b, nb, bitset); + ncodes, ha, a, b, nb, sel); break; } } @@ -534,7 +545,7 @@ template void binary_knn_hc>( const uint8_t* b, size_t nb, size_t ncodes, - const BitsetView bitset); + const IDSelector* sel); template void binary_knn_hc>( MetricType metric_type, @@ -543,7 +554,7 @@ template void binary_knn_hc>( const uint8_t* b, size_t nb, size_t ncodes, - const BitsetView bitset); + const IDSelector* sel); template void binary_range_search( @@ -554,7 +565,7 @@ void binary_range_search( T radius, size_t code_size, RangeSearchResult* res, - const BitsetView bitset = nullptr) { + const IDSelector* sel = nullptr) { #pragma omp parallel { RangeSearchPartialResult pres(res); @@ -563,7 +574,7 @@ void binary_range_search( MetricComputer mc(a + i * code_size, code_size); RangeQueryResult& qres = pres.new_result(i); for (size_t j = 0; j < nb; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { T dis = mc.compute(b + j * code_size); if (C::cmp(dis, radius)) { qres.add(dis, j); @@ -585,7 +596,7 @@ void binary_range_search( T radius, size_t code_size, RangeSearchResult* res, - const BitsetView bitset) { + const IDSelector* sel) { switch (metric_type) { case METRIC_Jaccard: { { @@ -593,7 +604,7 @@ void binary_range_search( #define binary_range_search_jaccard(ncodes) \ case ncodes: \ binary_range_search( \ - a, b, na, nb, radius, code_size, res, bitset); \ + a, b, na, nb, radius, code_size, res, sel); \ break; binary_range_search_jaccard(8); binary_range_search_jaccard(16); @@ -608,7 +619,7 @@ void binary_range_search( C, T, faiss::JaccardComputerDefault>( - a, b, na, nb, radius, code_size, res, bitset); + a, b, na, nb, radius, code_size, res, sel); break; } } @@ -621,7 +632,7 @@ void binary_range_search( #define binary_range_search_hamming(ncodes) \ case ncodes: \ binary_range_search( \ - a, b, na, nb, radius, code_size, res, bitset); \ + a, b, na, nb, radius, code_size, res, sel); \ break; binary_range_search_hamming(4); binary_range_search_hamming(8); @@ -635,7 +646,7 @@ void binary_range_search( C, T, faiss::HammingComputerDefault>( - a, b, na, nb, radius, code_size, res, bitset); + a, b, na, nb, radius, code_size, res, sel); break; } } @@ -657,7 +668,7 @@ template void binary_range_search, int>( int radius, size_t code_size, RangeSearchResult* res, - const BitsetView bitset); + const IDSelector* sel); template void binary_range_search, float>( MetricType metric_type, @@ -668,6 +679,6 @@ template void binary_range_search, float>( float radius, size_t code_size, RangeSearchResult* res, - const BitsetView bitset); + const IDSelector* sel); } // namespace faiss diff 
--git a/thirdparty/faiss/faiss/utils/binary_distances.h b/thirdparty/faiss/faiss/utils/binary_distances.h index 2e629df3d..acb059a43 100644 --- a/thirdparty/faiss/faiss/utils/binary_distances.h +++ b/thirdparty/faiss/faiss/utils/binary_distances.h @@ -18,13 +18,15 @@ #include #include #include -#include #include /* The binary distance type */ typedef float tadis_t; namespace faiss { + +struct IDSelector; + /** * Calculate the number of bit 1 */ @@ -101,7 +103,7 @@ void binary_knn_mc( size_t ncodes, float* distances, int64_t* labels, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); /** Return the k smallest distances for a set of binary query vectors, * using a heap. @@ -120,7 +122,7 @@ void binary_knn_hc( const uint8_t* b, size_t nb, size_t ncodes, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); template void binary_range_search( @@ -132,7 +134,7 @@ void binary_range_search( T radius, size_t code_size, RangeSearchResult* res, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/bit_table.cpp b/thirdparty/faiss/faiss/utils/bit_table.cpp index 8d90a53ca..fb5b4a498 100644 --- a/thirdparty/faiss/faiss/utils/bit_table.cpp +++ b/thirdparty/faiss/faiss/utils/bit_table.cpp @@ -1,5 +1,6 @@ #include namespace faiss { +// todo aguzhva: this is duplicated in utils/hamming_distance/common.h uint8_t lookup8bit[256] = { /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, diff --git a/thirdparty/faiss/faiss/utils/distances.cpp b/thirdparty/faiss/faiss/utils/distances.cpp index 789f07db4..252490598 100644 --- a/thirdparty/faiss/faiss/utils/distances.cpp +++ b/thirdparty/faiss/faiss/utils/distances.cpp @@ -8,19 +8,22 @@ // -*- c++ -*- #include +#include #include #include #include #include #include -#include "simd/hook.h" #include +#include "knowhere/bitsetview_idselector.h" + #include #include #include +#include #include #include @@ -98,7 +101,38 @@ void fvec_renorm_L2(size_t d, size_t nx, float* __restrict x) { namespace { +// Helpers are used in search functions to help specialize various +// performance-related use cases, such as adding some extra +// support for a particular kind of IDSelector classes. This +// may be useful if the lion's share of samples are filtered out. 
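// An illustrative, standalone sketch (hypothetical names, not part of the
// patch) of why the helper structs defined just below exist: passing the
// selector as a concrete template type rather than through the virtual
// IDSelector interface lets the compiler instantiate one scan loop per
// selector kind, so the per-element null check and virtual call disappear
// from the unfiltered path.
#include <cstddef>

namespace sketch {

struct AcceptAll {
    inline bool is_member(size_t) const {
        return true;
    }
};

// One instantiation per concrete Selector type; with AcceptAll the membership
// test folds to a constant and the loop degenerates to a plain dense scan.
template <typename Selector>
size_t count_accepted(size_t ny, const Selector& sel) {
    size_t n = 0;
    for (size_t j = 0; j < ny; j++) {
        if (sel.is_member(j)) {
            n++;
        }
    }
    return n;
}

} // namespace sketch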
+ +struct IDSelectorAll { + inline bool is_member(const size_t idx) const { + return true; + } +}; + +struct IDSelectorHelper { + const IDSelector* sel; + + inline bool is_member(const size_t idx) const { + return sel->is_member(idx); + } +}; + +struct BitsetViewSelectorHelper { + // todo aguzhva: use avx gather instruction + const knowhere::BitsetView bitset; + + inline bool is_member(const size_t idx) const { + return !bitset.test(idx); + } +}; + /* Find the nearest neighbors for nx queries in a set of ny vectors */ + +/* +/// Baseline implementation of exhaustive_inner_product_seq template void exhaustive_inner_product_seq( const float* x, @@ -107,7 +141,7 @@ void exhaustive_inner_product_seq( size_t nx, size_t ny, ResultHandler& res, - const BitsetView bitset) { + const IDSelector* sel) { using SingleResultHandler = typename ResultHandler::SingleResultHandler; int nt = std::min(int(nx), omp_get_max_threads()); @@ -120,7 +154,9 @@ void exhaustive_inner_product_seq( const float* y_j = y; resi.begin(i); for (size_t j = 0; j < ny; j++) { - if (bitset.empty() || !bitset.test(j)) { + // todo aguzhva: bitset was here + //if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { float ip = fvec_inner_product(x_i, y_j, d); resi.add_result(ip, j); } @@ -130,7 +166,90 @@ void exhaustive_inner_product_seq( } } } +*/ + +// An improved implementation that +// 1. helps the branch predictor, +// 2. computes distances for 4 elements per loop +template +void exhaustive_inner_product_seq( + const float* __restrict x, + const float* __restrict y, + size_t d, + size_t nx, + size_t ny, + ResultHandler& res, + const SelectorHelper selector) { + using SingleResultHandler = typename ResultHandler::SingleResultHandler; + int nt = std::min(int(nx), omp_get_max_threads()); + +#pragma omp parallel num_threads(nt) + { + SingleResultHandler resi(res); +#pragma omp for + for (int64_t i = 0; i < nx; i++) { + const float* x_i = x + i * d; + resi.begin(i); + + // the lambda that filters acceptable elements. + auto filter = [&selector](const size_t j) { + return selector.is_member(j); + }; + + // the lambda that applies a filtered element. + auto apply = [&resi](const float ip, const idx_t j) { + resi.add_result(ip, j); + }; + + // compute distances + fvec_inner_products_ny_if(x_i, y, d, ny, filter, apply); + + resi.end(); + } + } +} + + +template +void exhaustive_inner_product_seq( + const float* __restrict x, + const float* __restrict y, + size_t d, + size_t nx, + size_t ny, + ResultHandler& res, + const IDSelector* __restrict sel) { + // add different specialized cases here via introducing + // helpers which are converted into templates. 
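    // The dispatch below proceeds from most to least specialized: first a
    // dynamic_cast probes for Knowhere's BitsetView-backed selector (where a
    // SET bit marks a filtered-out id, hence the helper accepts
    // !bitset.test(j)), then any other non-null IDSelector is wrapped into
    // IDSelectorHelper, and finally IDSelectorAll handles the unfiltered case.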
+ + // bitset.empty() translates into sel=nullptr + + if (const auto* bitsetview_sel = dynamic_cast(sel)) { + // A specialized case for Knowhere + auto bitset = bitsetview_sel->bitset_view; + if (!bitset.empty()) { + BitsetViewSelectorHelper bitset_helper{bitset}; + exhaustive_inner_product_seq( + x, y, d, nx, ny, res, bitset_helper); + return; + } + } + else if (sel != nullptr) { + // default Faiss case if sel is defined + IDSelectorHelper ids_helper{sel}; + exhaustive_inner_product_seq( + x, y, d, nx, ny, res, ids_helper); + return; + } + + // default case if no filter is needed or if it is empty + IDSelectorAll helper; + exhaustive_inner_product_seq( + x, y, d, nx, ny, res, helper); +} +/* +// Baseline implementation of exhaustive_L2sqr_seq template void exhaustive_L2sqr_seq( const float* x, @@ -139,7 +258,7 @@ void exhaustive_L2sqr_seq( size_t nx, size_t ny, ResultHandler& res, - const BitsetView bitset) { + const IDSelector* sel) { using SingleResultHandler = typename ResultHandler::SingleResultHandler; int nt = std::min(int(nx), omp_get_max_threads()); @@ -152,7 +271,9 @@ void exhaustive_L2sqr_seq( const float* y_j = y; resi.begin(i); for (size_t j = 0; j < ny; j++) { - if (bitset.empty() || !bitset.test(j)) { + // todo aguzhva: bitset was here + //if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { float disij = fvec_L2sqr(x_i, y_j, d); resi.add_result(disij, j); } @@ -162,6 +283,86 @@ void exhaustive_L2sqr_seq( } } } +*/ + +// An improved implementation that +// 1. helps the branch predictor, +// 2. computes distances for 4 elements per loop +template +void exhaustive_L2sqr_seq( + const float* __restrict x, + const float* __restrict y, + size_t d, + size_t nx, + size_t ny, + ResultHandler& res, + const SelectorHelper selector) { + using SingleResultHandler = typename ResultHandler::SingleResultHandler; + int nt = std::min(int(nx), omp_get_max_threads()); + +#pragma omp parallel num_threads(nt) + { + SingleResultHandler resi(res); +#pragma omp for + for (int64_t i = 0; i < nx; i++) { + const float* x_i = x + i * d; + resi.begin(i); + + // the lambda that filters acceptable elements. + auto filter = [&selector](const size_t j) { + return selector.is_member(j); + }; + + // the lambda that applies a filtered element. + auto apply = [&resi](const float dis, const idx_t j) { + resi.add_result(dis, j); + }; + + // compute distances + fvec_L2sqr_ny_if(x_i, y, d, ny, filter, apply); + + resi.end(); + } + } +} + +template +void exhaustive_L2sqr_seq( + const float* __restrict x, + const float* __restrict y, + size_t d, + size_t nx, + size_t ny, + ResultHandler& res, + const IDSelector* __restrict sel) { + // add different specialized cases here via introducing + // helpers which are converted into templates. 
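// A standalone editorial sketch of the generic fallback path used below: any
// caller-provided faiss::IDSelector subclass (here a hypothetical filter that
// keeps even ids only) is wrapped into IDSelectorHelper, so the scan loop
// pays one virtual is_member() call per candidate and no extra null check.
// Assumes the IDSelector interface from <faiss/impl/IDSelector.h>, as in
// upstream Faiss 1.7.4; adjust the include to the actual location if needed.
#include <faiss/impl/IDSelector.h>

namespace sketch {

struct EvenIDSelector final : faiss::IDSelector {
    bool is_member(faiss::idx_t id) const override {
        return (id % 2) == 0;
    }
};

} // namespace sketch

// A caller would pass the address of such a selector as the `sel` argument of
// knn_L2sqr() or knn_inner_product(); sel == nullptr keeps the unfiltered
// fast path.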
+ + // bitset.empty() translates into sel=nullptr + + if (const auto* bitsetview_sel = dynamic_cast(sel)) { + // A specialized case for Knowhere + auto bitset = bitsetview_sel->bitset_view; + if (!bitset.empty()) { + BitsetViewSelectorHelper bitset_helper{bitset}; + exhaustive_L2sqr_seq( + x, y, d, nx, ny, res, bitset_helper); + return; + } + } + else if (sel != nullptr) { + // default Faiss case if sel is defined + IDSelectorHelper ids_helper{sel}; + exhaustive_L2sqr_seq( + x, y, d, nx, ny, res, ids_helper); + return; + } + + // default case if no filter is needed or if it is empty + IDSelectorAll helper; + exhaustive_L2sqr_seq( + x, y, d, nx, ny, res, helper); +} template void exhaustive_cosine_seq( @@ -172,7 +373,7 @@ void exhaustive_cosine_seq( size_t nx, size_t ny, ResultHandler& res, - const BitsetView bitset) { + const IDSelector* sel) { using SingleResultHandler = typename ResultHandler::SingleResultHandler; int nt = std::min(int(nx), omp_get_max_threads()); @@ -185,7 +386,8 @@ void exhaustive_cosine_seq( const float* y_j = y; resi.begin(i); for (size_t j = 0; j < ny; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { + // todo aguzhva: what if a norm == 0 ? float norm = (y_norms != nullptr) ? y_norms[j] : sqrtf(fvec_norm_L2sqr(y_j, d)); @@ -208,7 +410,7 @@ void exhaustive_inner_product_blas( size_t nx, size_t ny, ResultHandler& res, - const BitsetView bitset) { + const IDSelector* sel) { // BLAS does not like empty matrices if (nx == 0 || ny == 0) return; @@ -248,7 +450,7 @@ void exhaustive_inner_product_blas( &nyi); } - res.add_results(j0, j1, ip_block.get(), bitset); + res.add_results(j0, j1, ip_block.get(), sel); } res.end_multiple(); InterruptCallback::check(); @@ -266,7 +468,7 @@ void exhaustive_L2sqr_blas( size_t ny, ResultHandler& res, const float* y_norms = nullptr, - const BitsetView bitset = nullptr) { + const IDSelector* sel = nullptr) { // BLAS does not like empty matrices if (nx == 0 || ny == 0) return; @@ -334,7 +536,7 @@ void exhaustive_L2sqr_blas( ip_line++; } } - res.add_results(j0, j1, ip_block.get(), bitset); + res.add_results(j0, j1, ip_block.get(), sel); } res.end_multiple(); InterruptCallback::check(); @@ -350,7 +552,7 @@ void exhaustive_cosine_blas( size_t nx, size_t ny, ResultHandler& res, - const BitsetView bitset = nullptr) { + const IDSelector* sel = nullptr) { // BLAS does not like empty matrices if (nx == 0 || ny == 0) return; @@ -408,7 +610,7 @@ void exhaustive_cosine_blas( ip_line++; } } - res.add_results(j0, j1, ip_block.get(), bitset); + res.add_results(j0, j1, ip_block.get(), sel); } res.end_multiple(); InterruptCallback::check(); @@ -424,7 +626,7 @@ static void knn_jaccard_blas( size_t ny, ResultHandler& res, const DistanceCorrection& corr, - const BitsetView bitset) { + const IDSelector* sel) { // BLAS does not like empty matrices if (nx == 0 || ny == 0) return; @@ -476,7 +678,7 @@ static void knn_jaccard_blas( float* ip_line = ip_block + (i - i0) * (j1 - j0); for (size_t j = j0; j < j1; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { float ip = *ip_line; float dis = 1.0 - ip / (x_norms[i] + y_norms[j] - ip); @@ -515,22 +717,22 @@ void knn_inner_product( size_t nx, size_t ny, float_minheap_array_t* ha, - const BitsetView bitset) { + const IDSelector* sel) { if (ha->k < distance_compute_min_k_reservoir) { HeapResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); if (nx < distance_compute_blas_threshold) { - exhaustive_inner_product_seq(x, y, d, nx, ny, res, bitset); + 
exhaustive_inner_product_seq(x, y, d, nx, ny, res, sel); } else { - exhaustive_inner_product_blas(x, y, d, nx, ny, res, bitset); + exhaustive_inner_product_blas(x, y, d, nx, ny, res, sel); } } else { ReservoirResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); if (nx < distance_compute_blas_threshold) { - exhaustive_inner_product_seq(x, y, d, nx, ny, res, bitset); + exhaustive_inner_product_seq(x, y, d, nx, ny, res, sel); } else { - exhaustive_inner_product_blas(x, y, d, nx, ny, res, bitset); + exhaustive_inner_product_blas(x, y, d, nx, ny, res, sel); } } } @@ -543,23 +745,22 @@ void knn_L2sqr( size_t ny, float_maxheap_array_t* ha, const float* y_norm2, - const BitsetView bitset) { + const IDSelector* sel) { if (ha->k < distance_compute_min_k_reservoir) { HeapResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); - if (nx < distance_compute_blas_threshold) { - exhaustive_L2sqr_seq(x, y, d, nx, ny, res, bitset); + exhaustive_L2sqr_seq(x, y, d, nx, ny, res, sel); } else { - exhaustive_L2sqr_blas(x, y, d, nx, ny, res, y_norm2, bitset); + exhaustive_L2sqr_blas(x, y, d, nx, ny, res, y_norm2, sel); } } else { ReservoirResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); if (nx < distance_compute_blas_threshold) { - exhaustive_L2sqr_seq(x, y, d, nx, ny, res, bitset); + exhaustive_L2sqr_seq(x, y, d, nx, ny, res, sel); } else { - exhaustive_L2sqr_blas(x, y, d, nx, ny, res, y_norm2, bitset); + exhaustive_L2sqr_blas(x, y, d, nx, ny, res, y_norm2, sel); } } } @@ -572,22 +773,22 @@ void knn_cosine( size_t nx, size_t ny, float_minheap_array_t* ha, - const BitsetView bitset) { + const IDSelector* sel) { if (ha->k < distance_compute_min_k_reservoir) { HeapResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); if (nx < distance_compute_blas_threshold) { - exhaustive_cosine_seq(x, y, y_norms, d, nx, ny, res, bitset); + exhaustive_cosine_seq(x, y, y_norms, d, nx, ny, res, sel); } else { - exhaustive_cosine_blas(x, y, y_norms, d, nx, ny, res, bitset); + exhaustive_cosine_blas(x, y, y_norms, d, nx, ny, res, sel); } } else { ReservoirResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); if (nx < distance_compute_blas_threshold) { - exhaustive_cosine_seq(x, y, y_norms, d, nx, ny, res, bitset); + exhaustive_cosine_seq(x, y, y_norms, d, nx, ny, res, sel); } else { - exhaustive_cosine_blas(x, y, y_norms, d, nx, ny, res, bitset); + exhaustive_cosine_blas(x, y, y_norms, d, nx, ny, res, sel); } } } @@ -605,7 +806,7 @@ void knn_jaccard( size_t nx, size_t ny, float_maxheap_array_t* ha, - const BitsetView bitset) { + const IDSelector* sel) { if (d % 4 != 0) { // knn_jaccard_sse(x, y, d, nx, ny, res); FAISS_ASSERT_MSG(false, "dim is not multiple of 4!"); @@ -613,7 +814,7 @@ void knn_jaccard( NopDistanceCorrection nop; HeapResultHandler> res( ha->nh, ha->val, ha->ids, ha->k); - knn_jaccard_blas(x, y, d, nx, ny, res, nop, bitset); + knn_jaccard_blas(x, y, d, nx, ny, res, nop, sel); } } @@ -629,12 +830,12 @@ void range_search_L2sqr( size_t ny, float radius, RangeSearchResult* res, - const BitsetView bitset) { + const IDSelector* sel) { RangeSearchResultHandler> resh(res, radius); if (nx < distance_compute_blas_threshold) { - exhaustive_L2sqr_seq(x, y, d, nx, ny, resh, bitset); + exhaustive_L2sqr_seq(x, y, d, nx, ny, resh, sel); } else { - exhaustive_L2sqr_blas(x, y, d, nx, ny, resh, nullptr, bitset); + exhaustive_L2sqr_blas(x, y, d, nx, ny, resh, nullptr, sel); } } @@ -646,12 +847,12 @@ void range_search_inner_product( size_t ny, float radius, RangeSearchResult* res, - const BitsetView bitset) { + const IDSelector* sel) { 
RangeSearchResultHandler> resh(res, radius); if (nx < distance_compute_blas_threshold) { - exhaustive_inner_product_seq(x, y, d, nx, ny, resh, bitset); + exhaustive_inner_product_seq(x, y, d, nx, ny, resh, sel); } else { - exhaustive_inner_product_blas(x, y, d, nx, ny, resh, bitset); + exhaustive_inner_product_blas(x, y, d, nx, ny, resh, sel); } } @@ -664,12 +865,12 @@ void range_search_cosine( size_t ny, float radius, RangeSearchResult* res, - const BitsetView bitset) { + const IDSelector* sel) { RangeSearchResultHandler> resh(res, radius); if (nx < distance_compute_blas_threshold) { - exhaustive_cosine_seq(x, y, y_norms, d, nx, ny, resh, bitset); + exhaustive_cosine_seq(x, y, y_norms, d, nx, ny, resh, sel); } else { - exhaustive_cosine_blas(x, y, y_norms, d, nx, ny, resh, bitset); + exhaustive_cosine_blas(x, y, y_norms, d, nx, ny, resh, sel); } } @@ -692,11 +893,32 @@ void fvec_inner_products_by_idx( const int64_t* __restrict idsj = ids + j * ny; const float* xj = x + j * d; float* __restrict ipj = ip + j * ny; - for (size_t i = 0; i < ny; i++) { - if (idsj[i] < 0) - continue; - ipj[i] = fvec_inner_product(xj, y + d * idsj[i], d); - } + + // // baseline version + // for (size_t i = 0; i < ny; i++) { + // if (idsj[i] < 0) + // continue; + // ipj[i] = fvec_inner_product(xj, y + d * idsj[i], d); + // } + + // the lambda that filters acceptable elements. + auto filter = [=](const size_t i) { return (idsj[i] >= 0); }; + + // the lambda that applies a filtered element. + auto apply = [=](const float dis, const size_t i) { + ipj[i] = dis; + }; + + // compute distances + fvec_inner_products_ny_by_idx_if( + xj, + y, + idsj, + d, + ny, + filter, + apply + ); } } @@ -715,11 +937,32 @@ void fvec_L2sqr_by_idx( const int64_t* __restrict idsj = ids + j * ny; const float* xj = x + j * d; float* __restrict disj = dis + j * ny; - for (size_t i = 0; i < ny; i++) { - if (idsj[i] < 0) - continue; - disj[i] = fvec_L2sqr(xj, y + d * idsj[i], d); - } + + // // baseline version + // for (size_t i = 0; i < ny; i++) { + // if (idsj[i] < 0) + // continue; + // disj[i] = fvec_L2sqr(xj, y + d * idsj[i], d); + // } + + // the lambda that filters acceptable elements. + auto filter = [=](const size_t i) { return (idsj[i] >= 0); }; + + // the lambda that applies a filtered element. + auto apply = [=](const float dis, const size_t i) { + disj[i] = dis; + }; + + // compute distances + fvec_L2sqr_ny_by_idx_if( + xj, + y, + idsj, + d, + ny, + filter, + apply + ); } } @@ -890,6 +1133,7 @@ void inner_product_to_L2sqr( } } +// todo aguzhva: Faiss 1.7.4, no longer used in IndexFlat::assign and Clustering. void elkan_L2_sse( const float* x, const float* y, diff --git a/thirdparty/faiss/faiss/utils/distances.h b/thirdparty/faiss/faiss/utils/distances.h index 1e06d51ac..30b631673 100644 --- a/thirdparty/faiss/faiss/utils/distances.h +++ b/thirdparty/faiss/faiss/utils/distances.h @@ -15,10 +15,11 @@ #include #include -#include -using knowhere::BitsetView; + namespace faiss { +struct IDSelector; + /********************************************************* * Optimized distance/norm/inner prod computations *********************************************************/ @@ -168,11 +169,11 @@ FAISS_API extern int distance_compute_blas_database_bs; FAISS_API extern int distance_compute_min_k_reservoir; /** Return the k nearest neighors of each of the nx vectors x among the ny - * vector y, w.r.t to max inner product + * vector y, w.r.t to max inner product. 
* * @param x query vectors, size nx * d * @param y database vectors, size ny * d - * @param res result array, which also provides k. Sorted on output + * @param res result heap structure, which also provides k. Sorted on output */ void knn_inner_product( const float* x, @@ -181,10 +182,34 @@ void knn_inner_product( size_t nx, size_t ny, float_minheap_array_t* res, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); -/** Same as knn_inner_product, for the L2 distance - * @param y_norm2 norms for the y vectors (nullptr or size ny) +/** Return the k nearest neighors of each of the nx vectors x among the ny + * vector y, for the inner product metric. + * + * @param x query vectors, size nx * d + * @param y database vectors, size ny * d + * @param distances output distances, size nq * k + * @param indexes output vector ids, size nq * k + */ +void knn_inner_product( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + size_t k, + float* distances, + int64_t* indexes, + const IDSelector* sel = nullptr); + +/** Return the k nearest neighors of each of the nx vectors x among the ny + * vector y, for the L2 distance + * @param x query vectors, size nx * d + * @param y database vectors, size ny * d + * @param res result heap strcture, which also provides k. Sorted on output + * @param y_norm2 (optional) norms for the y vectors (nullptr or size ny) + * @param sel search in this subset of vectors */ void knn_L2sqr( const float* x, @@ -194,8 +219,31 @@ void knn_L2sqr( size_t ny, float_maxheap_array_t* res, const float* y_norm2 = nullptr, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); +/** Return the k nearest neighors of each of the nx vectors x among the ny + * vector y, for the L2 distance + * + * @param x query vectors, size nx * d + * @param y database vectors, size ny * d + * @param distances output distances, size nq * k + * @param indexes output vector ids, size nq * k + * @param y_norm2 (optional) norms for the y vectors (nullptr or size ny) + * @param sel search in this subset of vectors + */ +void knn_L2sqr( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + size_t k, + float* distances, + int64_t* indexes, + const float* y_norm2 = nullptr, + const IDSelector* sel = nullptr); + +// Knowhere-specific function void knn_cosine( const float* x, const float* y, @@ -204,8 +252,9 @@ void knn_cosine( size_t nx, size_t ny, float_minheap_array_t* ha, - const BitsetView bitset); + const IDSelector* sel = nullptr); +// Knowhere-specific function void knn_jaccard( const float* x, const float* y, @@ -213,11 +262,30 @@ void knn_jaccard( size_t nx, size_t ny, float_maxheap_array_t* res, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); -/* Find the nearest neighbors for nx queries in a set of ny vectors +/** Find the max inner product neighbors for nx queries in a set of ny vectors * indexed by ids. May be useful for re-ranking a pre-selected vector list + * + * @param x query vectors, size nx * d + * @param y database vectors, size (max(ids) + 1) * d + * @param ids subset of database vectors to consider, size (nx, nsubset) + * @param res result structure + * @param ld_ids stride for the ids array. 
-1: use nsubset, 0: all queries + * process the same subset */ +void knn_inner_products_by_idx( + const float* x, + const float* y, + const int64_t* subset, + size_t d, + size_t nx, + size_t nsubset, + size_t k, + float* vals, + int64_t* ids, + int64_t ld_ids = -1); + void knn_inner_products_by_idx( const float* x, const float* y, @@ -227,6 +295,28 @@ void knn_inner_products_by_idx( size_t ny, float_minheap_array_t* res); +/** Find the nearest neighbors for nx queries in a set of ny vectors + * indexed by ids. May be useful for re-ranking a pre-selected vector list + * + * @param x query vectors, size nx * d + * @param y database vectors, size (max(ids) + 1) * d + * @param subset subset of database vectors to consider, size (nx, nsubset) + * @param res rIDesult structure + * @param ld_subset stride for the subset array. -1: use nsubset, 0: all queries + * process the same subset + */ +void knn_L2sqr_by_idx( + const float* x, + const float* y, + const int64_t* subset, + size_t d, + size_t nx, + size_t nsubset, + size_t k, + float* vals, + int64_t* ids, + int64_t ld_subset = -1); + void knn_L2sqr_by_idx( const float* x, const float* y, @@ -259,7 +349,7 @@ void range_search_L2sqr( size_t ny, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); /// same as range_search_L2sqr for the inner product similarity void range_search_inner_product( @@ -270,8 +360,9 @@ void range_search_inner_product( size_t ny, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); +// Knowhere-specific function void range_search_cosine( const float* x, const float* y, @@ -281,7 +372,7 @@ void range_search_cosine( size_t ny, float radius, RangeSearchResult* result, - const BitsetView bitset = nullptr); + const IDSelector* sel = nullptr); /*************************************************************************** * PQ tables computations diff --git a/thirdparty/faiss/faiss/utils/distances_fused/avx512.cpp b/thirdparty/faiss/faiss/utils/distances_fused/avx512.cpp new file mode 100644 index 000000000..6ae8cb046 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_fused/avx512.cpp @@ -0,0 +1,346 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#ifdef __AVX512__ + +#include + +namespace faiss { + +namespace { + +// It makes sense to like to overload certain cases because the further +// kernels are in need of AVX512 registers. So, let's tell compiler +// not to waste registers on a bit faster code, if needed. +template +float l2_sqr(const float* const x) { + // compiler should be smart enough to handle that + float output = x[0] * x[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * x[i]; + } + + return output; +} + +template <> +float l2_sqr<4>(const float* const x) { + __m128 v = _mm_loadu_ps(x); + __m128 v2 = _mm_mul_ps(v, v); + v2 = _mm_hadd_ps(v2, v2); + v2 = _mm_hadd_ps(v2, v2); + + return _mm_cvtss_f32(v2); +} + +template +float dot_product( + const float* const __restrict x, + const float* const __restrict y) { + // compiler should be smart enough to handle that + float output = x[0] * y[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * y[i]; + } + + return output; +} + +// The kernel for low dimensionality vectors. 
+// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x +// +// DIM is the dimensionality of the data +// NX_POINTS_PER_LOOP is the number of x points that get processed +// simultaneously. +// NY_POINTS_PER_LOOP is the number of y points that get processed +// simultaneously. +template +void kernel( + const float* const __restrict x, + const float* const __restrict y, + const float* const __restrict y_transposed, + size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms, + size_t i) { + const size_t ny_p = + (ny / (16 * NY_POINTS_PER_LOOP)) * (16 * NY_POINTS_PER_LOOP); + + // compute + const float* const __restrict xd_0 = x + i * DIM; + + // prefetch the next point + _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA); + + // load a single point from x + // load -2 * value + __m512 x_i[NX_POINTS_PER_LOOP][DIM]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + for (size_t dd = 0; dd < DIM; dd++) { + x_i[nx_k][dd] = _mm512_set1_ps(-2 * *(xd_0 + nx_k * DIM + dd)); + } + } + + // compute x_norm + float x_norm_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + x_norm_i[nx_k] = l2_sqr(xd_0 + nx_k * DIM); + } + + // distances and indices + __m512 min_distances_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_distances_i[nx_k] = + _mm512_set1_ps(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); + } + + __m512i min_indices_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_indices_i[nx_k] = _mm512_set1_epi32(0); + } + + // + __m512i current_indices = _mm512_setr_epi32( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const __m512i indices_delta = _mm512_set1_epi32(16); + + // main loop + size_t j = 0; + for (; j < ny_p; j += NY_POINTS_PER_LOOP * 16) { + // compute dot products for NX_POINTS from x and NY_POINTS from y + // technically, we're multiplying -2x and y + __m512 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; + + // DIM 0 that uses MUL + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + __m512 y_i = _mm512_loadu_ps(y_transposed + j + ny_k * 16 + ny * 0); + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = _mm512_mul_ps(x_i[nx_k][0], y_i); + } + } + + // other DIMs that use FMA + for (size_t dd = 1; dd < DIM; dd++) { + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + __m512 y_i = + _mm512_loadu_ps(y_transposed + j + ny_k * 16 + ny * dd); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = _mm512_fmadd_ps( + x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); + } + } + } + + // compute y^2 - 2 * (x,y) + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + __m512 y_l2_sqr = _mm512_loadu_ps(y_norms + j + ny_k * 16); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = _mm512_add_ps(dp_i[nx_k][ny_k], y_l2_sqr); + } + } + + // do the comparisons and alter the min indices + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + const __mmask16 comparison = _mm512_cmp_ps_mask( + dp_i[nx_k][ny_k], min_distances_i[nx_k], _CMP_LT_OS); + min_distances_i[nx_k] = _mm512_mask_blend_ps( + comparison, min_distances_i[nx_k], dp_i[nx_k][ny_k]); + min_indices_i[nx_k] = _mm512_castps_si512(_mm512_mask_blend_ps( + comparison, + _mm512_castsi512_ps(min_indices_i[nx_k]), + _mm512_castsi512_ps(current_indices))); + } + + current_indices = 
_mm512_add_epi32(current_indices, indices_delta); + } + } + + // dump values and find the minimum distance / minimum index + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + float min_distances_scalar[16]; + uint32_t min_indices_scalar[16]; + _mm512_storeu_ps(min_distances_scalar, min_distances_i[nx_k]); + _mm512_storeu_si512( + (__m512i*)(min_indices_scalar), min_indices_i[nx_k]); + + float current_min_distance = res.dis_tab[i + nx_k]; + uint32_t current_min_index = res.ids_tab[i + nx_k]; + + // This unusual comparison is needed to maintain the behavior + // of the original implementation: if two indices are + // represented with equal distance values, then + // the index with the min value is returned. + for (size_t jv = 0; jv < 16; jv++) { + // add missing x_norms[i] + float distance_candidate = + min_distances_scalar[jv] + x_norm_i[nx_k]; + + // negative values can occur for identical vectors + // due to roundoff errors. + if (distance_candidate < 0) + distance_candidate = 0; + + const int64_t index_candidate = min_indices_scalar[jv]; + + if (current_min_distance > distance_candidate) { + current_min_distance = distance_candidate; + current_min_index = index_candidate; + } else if ( + current_min_distance == distance_candidate && + current_min_index > index_candidate) { + current_min_index = index_candidate; + } + } + + // process leftovers + for (size_t j0 = j; j0 < ny; j0++) { + const float dp = + dot_product(x + (i + nx_k) * DIM, y + j0 * DIM); + float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; + // negative values can occur for identical vectors + // due to roundoff errors. + if (dis < 0) { + dis = 0; + } + + if (current_min_distance > dis) { + current_min_distance = dis; + current_min_index = j0; + } + } + + // done + res.add_result(i + nx_k, current_min_distance, current_min_index); + } +} + +template +void exhaustive_L2sqr_fused_cmax( + const float* const __restrict x, + const float* const __restrict y, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms) { + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) { + return; + } + + // compute norms for y + std::unique_ptr del2; + if (!y_norms) { + float* y_norms2 = new float[ny]; + del2.reset(y_norms2); + + for (size_t i = 0; i < ny; i++) { + y_norms2[i] = l2_sqr(y + i * DIM); + } + + y_norms = y_norms2; + } + + // initialize res + res.begin_multiple(0, nx); + + // transpose y + std::vector y_transposed(DIM * ny); + for (size_t j = 0; j < DIM; j++) { + for (size_t i = 0; i < ny; i++) { + y_transposed[j * ny + i] = y[j + i * DIM]; + } + } + + const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; + // the main loop. +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + for (size_t i = nx_p; i < nx; i++) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + // Does nothing for SingleBestResultHandler, but + // keeping the call for the consistency. 
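    // Note on the arithmetic in kernel() above: the SIMD loop accumulates
    // only y_norm[j] - 2 * <x_i, y_j> per candidate (x_i is broadcast
    // pre-multiplied by -2), and the per-query constant x_norm[i] is added
    // back only when the 16 lane-wise minima are reduced to a scalar. This
    // keeps the hot loop as a pure FMA chain and saves one add per candidate.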
+ res.end_multiple(); + InterruptCallback::check(); +} + +} // namespace + +bool exhaustive_L2sqr_fused_cmax_AVX512( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { + // process only cases with certain dimensionalities + +#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \ + case DIM: { \ + exhaustive_L2sqr_fused_cmax< \ + DIM, \ + NX_POINTS_PER_LOOP, \ + NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \ + return true; \ + } + + switch (d) { + DISPATCH(1, 8, 1) + DISPATCH(2, 8, 1) + DISPATCH(3, 8, 1) + DISPATCH(4, 8, 1) + DISPATCH(5, 8, 1) + DISPATCH(6, 8, 1) + DISPATCH(7, 8, 1) + DISPATCH(8, 8, 1) + DISPATCH(9, 8, 1) + DISPATCH(10, 8, 1) + DISPATCH(11, 8, 1) + DISPATCH(12, 8, 1) + DISPATCH(13, 8, 1) + DISPATCH(14, 8, 1) + DISPATCH(15, 8, 1) + DISPATCH(16, 8, 1) + DISPATCH(17, 8, 1) + DISPATCH(18, 8, 1) + DISPATCH(19, 8, 1) + DISPATCH(20, 8, 1) + DISPATCH(21, 8, 1) + DISPATCH(22, 8, 1) + DISPATCH(23, 8, 1) + DISPATCH(24, 8, 1) + DISPATCH(25, 8, 1) + DISPATCH(26, 8, 1) + DISPATCH(27, 8, 1) + DISPATCH(28, 8, 1) + DISPATCH(29, 8, 1) + DISPATCH(30, 8, 1) + DISPATCH(31, 8, 1) + DISPATCH(32, 8, 1) + } + + return false; +#undef DISPATCH +} + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/distances_fused/avx512.h b/thirdparty/faiss/faiss/utils/distances_fused/avx512.h new file mode 100644 index 000000000..d730e3b61 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_fused/avx512.h @@ -0,0 +1,36 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// AVX512 might be not used, but this version provides ~2x speedup +// over AVX2 kernel, say, for training PQx10 or PQx12, and speeds up +// additional cases with larger dimensionalities. + +#pragma once + +#include +#include + +#include + +#ifdef __AVX512__ + +namespace faiss { + +// Returns true if the fused kernel is available and the data was processed. +// Returns false if the fused kernel is not available. +bool exhaustive_L2sqr_fused_cmax_AVX512( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms); + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/distances_fused/distances_fused.cpp b/thirdparty/faiss/faiss/utils/distances_fused/distances_fused.cpp new file mode 100644 index 000000000..650e24810 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_fused/distances_fused.cpp @@ -0,0 +1,42 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include + +namespace faiss { + +bool exhaustive_L2sqr_fused_cmax( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { + if (nx == 0 || ny == 0) { + // nothing to do + return true; + } + +#ifdef __AVX512__ + // avx512 kernel + return exhaustive_L2sqr_fused_cmax_AVX512(x, y, d, nx, ny, res, y_norms); +#elif defined(__AVX2__) || defined(__aarch64__) + // avx2 or arm neon kernel + return exhaustive_L2sqr_fused_cmax_simdlib(x, y, d, nx, ny, res, y_norms); +#else + // not supported, please use a general-purpose kernel + return false; +#endif +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/distances_fused/distances_fused.h b/thirdparty/faiss/faiss/utils/distances_fused/distances_fused.h new file mode 100644 index 000000000..e6e35c209 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_fused/distances_fused.h @@ -0,0 +1,40 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file contains a fused kernel that combines distance computation +// and the search for the CLOSEST point. Currently, this is done for small +// dimensionality vectors when it is beneficial to avoid storing temporary +// dot product information in RAM. This is particularly effective +// when training PQx10 or PQx12 with the default parameters. +// +// InterruptCallback::check() is not used, because it is assumed that the +// kernel takes a little time because of a tiny dimensionality. +// +// Later on, similar optimization can be implemented for large size vectors, +// but a different kernel is needed. +// + +#pragma once + +#include + +#include + +namespace faiss { + +// Returns true if the fused kernel is available and the data was processed. +// Returns false if the fused kernel is not available. +bool exhaustive_L2sqr_fused_cmax( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms); + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.cpp b/thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.cpp new file mode 100644 index 000000000..97ededd2f --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.cpp @@ -0,0 +1,352 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// -*- c++ -*- + +#include + +#if defined(__AVX2__) || defined(__aarch64__) + +#include + +#if defined(__AVX2__) +#include +#endif + +namespace faiss { + +namespace { + +// It makes sense to like to overload certain cases because the further +// kernels are in need of registers. So, let's tell compiler +// not to waste registers on a bit faster code, if needed. 
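// Editorial reference sketch (not part of the patch): a plain scalar version
// of what the fused kernels in this file and in avx512.cpp compute. For every
// query x_i the single nearest y_j is tracked while distances are produced,
// using ||x - y||^2 = ||x||^2 + ||y||^2 - 2 * <x, y>, so no nx * ny block of
// temporary dot products is ever written to memory.
#include <cstddef>
#include <cstdint>
#include <limits>

namespace sketch {

inline void l2sqr_fused_cmax_scalar(
        const float* x,      // nx * d query vectors
        const float* y,      // ny * d database vectors
        size_t d,
        size_t nx,
        size_t ny,
        float* best_dis,     // nx output distances
        int64_t* best_ids) { // nx output indices
    for (size_t i = 0; i < nx; i++) {
        const float* xi = x + i * d;
        float x_norm = 0.0f;
        for (size_t k = 0; k < d; k++) {
            x_norm += xi[k] * xi[k];
        }

        float min_dis = std::numeric_limits<float>::max();
        int64_t min_id = -1;
        for (size_t j = 0; j < ny; j++) {
            const float* yj = y + j * d;
            float y_norm = 0.0f;
            float dot = 0.0f;
            for (size_t k = 0; k < d; k++) {
                y_norm += yj[k] * yj[k];
                dot += xi[k] * yj[k];
            }
            float dis = x_norm + y_norm - 2.0f * dot;
            // roundoff can make the expansion slightly negative for
            // (nearly) identical vectors; clamp, as the SIMD kernels do
            if (dis < 0.0f) {
                dis = 0.0f;
            }
            if (dis < min_dis) {
                min_dis = dis;
                min_id = static_cast<int64_t>(j);
            }
        }
        best_dis[i] = min_dis;
        best_ids[i] = min_id;
    }
}

} // namespace sketch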
+template +float l2_sqr(const float* const x) { + // compiler should be smart enough to handle that + float output = x[0] * x[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * x[i]; + } + + return output; +} + +template +float dot_product( + const float* const __restrict x, + const float* const __restrict y) { + // compiler should be smart enough to handle that + float output = x[0] * y[0]; + for (size_t i = 1; i < DIM; i++) { + output += x[i] * y[i]; + } + + return output; +} + +// The kernel for low dimensionality vectors. +// Finds the closest one from y for every given NX_POINTS_PER_LOOP points from x +// +// DIM is the dimensionality of the data +// NX_POINTS_PER_LOOP is the number of x points that get processed +// simultaneously. +// NY_POINTS_PER_LOOP is the number of y points that get processed +// simultaneously. +template +void kernel( + const float* const __restrict x, + const float* const __restrict y, + const float* const __restrict y_transposed, + const size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms, + const size_t i) { + const size_t ny_p = + (ny / (8 * NY_POINTS_PER_LOOP)) * (8 * NY_POINTS_PER_LOOP); + + // compute + const float* const __restrict xd_0 = x + i * DIM; + + // prefetch the next point +#if defined(__AVX2__) + _mm_prefetch(xd_0 + DIM * sizeof(float), _MM_HINT_NTA); +#endif + + // load a single point from x + // load -2 * value + simd8float32 x_i[NX_POINTS_PER_LOOP][DIM]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + for (size_t dd = 0; dd < DIM; dd++) { + x_i[nx_k][dd] = simd8float32(-2 * *(xd_0 + nx_k * DIM + dd)); + } + } + + // compute x_norm + float x_norm_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + x_norm_i[nx_k] = l2_sqr(xd_0 + nx_k * DIM); + } + + // distances and indices + simd8float32 min_distances_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_distances_i[nx_k] = + simd8float32(res.dis_tab[i + nx_k] - x_norm_i[nx_k]); + } + + simd8uint32 min_indices_i[NX_POINTS_PER_LOOP]; + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + min_indices_i[nx_k] = simd8uint32((uint32_t)0); + } + + // + simd8uint32 current_indices = simd8uint32(0, 1, 2, 3, 4, 5, 6, 7); + const simd8uint32 indices_delta = simd8uint32(8); + + // main loop + size_t j = 0; + for (; j < ny_p; j += NY_POINTS_PER_LOOP * 8) { + // compute dot products for NX_POINTS from x and NY_POINTS from y + // technically, we're multiplying -2x and y + simd8float32 dp_i[NX_POINTS_PER_LOOP][NY_POINTS_PER_LOOP]; + + // DIM 0 that uses MUL + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + simd8float32 y_i = + simd8float32(y_transposed + j + ny_k * 8 + ny * 0); + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = x_i[nx_k][0] * y_i; + } + } + + // other DIMs that use FMA + for (size_t dd = 1; dd < DIM; dd++) { + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + simd8float32 y_i = + simd8float32(y_transposed + j + ny_k * 8 + ny * dd); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = + fmadd(x_i[nx_k][dd], y_i, dp_i[nx_k][ny_k]); + } + } + } + + // compute y^2 + (-2x,y) + for (size_t ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + simd8float32 y_l2_sqr = simd8float32(y_norms + j + ny_k * 8); + + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + dp_i[nx_k][ny_k] = dp_i[nx_k][ny_k] + y_l2_sqr; + } + } + + // do the comparisons and alter the min indices + for (size_t 
ny_k = 0; ny_k < NY_POINTS_PER_LOOP; ny_k++) { + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + // cmpps + cmplt_and_blend_inplace( + dp_i[nx_k][ny_k], + current_indices, + min_distances_i[nx_k], + min_indices_i[nx_k]); + } + + current_indices = current_indices + indices_delta; + } + } + + // dump values and find the minimum distance / minimum index + for (size_t nx_k = 0; nx_k < NX_POINTS_PER_LOOP; nx_k++) { + float min_distances_scalar[8]; + uint32_t min_indices_scalar[8]; + + min_distances_i[nx_k].storeu(min_distances_scalar); + min_indices_i[nx_k].storeu(min_indices_scalar); + + float current_min_distance = res.dis_tab[i + nx_k]; + uint32_t current_min_index = res.ids_tab[i + nx_k]; + + // This unusual comparison is needed to maintain the behavior + // of the original implementation: if two indices are + // represented with equal distance values, then + // the index with the min value is returned. + for (size_t jv = 0; jv < 8; jv++) { + // add missing x_norms[i] + float distance_candidate = + min_distances_scalar[jv] + x_norm_i[nx_k]; + + // negative values can occur for identical vectors + // due to roundoff errors. + if (distance_candidate < 0) { + distance_candidate = 0; + } + + const int64_t index_candidate = min_indices_scalar[jv]; + + if (current_min_distance > distance_candidate) { + current_min_distance = distance_candidate; + current_min_index = index_candidate; + } else if ( + current_min_distance == distance_candidate && + current_min_index > index_candidate) { + current_min_index = index_candidate; + } + } + + // process leftovers + for (size_t j0 = j; j0 < ny; j0++) { + const float dp = + dot_product(x + (i + nx_k) * DIM, y + j0 * DIM); + float dis = x_norm_i[nx_k] + y_norms[j0] - 2 * dp; + // negative values can occur for identical vectors + // due to roundoff errors. + if (dis < 0) { + dis = 0; + } + + if (current_min_distance > dis) { + current_min_distance = dis; + current_min_index = j0; + } + } + + // done + res.add_result(i + nx_k, current_min_distance, current_min_index); + } +} + +template +void exhaustive_L2sqr_fused_cmax( + const float* const __restrict x, + const float* const __restrict y, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* __restrict y_norms) { + // BLAS does not like empty matrices + if (nx == 0 || ny == 0) { + return; + } + + // compute norms for y + std::unique_ptr del2; + if (!y_norms) { + float* y_norms2 = new float[ny]; + del2.reset(y_norms2); + + for (size_t i = 0; i < ny; i++) { + y_norms2[i] = l2_sqr(y + i * DIM); + } + + y_norms = y_norms2; + } + + // initialize res + res.begin_multiple(0, nx); + + // transpose y + std::vector y_transposed(DIM * ny); + for (size_t j = 0; j < DIM; j++) { + for (size_t i = 0; i < ny; i++) { + y_transposed[j * ny + i] = y[j + i * DIM]; + } + } + + const size_t nx_p = (nx / NX_POINTS_PER_LOOP) * NX_POINTS_PER_LOOP; + // the main loop. +#pragma omp parallel for schedule(dynamic) + for (size_t i = 0; i < nx_p; i += NX_POINTS_PER_LOOP) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + for (size_t i = nx_p; i < nx; i++) { + kernel( + x, y, y_transposed.data(), ny, res, y_norms, i); + } + + // Does nothing for SingleBestResultHandler, but + // keeping the call for the consistency. 
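    // This AVX2 / ARM NEON variant mirrors the structure of the AVX512 kernel
    // in avx512.cpp, but is written against the portable simd8float32 /
    // simd8uint32 wrappers, so each register tracks 8 candidates instead of
    // 16 and the same source compiles for both instruction sets.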
+ res.end_multiple(); + InterruptCallback::check(); +} + +} // namespace + +bool exhaustive_L2sqr_fused_cmax_simdlib( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms) { + // Process only cases with certain dimensionalities. + // An acceptable dimensionality value is limited by the number of + // available registers. + +#define DISPATCH(DIM, NX_POINTS_PER_LOOP, NY_POINTS_PER_LOOP) \ + case DIM: { \ + exhaustive_L2sqr_fused_cmax< \ + DIM, \ + NX_POINTS_PER_LOOP, \ + NY_POINTS_PER_LOOP>(x, y, nx, ny, res, y_norms); \ + return true; \ + } + + // faiss/benchs/bench_quantizer.py was used for benchmarking + // and tuning 2nd and 3rd parameters values. + // Basically, the larger the values for 2nd and 3rd parameters are, + // the faster the execution is, but the more SIMD registers are needed. + // This can be compensated with L1 cache, this is why this + // code might operate with more registers than available + // because of concurrent ports operations for ALU and LOAD/STORE. + +#if defined(__AVX2__) + // It was possible to tweak these parameters on x64 machine. + switch (d) { + DISPATCH(1, 6, 1) + DISPATCH(2, 6, 1) + DISPATCH(3, 6, 1) + DISPATCH(4, 8, 1) + DISPATCH(5, 8, 1) + DISPATCH(6, 8, 1) + DISPATCH(7, 8, 1) + DISPATCH(8, 8, 1) + DISPATCH(9, 8, 1) + DISPATCH(10, 8, 1) + DISPATCH(11, 8, 1) + DISPATCH(12, 8, 1) + DISPATCH(13, 6, 1) + DISPATCH(14, 6, 1) + DISPATCH(15, 6, 1) + DISPATCH(16, 6, 1) + } +#else + // Please feel free to alter 2nd and 3rd parameters if you have access + // to ARM-based machine so that you are able to benchmark this code. + // Or to enable other dimensions. + switch (d) { + DISPATCH(1, 4, 2) + DISPATCH(2, 2, 2) + DISPATCH(3, 2, 2) + DISPATCH(4, 2, 1) + DISPATCH(5, 1, 1) + DISPATCH(6, 1, 1) + DISPATCH(7, 1, 1) + DISPATCH(8, 1, 1) + } +#endif + + return false; +#undef DISPATCH +} + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.h b/thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.h new file mode 100644 index 000000000..b60da7b19 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_fused/simdlib_based.h @@ -0,0 +1,32 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +#if defined(__AVX2__) || defined(__aarch64__) + +namespace faiss { + +// Returns true if the fused kernel is available and the data was processed. +// Returns false if the fused kernel is not available. +bool exhaustive_L2sqr_fused_cmax_simdlib( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + SingleBestResultHandler>& res, + const float* y_norms); + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/distances_if.h b/thirdparty/faiss/faiss/utils/distances_if.h new file mode 100644 index 000000000..3fb8cb685 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/distances_if.h @@ -0,0 +1,573 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include "simd/hook.h" + +namespace faiss { + +/********************************************************* + * Facilities that are used for batch distance computation + * for the case of a presence of a condition for the + * acceptable elements. + *********************************************************/ + +namespace { + +constexpr size_t DEFAULT_BUFFER_SIZE = 8; + +// Checks groups of BUFFER_SIZE elements and process acceptable +// ones in groups of N. Process leftovers elements one by one. +// This can be rewritten using once an appropriate +// C++ standard is used. +// Concept constraints may be added once an appropriate +// C++ standard is used. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // process 1 element. + // void Process1(const size_t idx); + typename Process1, + // process N elements. + // void ProcessN(const std::array ids); + typename ProcessN, + size_t N, + size_t BUFFER_SIZE> +void buffered_if( + const size_t ny, + Pred pred, + Process1 process1, + ProcessN processN) { + static_assert((BUFFER_SIZE % N) == 0); + + // // the most generic version of the following code that is + // // suitable for the debugging is the following: + // + // for (size_t j = 0; j < ny; j++) { + // const std::optional outcome = pred(j); + // if (!outcome.has_value()) { + // break; + // } + // if (outcome.value()) { + // process1(j); + // } + // } + + // todo: maybe add a special case "ny < N" right here + + const size_t ny_buffer_size = (ny / BUFFER_SIZE) * BUFFER_SIZE; + size_t saved_j[2 * BUFFER_SIZE + N]; + size_t counter = 0; + + for (size_t j = 0; j < ny_buffer_size; j += BUFFER_SIZE) { + for (size_t jj = 0; jj < BUFFER_SIZE; jj++) { + const std::optional outcome = pred(j + jj); + if (!outcome.has_value()) { + // pred() wants to stop the iteration. + // It is a bad code style, but it makes clear + // of what happens next. + goto leftovers; + } + + const bool is_acceptable = outcome.value(); + saved_j[counter] = j + jj; counter += is_acceptable ? 1 : 0; + } + + if (counter >= N) { + const size_t counter_n = (counter / N) * N; + for (size_t i_counter = 0; i_counter < counter_n; i_counter += N) { + std::array tmp; + std::copy(saved_j + i_counter, saved_j + i_counter + N, tmp.begin()); + + processN(tmp); + } + + // copy leftovers to the beginning of the buffer. + // todo: use ring buffer instead, maybe? + // for (size_t jk = counter_n; jk < counter; jk++) { + // saved_j[jk - counter_n] = saved_j[jk]; + // } + for (size_t jk = counter_n; jk < counter_n + N; jk++) { + saved_j[jk - counter_n] = saved_j[jk]; + } + + // rewind + counter -= counter_n; + } + } + + for (size_t j = ny_buffer_size; j < ny; j++) { + const std::optional outcome = pred(j); + if (!outcome.has_value()) { + // pred() wants to stop the iteration. + break; + } + + const bool is_acceptable = outcome.value(); + saved_j[counter] = j; counter += is_acceptable ? 
1 : 0; + } + + // process leftovers +leftovers: + for (size_t jj = 0; jj < counter; jj++) { + const size_t j = saved_j[jj]; + process1(j); + } +} + +// does nothing +struct NoRemapping { + inline size_t operator()(const size_t idx) const { + return idx; + } +}; + +// maps idx to indices[idx] +template +struct ByIdxRemapping { + const IdxT* const mapping; + inline IdxT operator()(const size_t idx) const { + return mapping[idx]; + } +}; + +} // namespace + +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Compute distance from a query vector to 1 element. + // float Distance1(const idx_t idx); + typename Distance1, + // Compute distance from a query vector to N elements + // void DistanceN( + // const std::array idx, + // std::array& dis); + typename DistanceN, + // Maps an iteration for-loop index to a database index. + // It is needed for calls with indirect indexing like fvec_L2sqr_by_idx(). + // auto IndexRemapper(const size_t idx); + typename IndexRemapper, + // Apply an element. + // void Apply(const float dis, const auto idx); + typename Apply, + size_t N, + size_t BUFFER_SIZE> +void fvec_distance_ny_if( + const size_t ny, + Pred pred, + Distance1 distance1, + DistanceN distanceN, + IndexRemapper remapper, + Apply apply +) { + using idx_type = std::invoke_result_t; + + // process 1 element + auto process1 = [&](const size_t idx) { + const auto remapped_idx = remapper(idx); + const float distance = distance1(remapped_idx); + apply(distance, idx); + }; + + // process N elements + auto processN = [&](const std::array indices) { + std::array dis; + std::array remapped_indices; + for (size_t i = 0; i < N; i++) { + remapped_indices[i] = remapper(indices[i]); + } + + distanceN(remapped_indices, dis); + + for (size_t i = 0; i < N; i++) { + apply(dis[i], indices[i]); + } + }; + + // process + buffered_if( + ny, + pred, + process1, + processN + ); +} + +// an internal implementation +namespace { +// compute ny inner product between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Maps an iteration for-loop index to a database index. + // It is needed for calls with indirect indexing like fvec_L2sqr_by_idx(). + // auto IndexRemapper(const size_t idx); + typename IndexRemapper, + // Apply an element. 
+ // void Apply(const float dis, const auto idx); + typename Apply> +void internal_fvec_inner_products_ny_if( + const float* __restrict x, + const float* __restrict y, + size_t d, + const size_t ny, + Pred pred, + IndexRemapper remapper, + Apply apply) { + using idx_type = std::invoke_result_t; + + // compute a distance from the query to 1 element + auto distance1 = [x, y, d](const idx_type idx) { + return fvec_inner_product(x, y + idx * d, d); + }; + + // compute distances from the query to 4 elements + auto distance4 = [x, y, d](const std::array indices, std::array& dis) { + fvec_inner_product_batch_4( + x, + y + indices[0] * d, + y + indices[1] * d, + y + indices[2] * d, + y + indices[3] * d, + d, + dis[0], + dis[1], + dis[2], + dis[3] + ); + }; + + fvec_distance_ny_if( + ny, + pred, + distance1, + distance4, + remapper, + apply + ); +} + +// compute ny square L2 distance between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Maps an iteration for-loop index to a database index. + // It is needed for calls with indirect indexing like fvec_L2sqr_by_idx(). + // auto IndexRemapper(const size_t idx); + typename IndexRemapper, + // Apply an element. + // void Apply(const float dis, const auto idx); + typename Apply> +void internal_fvec_L2sqr_ny_if( + const float* __restrict x, + const float* __restrict y, + size_t d, + const size_t ny, + Pred pred, + IndexRemapper remapper, + Apply apply) { + using idx_type = std::invoke_result_t; + + // compute a distance from the query to 1 element + auto distance1 = [x, y, d](const idx_type idx) { + return fvec_L2sqr(x, y + idx * d, d); + }; + + // compute distances from the query to 4 elements + auto distance4 = [x, y, d](const std::array indices, std::array& dis) { + fvec_L2sqr_batch_4( + x, + y + indices[0] * d, + y + indices[1] * d, + y + indices[2] * d, + y + indices[3] * d, + d, + dis[0], + dis[1], + dis[2], + dis[3] + ); + }; + + fvec_distance_ny_if( + ny, + pred, + distance1, + distance4, + remapper, + apply + ); +} + + +// compute ny distance between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Maps an iteration for-loop index to a database index. + // It is needed for calls with indirect indexing like fvec_L2sqr_by_idx(). + // auto IndexRemapper(const size_t idx); + typename IndexRemapper, + // Apply an element. 
+ // void Apply(const float dis, const idx_t idx); + typename Apply> +void internal_distance_compute_if( + const size_t ny, + DistanceComputer* __restrict dc, + Pred pred, + IndexRemapper remapper, + Apply apply) { + //using idx_type = typename IndexRemapper::idx_type; + using idx_type = std::invoke_result_t; + + // compute a distance from the query to 1 element + auto distance1 = [dc](const idx_type idx) { + return dc->operator()(idx); + }; + + // compute distances from the query to 4 elements + auto distance4 = [dc](const std::array indices, std::array& dis) { + dc->distances_batch_4( + indices[0], + indices[1], + indices[2], + indices[3], + dis[0], + dis[1], + dis[2], + dis[3] + ); + }; + + fvec_distance_ny_if( + ny, + pred, + distance1, + distance4, + remapper, + apply + ); +} + +} + +// compute ny inner product between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. + // void Apply(const float dis, const size_t idx); + typename Apply> +void fvec_inner_products_ny_if( + const float* __restrict x, + const float* __restrict y, + size_t d, + const size_t ny, + Pred pred, + Apply apply) { + internal_fvec_inner_products_ny_if(x, y, d, ny, pred, NoRemapping(), apply); +} + +// compute ny square L2 distance between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. + // void Apply(const float dis, const size_t idx); + typename Apply> +void fvec_L2sqr_ny_if( + const float* __restrict x, + const float* __restrict y, + size_t d, + const size_t ny, + Pred pred, + Apply apply) { + internal_fvec_L2sqr_ny_if(x, y, d, ny, pred, NoRemapping(), apply); +} + +// compute ny inner product between x vectors x and a set of contiguous y vectors +// whose indices are given by idy with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. + // void Apply(const float dis, const int64_t idx); + typename Apply> +void fvec_inner_products_ny_by_idx_if( + const float* __restrict x, + const float* __restrict y, + const int64_t* __restrict ids, /* ids of y vecs */ + size_t d, + const size_t ny, + Pred pred, + Apply apply) { + ByIdxRemapping remapper{ids}; + internal_fvec_inner_products_ny_if(x, y, d, ny, pred, remapper, apply); +} + +// compute ny square L2 distance between x vectors x and a set of contiguous y vectors +// whose indices are given by idy with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. 
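+    // (A minimal, illustrative usage sketch for the contiguous variants
+    // defined above, assuming a query `x`, a contiguous database `y` of
+    // `ny` d-dimensional vectors and a faiss::IDSelector `sel`; the names
+    // `best_dis` / `best_id` are placeholders, not part of this header:
+    //
+    //   float best_dis = std::numeric_limits<float>::max();
+    //   int64_t best_id = -1;
+    //   fvec_L2sqr_ny_if(
+    //       x, y, d, ny,
+    //       // predicate: accept selected ids, never stop early
+    //       [&sel](const size_t j) -> std::optional<bool> {
+    //           return sel.is_member(j);
+    //       },
+    //       // apply: track the closest accepted element
+    //       [&](const float dis, const size_t j) {
+    //           if (dis < best_dis) { best_dis = dis; best_id = (int64_t)j; }
+    //       });
+    // )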
+ // void Apply(const float dis, const int64_t idx); + typename Apply> +void fvec_L2sqr_ny_by_idx_if( + const float* __restrict x, + const float* __restrict y, + const int64_t* __restrict ids, /* ids of y vecs */ + size_t d, + const size_t ny, + Pred pred, + Apply apply) { + ByIdxRemapping remapper{ids}; + internal_fvec_L2sqr_ny_if(x, y, d, ny, pred, remapper, apply); +} + +// compute ny distance between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. + // void Apply(const float dis, const idx_t idx); + typename Apply> +void internal_distance_compute_if( + const idx_t* __restrict query_indices, + const size_t ny, + DistanceComputer* __restrict dc, + Pred pred, + Apply apply) { + // compute a distance from the query to 1 element + auto distance1 = [dc](const idx_t idx) { + return dc->operator()(idx); + }; + + // compute distances from the query to 4 elements + auto distance4 = [dc](const std::array indices, std::array& dis) { + dc->distances_batch_4( + indices[0], + indices[1], + indices[2], + indices[3], + dis[0], + dis[1], + dis[2], + dis[3] + ); + }; + + ByIdxRemapping remapper{query_indices}; + fvec_distance_ny_if( + ny, + pred, + distance1, + distance4, + remapper, + apply + ); +} + +// compute ny distance between x vectors x and a set of contiguous y vectors +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. + // void Apply(const float dis, const idx_t idx); + typename Apply> +void distance_compute_if( + const size_t ny, + DistanceComputer* const __restrict dc, + Pred pred, + Apply apply) { + NoRemapping remapper; + internal_distance_compute_if(ny, dc, pred, remapper, apply); +} + +// compute ny distance between x vectors x and a set of contiguous y vectors +// whose indices are given by query_indices +// with filtering and applying filtered elements. +template< + // A predicate for filtering elements. + // std::optional Pred(const size_t idx); + // * return true to accept an element. + // * return false to reject an element. + // * return std::nullopt to break the iteration loop. + typename Pred, + // Apply an element. 
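+    // (distance_compute_if above follows the same pattern, except that
+    // the distances come from a DistanceComputer: accepted candidates are
+    // gathered and evaluated four at a time through
+    // DistanceComputer::distances_batch_4(), falling back to operator()
+    // for the leftovers. An illustrative call, with `dc` already bound to
+    // a query via set_query() and `sel` a faiss::IDSelector:
+    //
+    //   distance_compute_if(
+    //       ny, dc,
+    //       [&sel](const size_t j) -> std::optional<bool> {
+    //           return sel.is_member(j);
+    //       },
+    //       [&](const float dis, const idx_t j) { /* collect result */ });
+    // )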
+ // void Apply(const float dis, const idx_t idx); + typename Apply> +void distance_compute_by_idx_if( + const idx_t* const __restrict query_indices, + const size_t ny, + DistanceComputer* const __restrict dc, + Pred pred, + Apply apply) { + ByIdxRemapping remapper{query_indices}; + internal_distance_compute_if(ny, dc, pred, remapper, apply); +} + +} //namespace faiss + diff --git a/thirdparty/faiss/faiss/utils/extra_distances-inl.h b/thirdparty/faiss/faiss/utils/extra_distances-inl.h index f371f3e22..4df72b0d7 100644 --- a/thirdparty/faiss/faiss/utils/extra_distances-inl.h +++ b/thirdparty/faiss/faiss/utils/extra_distances-inl.h @@ -11,6 +11,7 @@ #include #include +#include #include namespace faiss { @@ -19,12 +20,13 @@ template struct VectorDistance { size_t d; float metric_arg; + static constexpr bool is_similarity = is_similarity_metric(mt); inline float operator()(const float* x, const float* y) const; // heap template to use for this type of metric using C = typename std::conditional< - mt == METRIC_INNER_PRODUCT, + is_similarity_metric(mt), CMin, CMax>::type; }; @@ -120,19 +122,17 @@ template <> inline float VectorDistance::operator()( const float* x, const float* y) const { + // todo aguzhva: knowhere implementation is different, + // compare ones + + // WARNING: this distance is defined only for positive input vectors. + // Providing vectors with negative values would lead to incorrect results. float accu_num = 0, accu_den = 0; - const float EPSILON = 0.000001; for (size_t i = 0; i < d; i++) { - float xi = x[i], yi = y[i]; - if (fabs (xi - yi) < EPSILON) { - accu_num += xi; - accu_den += xi; - } else { - accu_den += xi; - accu_den += yi; - } + accu_num += fmin(x[i], y[i]); + accu_den += fmax(x[i], y[i]); } - return 1 - accu_num / accu_den; + return accu_num / accu_den; } } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/extra_distances.cpp b/thirdparty/faiss/faiss/utils/extra_distances.cpp index edd2a8a07..520ed6737 100644 --- a/thirdparty/faiss/faiss/utils/extra_distances.cpp +++ b/thirdparty/faiss/faiss/utils/extra_distances.cpp @@ -14,7 +14,9 @@ #include #include +#include #include +#include #include namespace faiss { @@ -49,15 +51,15 @@ void pairwise_extra_distances_template( } } -template +template void knn_extra_metrics_template( VD vd, const float* x, const float* y, size_t nx, size_t ny, - float_maxheap_array_t* res, - const BitsetView bitset) { + HeapArray* res, + const IDSelector* sel = nullptr) { size_t k = res->k; size_t d = vd.d; size_t check_period = InterruptCallback::get_period_hint(ny * d); @@ -74,36 +76,41 @@ void knn_extra_metrics_template( float* simi = res->get_val(i); int64_t* idxi = res->get_ids(i); - maxheap_heapify(k, simi, idxi); + // maxheap_heapify(k, simi, idxi); + heap_heapify(k, simi, idxi); for (j = 0; j < ny; j++) { - if (bitset.empty() || !bitset.test(j)) { + if (!sel || sel->is_member(j)) { float disij = vd(x_i, y_j); - if (disij < simi[0]) { - maxheap_replace_top(k, simi, idxi, disij, j); + // if (disij < simi[0]) { + if ((!vd.is_similarity && (disij < simi[0])) || + (vd.is_similarity && (disij > simi[0]))) { + // maxheap_replace_top(k, simi, idxi, disij, j); + heap_replace_top(k, simi, idxi, disij, j); } } y_j += d; } - maxheap_reorder(k, simi, idxi); + // maxheap_reorder(k, simi, idxi); + heap_reorder(k, simi, idxi); } InterruptCallback::check(); } } template -struct ExtraDistanceComputer : DistanceComputer { +struct ExtraDistanceComputer : FlatCodesDistanceComputer { VD vd; - Index::idx_t nb; + idx_t nb; const float* q; const 
float* b; - float operator()(idx_t i) override { - return vd(q, b + i * vd.d); + float symmetric_dis(idx_t i, idx_t j) final { + return vd(b + j * vd.d, b + i * vd.d); } - float symmetric_dis(idx_t i, idx_t j) override { - return vd(b + j * vd.d, b + i * vd.d); + float distance_to_code(const uint8_t* code) final { + return vd(q, (float*)code); } ExtraDistanceComputer( @@ -111,7 +118,11 @@ struct ExtraDistanceComputer : DistanceComputer { const float* xb, size_t nb, const float* q = nullptr) - : vd(vd), nb(nb), q(q), b(xb) {} + : FlatCodesDistanceComputer((uint8_t*)xb, vd.d * sizeof(float)), + vd(vd), + nb(nb), + q(q), + b(xb) {} void set_query(const float* x) override { q = x; @@ -163,6 +174,7 @@ void pairwise_extra_distances( } } +template void knn_extra_metrics( const float* x, const float* y, @@ -171,13 +183,13 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - float_maxheap_array_t* res, - const BitsetView bitset) { + HeapArray* res, + const IDSelector* sel) { switch (mt) { #define HANDLE_VAR(kw) \ case METRIC_##kw: { \ VectorDistance vd = {(size_t)d, metric_arg}; \ - knn_extra_metrics_template(vd, x, y, nx, ny, res, bitset);\ + knn_extra_metrics_template(vd, x, y, nx, ny, res, sel); \ break; \ } HANDLE_VAR(L2); @@ -194,7 +206,29 @@ void knn_extra_metrics( } } -DistanceComputer* get_extra_distance_computer( +template void knn_extra_metrics>( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + MetricType mt, + float metric_arg, + HeapArray>* res, + const IDSelector* sel = nullptr); + +template void knn_extra_metrics>( + const float* x, + const float* y, + size_t d, + size_t nx, + size_t ny, + MetricType mt, + float metric_arg, + HeapArray>* res, + const IDSelector* sel = nullptr); + +FlatCodesDistanceComputer* get_extra_distance_computer( size_t d, MetricType mt, float metric_arg, diff --git a/thirdparty/faiss/faiss/utils/extra_distances.h b/thirdparty/faiss/faiss/utils/extra_distances.h index 4e3269d8a..800b85a92 100644 --- a/thirdparty/faiss/faiss/utils/extra_distances.h +++ b/thirdparty/faiss/faiss/utils/extra_distances.h @@ -13,11 +13,14 @@ #include #include + #include -#include namespace faiss { +struct FlatCodesDistanceComputer; +struct IDSelector; + void pairwise_extra_distances( int64_t d, int64_t nq, @@ -31,6 +34,7 @@ void pairwise_extra_distances( int64_t ldb = -1, int64_t ldd = -1); +template void knn_extra_metrics( const float* x, const float* y, @@ -39,12 +43,12 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - float_maxheap_array_t* res, - const BitsetView bitset = nullptr); + HeapArray* res, + const IDSelector* sel = nullptr); /** get a DistanceComputer that refers to this type of distance and * indexes a flat array of size nb */ -DistanceComputer* get_extra_distance_computer( +FlatCodesDistanceComputer* get_extra_distance_computer( size_t d, MetricType mt, float metric_arg, diff --git a/thirdparty/faiss/faiss/utils/fp16-fp16c.h b/thirdparty/faiss/faiss/utils/fp16-fp16c.h new file mode 100644 index 000000000..571d52752 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/fp16-fp16c.h @@ -0,0 +1,28 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include + +namespace faiss { + +inline uint16_t encode_fp16(float x) { + __m128 xf = _mm_set1_ps(x); + __m128i xi = + _mm_cvtps_ph(xf, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + return _mm_cvtsi128_si32(xi) & 0xffff; +} + +inline float decode_fp16(uint16_t x) { + __m128i xi = _mm_set1_epi16(x); + __m128 xf = _mm_cvtph_ps(xi); + return _mm_cvtss_f32(xf); +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/fp16-inl.h b/thirdparty/faiss/faiss/utils/fp16-inl.h new file mode 100644 index 000000000..c07d36f5e --- /dev/null +++ b/thirdparty/faiss/faiss/utils/fp16-inl.h @@ -0,0 +1,108 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace faiss { + +// non-intrinsic FP16 <-> FP32 code adapted from +// https://github.com/ispc/ispc/blob/master/stdlib.ispc + +namespace { + +inline float floatbits(uint32_t x) { + void* xptr = &x; + return *(float*)xptr; +} + +inline uint32_t intbits(float f) { + void* fptr = &f; + return *(uint32_t*)fptr; +} + +} // namespace + +inline uint16_t encode_fp16(float f) { + // via Fabian "ryg" Giesen. + // https://gist.github.com/2156668 + uint32_t sign_mask = 0x80000000u; + int32_t o; + + uint32_t fint = intbits(f); + uint32_t sign = fint & sign_mask; + fint ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code (since + // there's no unsigned PCMPGTD). + + // Inf or NaN (all exponent bits set) + // NaN->qNaN and Inf->Inf + // unconditional assignment here, will override with right value for + // the regular case below. + uint32_t f32infty = 255u << 23; + o = (fint > f32infty) ? 0x7e00u : 0x7c00u; + + // (De)normalized number or zero + // update fint unconditionally to save the blending; we don't need it + // anymore for the Inf/NaN case anyway. + + const uint32_t round_mask = ~0xfffu; + const uint32_t magic = 15u << 23; + + // Shift exponent down, denormalize if necessary. + // NOTE This represents half-float denormals using single + // precision denormals. The main reason to do this is that + // there's no shift with per-lane variable shifts in SSE*, which + // we'd otherwise need. It has some funky side effects though: + // - This conversion will actually respect the FTZ (Flush To Zero) + // flag in MXCSR - if it's set, no half-float denormals will be + // generated. I'm honestly not sure whether this is good or + // bad. It's definitely interesting. + // - If the underlying HW doesn't support denormals (not an issue + // with Intel CPUs, but might be a problem on GPUs or PS3 SPUs), + // you will always get flush-to-zero behavior. This is bad, + // unless you're on a CPU where you don't care. + // - Denormals tend to be slow. FP32 denormals are rare in + // practice outside of things like recursive filters in DSP - + // not a typical half-float application. Whether FP16 denormals + // are rare in practice, I don't know. Whatever slow path your + // HW may or may not have for denormals, this may well hit it. + float fscale = floatbits(fint & round_mask) * floatbits(magic); + fscale = std::min(fscale, floatbits((31u << 23) - 0x1000u)); + int32_t fint2 = intbits(fscale) - round_mask; + + if (fint < f32infty) + o = fint2 >> 13; // Take the bits! 
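+    // At this point `o` holds the half-precision exponent/mantissa bits
+    // (or the Inf/NaN pattern chosen above); the return below only needs
+    // to merge the sign bit back in, moved from bit 31 of the float to
+    // bit 15 of the half. For example, 1.0f (0x3f800000) encodes to 0x3c00.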
+ + return (o | (sign >> 16)); +} + +inline float decode_fp16(uint16_t h) { + // https://gist.github.com/2144712 + // Fabian "ryg" Giesen. + + const uint32_t shifted_exp = 0x7c00u << 13; // exponent mask after shift + + int32_t o = ((int32_t)(h & 0x7fffu)) << 13; // exponent/mantissa bits + int32_t exp = shifted_exp & o; // just the exponent + o += (int32_t)(127 - 15) << 23; // exponent adjust + + int32_t infnan_val = o + ((int32_t)(128 - 16) << 23); + int32_t zerodenorm_val = + intbits(floatbits(o + (1u << 23)) - floatbits(113u << 23)); + int32_t reg_val = (exp == 0) ? zerodenorm_val : o; + + int32_t sign_bit = ((int32_t)(h & 0x8000u)) << 16; + return floatbits(((exp == shifted_exp) ? infnan_val : reg_val) | sign_bit); +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/fp16.h b/thirdparty/faiss/faiss/utils/fp16.h new file mode 100644 index 000000000..90691d8ff --- /dev/null +++ b/thirdparty/faiss/faiss/utils/fp16.h @@ -0,0 +1,18 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +#if defined(__F16C__) +#include +#else +#include +#endif diff --git a/thirdparty/faiss/faiss/utils/hamming-inl.h b/thirdparty/faiss/faiss/utils/hamming-inl.h index 2f1ac9ca7..5bfd3153b 100644 --- a/thirdparty/faiss/faiss/utils/hamming-inl.h +++ b/thirdparty/faiss/faiss/utils/hamming-inl.h @@ -9,8 +9,7 @@ namespace faiss { -extern const uint8_t hamdis_tab_ham_bytes[256]; - +// BitstringWriter and BitstringReader functions inline BitstringWriter::BitstringWriter(uint8_t* code, size_t code_size) : code(code), code_size(code_size), i(0) { memset(code, 0, code_size); @@ -67,296 +66,6 @@ inline uint64_t BitstringReader::read(int nbit) { } } -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int compute(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int compute(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. 
-// This incurs a penalty of ~10% wrt. fully aligned accesses. -struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 64); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int n; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - this->a8 = a8; - this->n = code_size; - } - - int compute(const uint8_t* b8) const { - return xor_popcnt(a8, b8, n); - } -}; - -/*************************************************************************** - * Equivalence with a template class when code size is known at compile time - **************************************************************************/ - -// default template -template -struct HammingComputer : HammingComputerDefault { - HammingComputer(const uint8_t* a, int code_size) - : HammingComputerDefault(a, code_size) {} -}; - -#define SPECIALIZED_HC(CODE_SIZE) \ - template <> \ - struct HammingComputer : HammingComputer##CODE_SIZE { \ - HammingComputer(const uint8_t* a) \ - : HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \ - } - -SPECIALIZED_HC(4); -SPECIALIZED_HC(8); -SPECIALIZED_HC(16); -SPECIALIZED_HC(20); -SPECIALIZED_HC(32); -SPECIALIZED_HC(64); - -#undef SPECIALIZED_HC - -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. 
- ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int compute(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } -}; - -struct GenHammingComputer16 { - uint64_t a0, a1; - GenHammingComputer16(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return generalized_hamming_64(b[0] ^ a0) + - generalized_hamming_64(b[1] ^ a1); - } -}; - -struct GenHammingComputer32 { - uint64_t a0, a1, a2, a3; - - GenHammingComputer32(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return generalized_hamming_64(b[0] ^ a0) + - generalized_hamming_64(b[1] ^ a1) + - generalized_hamming_64(b[2] ^ a2) + - generalized_hamming_64(b[3] ^ a3); - } -}; - -struct GenHammingComputerM8 { - const uint64_t* a; - int n; - - GenHammingComputerM8(const uint8_t* a8, int code_size) { - assert(code_size % 8 == 0); - a = (uint64_t*)a8; - n = code_size / 8; - } - - int compute(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - int accu = 0; - for (int i = 0; i < n; i++) - accu += generalized_hamming_64(a[i] ^ b[i]); - return accu; - } -}; - -/** generalized Hamming distances (= count number of code bytes that - are the same) */ -void generalized_hammings_knn_hc( - int_maxheap_array_t* ha, - const uint8_t* a, - const uint8_t* b, - size_t nb, - size_t code_size, - int ordered = true); - /** This class maintains a list of best distances seen so far. * * Since the distances are in a limited range (0 to nbit), the diff --git a/thirdparty/faiss/faiss/utils/hamming.cpp b/thirdparty/faiss/faiss/utils/hamming.cpp index 1b40c6290..00ac8dc7a 100644 --- a/thirdparty/faiss/faiss/utils/hamming.cpp +++ b/thirdparty/faiss/faiss/utils/hamming.cpp @@ -5,14 +5,13 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - /* * Implementation of Hamming related functions (distances, smallest distance * selection with regular heap|radix and probabilistic heap|radix. * * IMPLEMENTATION NOTES - * Bitvectors are generally assumed to be multiples of 64 bits. + * Optimal speed is typically obtained for vector sizes of multiples of 64 + * bits. 
* * hamdis_t is used for distances because at this time * it is not clear how we will need to balance @@ -20,86 +19,36 @@ * - memory usage * - cache-misses when dealing with large volumes of data (lower bits is better) * - * The hamdis_t should optimally be compatibe with one of the Torch Storage - * (Byte,Short,Long) and therefore should be signed for 2-bytes and 4-bytes */ #include -#include -#include -#include #include +#include +#include #include #include #include #include +#include #include +#include #include static const size_t BLOCKSIZE_QUERY = 8192; -static const size_t size_1M = 1 * 1024 * 1024; namespace faiss { size_t hamming_batch_size = 65536; -/* Elementary Hamming distance computation: unoptimized */ -template -T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) - h += (T)lookup8bit[bs1[i] ^ bs2[i]]; - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) - h += popcount64(bs1[i] ^ bs2[i]); - return h; -} - -/* specialized (optimized) functions */ -template <> -hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2, size_t nwords) { - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) - h += popcount64(bs1[i] ^ bs2[i]); - return h; -} - template void hammings( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, - hamdis_t* dis) + hamdis_t* __restrict dis) { size_t i, j; @@ -113,8 +62,8 @@ void hammings( } void hammings( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, size_t nwords, @@ -132,12 +81,12 @@ void hammings( /* Count number of matches given a max threshold */ template void hamming_count_thres( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, hamdis_t ht, - size_t* nptr) { + size_t* __restrict nptr) { const size_t nwords = nbits / 64; size_t i, j, posm = 0; const uint64_t* bs2_ = bs2; @@ -157,10 +106,10 @@ void hamming_count_thres( template void crosshamming_count_thres( - const uint64_t* dbs, + const uint64_t* __restrict dbs, size_t n, int ht, - size_t* nptr) { + size_t* __restrict nptr) { const size_t nwords = nbits / 64; size_t i, j, posm = 0; const uint64_t* bs1 = dbs; @@ -179,13 +128,13 @@ void crosshamming_count_thres( template size_t match_hamming_thres( - const uint64_t* bs1, - const uint64_t* bs2, + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, size_t n1, size_t n2, int ht, - int64_t* idx, - hamdis_t* hams) { + int64_t* __restrict idx, + hamdis_t* __restrict hams) { const size_t nwords = nbits / 64; size_t i, j, posm = 0; hamdis_t h; @@ -214,6 +163,190 @@ size_t match_hamming_thres( return posm; } +namespace { + 
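+// The kernels in this anonymous namespace are templated on the concrete
+// HammingComputer type; the public entry points below instantiate them
+// through the Run_hammings_knn_hc / Run_hammings_knn_mc /
+// Run_hamming_range_search functors handed to dispatch_HammingComputer,
+// which selects the specialization matching the code size.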
+/* Return closest neighbors w.r.t Hamming distance, using a heap. */ +template +void hammings_knn_hc( + int bytes_per_code, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict bs1, + const uint8_t* __restrict bs2, + size_t n2, + bool order = true, + bool init_heap = true, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK) { + size_t k = ha->k; + if (init_heap) + ha->heapify(); + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < n2; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, n2); +#pragma omp parallel for + for (int64_t i = 0; i < ha->nh; i++) { + HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); + + const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; + hamdis_t dis; + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; + + // if larger number of k is required, then ::bs_addn() needs to be + // used instead of ::addn() +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + FAISS_THROW_IF_NOT_FMT( \ + k <= NB * BD, \ + "The chosen mode (%d) of approximate top-k supports " \ + "up to %d values, but %zd is requested.", \ + (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ + NB * BD, \ + k); \ + HeapWithBucketsForHamming32< \ + CMax, \ + NB, \ + BD, \ + HammingComputer>:: \ + addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: { + for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + dis = hc.compute(bs2_); + if (dis < bh_val_[0]) { + faiss::maxheap_replace_top( + k, bh_val_, bh_ids_, dis, j); + } + } + } break; + } + } + } + if (order) + ha->reorder(); +} + +/* Return closest neighbors w.r.t Hamming distance, using max count. 
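+   Candidates are not kept in a heap; instead, for each query a counter is
+   kept for every possible distance value (0 .. 8 * bytes_per_code), with
+   k id slots per distance bucket. Results are then emitted from the
+   smallest distance bucket upwards, padding with -1 labels when fewer
+   than k matches exist.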
*/ +template +void hammings_knn_mc( + int bytes_per_code, + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t na, + size_t nb, + size_t k, + int32_t* __restrict distances, + int64_t* __restrict labels) { + const int nBuckets = bytes_per_code * 8 + 1; + std::vector all_counters(na * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); + + std::vector> cs; + for (size_t i = 0; i < na; ++i) { + cs.push_back(HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + a + i * bytes_per_code, + 8 * bytes_per_code, + k)); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < nb; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, nb); +#pragma omp parallel for + for (int64_t i = 0; i < na; ++i) { + for (size_t j = j0; j < j1; ++j) { + cs[i].update_counter(b + j * bytes_per_code, j); + } + } + } + + for (size_t i = 0; i < na; ++i) { + HCounterState& csi = cs[i]; + + int nres = 0; + for (int b = 0; b < nBuckets && nres < k; b++) { + for (int l = 0; l < csi.counters[b] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b * k + l]; + distances[i * k + nres] = b; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } +} + +template +void hamming_range_search( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* res, + const IDSelector* sel = nullptr) { +#pragma omp parallel + { + RangeSearchPartialResult pres(res); + +#pragma omp for + for (int64_t i = 0; i < na; i++) { + HammingComputer hc(a + i * code_size, code_size); + const uint8_t* yi = b; + RangeQueryResult& qres = pres.new_result(i); + + for (size_t j = 0; j < nb; j++) { + if (!sel || sel->is_member(j)) { + int dis = hc.compute(yi); + if (dis < radius) { + qres.add(dis, j); + } + } + yi += code_size; + } + } + pres.finalize(); + } +} + +struct Run_hammings_knn_hc { + using T = void; + template + void f(Types... args) { + hammings_knn_hc(args...); + } +}; + +struct Run_hammings_knn_mc { + using T = void; + template + void f(Types... args) { + hammings_knn_mc(args...); + } +}; + +struct Run_hamming_range_search { + using T = void; + template + void f(Types... args) { + hamming_range_search(args...); + } +}; + +} // namespace + /* Functions to maps vectors to bits. Assume proper allocation done beforehand, meaning that b should be be able to receive as many bits as x may produce. */ @@ -221,7 +354,7 @@ size_t match_hamming_thres( * dimension 0 corresponds to the least significant bit of b[0], or * equivalently to the lsb of the first byte that is stored. */ -void fvec2bitvec(const float* x, uint8_t* b, size_t d) { +void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { for (int i = 0; i < d; i += 8) { uint8_t w = 0; uint8_t mask = 1; @@ -238,14 +371,22 @@ void fvec2bitvec(const float* x, uint8_t* b, size_t d) { /* Same but for n vectors. Ensure that the ouptut b is byte-aligned (pad with 0s). 
*/ -void fvecs2bitvecs(const float* x, uint8_t* b, size_t d, size_t n) { +void fvecs2bitvecs( + const float* __restrict x, + uint8_t* __restrict b, + size_t d, + size_t n) { const int64_t ncodes = ((d + 7) / 8); #pragma omp parallel for if (n > 100000) for (int64_t i = 0; i < n; i++) fvec2bitvec(x + i * d, b + i * ncodes, d); } -void bitvecs2fvecs(const uint8_t* b, float* x, size_t d, size_t n) { +void bitvecs2fvecs( + const uint8_t* __restrict b, + float* __restrict x, + size_t d, + size_t n) { const int64_t ncodes = ((d + 7) / 8); #pragma omp parallel for if (n > 100000) for (int64_t i = 0; i < n; i++) { @@ -283,9 +424,9 @@ void bitvec_shuffle( size_t n, size_t da, size_t db, - const int* order, - const uint8_t* a, - uint8_t* b) { + const int* __restrict order, + const uint8_t* __restrict a, + uint8_t* __restrict b) { for (size_t i = 0; i < db; i++) { FAISS_THROW_IF_NOT(order[i] >= 0 && order[i] < da); } @@ -312,8 +453,8 @@ void bitvec_shuffle( /* Compute a set of Hamming distances */ void hammings( - const uint8_t* a, - const uint8_t* b, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t na, size_t nb, size_t ncodes, @@ -338,38 +479,41 @@ void hammings( } } -template -static void hamming_range_search_template( - const uint8_t* a, - const uint8_t* b, - size_t na, +void hammings_knn( + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t nb, - int radius, - size_t code_size, - RangeSearchResult* res, - const BitsetView bitset = nullptr) { -#pragma omp parallel - { - RangeSearchPartialResult pres(res); + size_t ncodes, + int order) { + hammings_knn_hc(ha, a, b, nb, ncodes, order); +} -#pragma omp for - for (int64_t i = 0; i < na; i++) { - HammingComputer hc(a + i * code_size, code_size); - const uint8_t* yi = b; - RangeQueryResult& qres = pres.new_result(i); +void hammings_knn_hc( + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t nb, + size_t ncodes, + int order, + ApproxTopK_mode_t approx_topk_mode) { + Run_hammings_knn_hc r; + dispatch_HammingComputer( + ncodes, r, ncodes, ha, a, b, nb, order, true, approx_topk_mode); +} - for (size_t j = 0; j < nb; j++) { - if (bitset.empty() || !bitset.test(j)) { - int dis = hc.compute(yi); - if (dis < radius) { - qres.add(dis, j); - } - } - yi += code_size; - } - } - pres.finalize(); - } +void hammings_knn_mc( + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t* __restrict distances, + int64_t* __restrict labels) { + Run_hammings_knn_mc r; + dispatch_HammingComputer( + ncodes, r, ncodes, a, b, na, nb, k, distances, labels); } void hamming_range_search( @@ -380,29 +524,10 @@ void hamming_range_search( int radius, size_t code_size, RangeSearchResult* result, - const BitsetView bitset = nullptr) { -#define HC(name) \ - hamming_range_search_template( \ - a, b, na, nb, radius, code_size, result, bitset) - - switch (code_size) { - case 4: - HC(HammingComputer4); - break; - case 8: - HC(HammingComputer8); - break; - case 16: - HC(HammingComputer16); - break; - case 32: - HC(HammingComputer32); - break; - default: - HC(HammingComputerDefault); - break; - } -#undef HC + const IDSelector* sel) { + Run_hamming_range_search r; + dispatch_HammingComputer( + code_size, r, a, b, na, nb, radius, code_size, result, sel); } /* Count number of matches given a max threshold */ @@ -498,13 +623,13 @@ size_t match_hamming_thres( template static void hamming_dis_inner_loop( 
- const uint8_t* ca, - const uint8_t* cb, + const uint8_t* __restrict ca, + const uint8_t* __restrict cb, size_t nb, size_t code_size, int k, - hamdis_t* bh_val_, - int64_t* bh_ids_) { + hamdis_t* __restrict bh_val_, + int64_t* __restrict bh_ids_) { HammingComputer hc(ca, code_size); for (size_t j = 0; j < nb; j++) { @@ -517,9 +642,9 @@ static void hamming_dis_inner_loop( } void generalized_hammings_knn_hc( - int_maxheap_array_t* ha, - const uint8_t* a, - const uint8_t* b, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, size_t nb, size_t code_size, int ordered) { @@ -531,11 +656,11 @@ void generalized_hammings_knn_hc( #pragma omp parallel for for (int i = 0; i < na; i++) { - const uint8_t* ca = a + i * code_size; - const uint8_t* cb = b; + const uint8_t* __restrict ca = a + i * code_size; + const uint8_t* __restrict cb = b; - hamdis_t* bh_val_ = ha->val + i * k; - int64_t* bh_ids_ = ha->ids + i * k; + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; switch (code_size) { case 8: diff --git a/thirdparty/faiss/faiss/utils/hamming.h b/thirdparty/faiss/faiss/utils/hamming.h index c7dda71f2..661b49e49 100644 --- a/thirdparty/faiss/faiss/utils/hamming.h +++ b/thirdparty/faiss/faiss/utils/hamming.h @@ -19,6 +19,7 @@ * - memory usage * - cache-misses when dealing with large volumes of data (fewer bits is better) * + * hamdis_t is defined in utils/hamming_distance/common.h */ #ifndef FAISS_hamming_h @@ -26,16 +27,18 @@ #include -#include #include #include -#include -/* The Hamming distance type */ -typedef int32_t hamdis_t; +// Low-level Hamming distance computations and hamdis_t. +#include + +#include namespace faiss { -extern uint8_t lookup8bit[256]; + +struct IDSelector; + /************************************************** * General bit vector functions **************************************************/ @@ -101,10 +104,6 @@ struct BitstringReader { FAISS_API extern size_t hamming_batch_size; -inline int popcount64(uint64_t x) { - return __builtin_popcountl(x); -} - /** Compute a set of Hamming distances between na and nb binary vectors * * @param a size na * nbytespercode @@ -127,14 +126,18 @@ void hammings( * @param nb number of database vectors * @param ncodes size of the binary codes (bytes) * @param ordered if != 0: order the results by decreasing distance - * (may be bottleneck for k/n > 0.01) */ + * (may be bottleneck for k/n > 0.01) + * @param approx_topk_mode allows to use approximate top-k facilities + * to speedup heap + */ void hammings_knn_hc( int_maxheap_array_t* ha, const uint8_t* a, const uint8_t* b, size_t nb, size_t ncodes, - int ordered); + int ordered, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK); /* Legacy alias to hammings_knn_hc. 
*/ void hammings_knn( @@ -175,9 +178,8 @@ void hamming_range_search( size_t nb, int radius, size_t ncodes, - std::vector& result, - size_t buffer_size, - const BitsetView bitset = nullptr); + RangeSearchResult* result, + const IDSelector* sel = nullptr); /* Counting the number of matches or of cross-matches (without returning them) For use with function that assume pre-allocated memory */ @@ -213,9 +215,17 @@ void crosshamming_count_thres( /* compute the Hamming distances between two codewords of nwords*64 bits */ hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2, size_t nwords); -} // namespace faiss +/** generalized Hamming distances (= count number of code bytes that + are the same) */ +void generalized_hammings_knn_hc( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t code_size, + int ordered = true); -// inlined definitions of HammingComputerXX and GenHammingComputerXX +} // namespace faiss #include diff --git a/thirdparty/faiss/faiss/utils/hamming_distance/avx2-inl.h b/thirdparty/faiss/faiss/utils/hamming_distance/avx2-inl.h new file mode 100644 index 000000000..54aabb99f --- /dev/null +++ b/thirdparty/faiss/faiss/utils/hamming_distance/avx2-inl.h @@ -0,0 +1,462 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_AVX2_INL_H +#define HAMMING_AVX2_INL_H + +// AVX2 version + +#include +#include +#include + +#include + +#include + +namespace faiss { + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
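+ *
+ * Compared to the variants formerly inlined in hamming-inl.h, each
+ * computer here additionally exposes get_code_size(), reporting the
+ * code size (in bytes) that it handles.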
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int compute(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int compute(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. +struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*(uint32_t*)(b + 2) ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct 
HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + this->a8 = a8; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int compute(const uint8_t* b8) const { + int accu = 0; + + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + int i = 0, len = quotient8; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + [[fallthrough]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + [[fallthrough]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + [[fallthrough]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + [[fallthrough]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + [[fallthrough]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + [[fallthrough]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + [[fallthrough]]; + default: + break; + } + } + + return accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. 
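+ * (each byte that differs between the two codes contributes exactly 1;
+ * e.g. generalized_hamming_64(a ^ b) == 2 when a and b differ in exactly
+ * two bytes, regardless of how many bits differ inside those bytes)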
+ ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int compute(const uint8_t* b) const { + return generalized_hamming_64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +// I'm not sure whether this version is faster of slower, tbh +// todo: test on different CPUs +struct GenHammingComputer16 { + __m128i a; + + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a = _mm_loadu_si128((const __m128i_u*)a8); + } + + inline int compute(const uint8_t* b8) const { + const __m128i b = _mm_loadu_si128((const __m128i_u*)b8); + const __m128i cmp = _mm_cmpeq_epi8(a, b); + const auto movemask = _mm_movemask_epi8(cmp); + return 16 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + __m256i a; + + GenHammingComputer32(const uint8_t* a8, int code_size) { + assert(code_size == 32); + a = _mm256_loadu_si256((const __m256i_u*)a8); + } + + inline int compute(const uint8_t* b8) const { + const __m256i b = _mm256_loadu_si256((const __m256i_u*)b8); + const __m256i cmp = _mm256_cmpeq_epi8(a, b); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + return 32 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +// A specialized version might be needed for the very long +// GenHamming code_size. In such a case, one may accumulate +// counts using _mm256_sub_epi8 and then compute a horizontal +// sum (using _mm256_sad_epu8, maybe, in blocks of no larger +// than 256 * 32 bytes). + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + + int i = 0; + int n4 = (n / 4) * 4; + for (; i < n4; i += 4) { + const __m256i av = _mm256_loadu_si256((const __m256i_u*)(a + i)); + const __m256i bv = _mm256_loadu_si256((const __m256i_u*)(b + i)); + const __m256i cmp = _mm256_cmpeq_epi8(av, bv); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + accu += 32 - popcount32(movemask); + } + + for (; i < n; i++) + accu += generalized_hamming_64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/hamming_distance/common.h b/thirdparty/faiss/faiss/utils/hamming_distance/common.h new file mode 100644 index 000000000..0a2de08d1 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/hamming_distance/common.h @@ -0,0 +1,49 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef FAISS_hamming_common_h +#define FAISS_hamming_common_h + +#include + +#include + +/* The Hamming distance type */ +using hamdis_t = int32_t; + +namespace faiss { + +// trust the compiler to provide efficient popcount implementations +inline int popcount32(uint32_t x) { + return __builtin_popcount(x); +} + +// popcount +inline int popcount64(uint64_t x) { + return __builtin_popcountl(x); +} + +// This table was moved from .cpp to .h file, because +// otherwise it was causing compilation errors while trying to +// compile swig modules on Windows. +// todo for C++17: switch to 'inline constexpr' +static constexpr uint8_t hamdis_tab_ham_bytes[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, + 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/hamming_distance/generic-inl.h b/thirdparty/faiss/faiss/utils/hamming_distance/generic-inl.h new file mode 100644 index 000000000..e810c9be6 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/hamming_distance/generic-inl.h @@ -0,0 +1,432 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_GENERIC_INL_H +#define HAMMING_GENERIC_INL_H + +// A general-purpose version of hamming distance computation. 
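+// It provides the same set of HammingComputer* / GenHammingComputer*
+// classes as the SIMD-specific headers (e.g. avx2-inl.h), implemented
+// with plain popcount64() so it can serve as the portable fallback.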
+ +#include +#include +#include + +#include + +namespace faiss { + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. + ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int compute(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int compute(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. 
+struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*(uint32_t*)(b + 2) ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + this->a8 = a8; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int compute(const uint8_t* b8) const { + int accu = 0; + + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + int i = 0, len = quotient8; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + default: + break; + } + } + + return 
accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int compute(const uint8_t* b) const { + return generalized_hamming_64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct GenHammingComputer16 { + uint64_t a0, a1; + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return generalized_hamming_64(b[0] ^ a0) + + generalized_hamming_64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + uint64_t a0, a1, a2, a3; + + GenHammingComputer32(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return generalized_hamming_64(b[0] ^ a0) + + generalized_hamming_64(b[1] ^ a1) + + generalized_hamming_64(b[2] ^ a2) + + generalized_hamming_64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + for (int i = 0; i < n; i++) + accu += generalized_hamming_64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/hamming_distance/hamdis-inl.h b/thirdparty/faiss/faiss/utils/hamming_distance/hamdis-inl.h new file mode 100644 index 000000000..b830df38b --- /dev/null +++ b/thirdparty/faiss/faiss/utils/hamming_distance/hamdis-inl.h @@ -0,0 +1,83 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This file contains low level inline facilities for computing +// Hamming distances, such as HammingComputerXX and GenHammingComputerXX. + +#ifndef FAISS_hamming_inl_h +#define FAISS_hamming_inl_h + +#include + +#ifdef __aarch64__ +// ARM compilers may produce inoptimal code for Hamming distance somewhy. 
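A worked check of the bit trick in generalized_hamming_64() above: after the three OR-shifts, bit 0 of every byte is set iff that byte was nonzero, so masking with 0x0101...01 and popcounting yields the number of nonzero bytes; applied to the XOR of two codes this counts the bytes where they differ. This is a self-contained sketch with illustrative names, not faiss API.

#include <cassert>
#include <cstdint>

static int count_nonzero_bytes(uint64_t w) {
    w |= w >> 1;
    w |= w >> 2;
    w |= w >> 4;                    // bit 0 of each byte = OR of that byte's bits
    w &= 0x0101010101010101ULL;     // keep one flag bit per byte
    return __builtin_popcountll(w); // number of flagged (nonzero) bytes
}

int main() {
    uint64_t a = 0x00FF10000000AB01ULL;
    uint64_t b = 0x00FF100000000001ULL;
    // only one byte (0xAB vs 0x00) differs between a and b
    assert(count_nonzero_bytes(a ^ b) == 1);
    return 0;
}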
+#include +#elif __AVX2__ +// better versions for GenHammingComputer +#include +#else +#include +#endif + +namespace faiss { + +/*************************************************************************** + * Equivalence with a template class when code size is known at compile time + **************************************************************************/ + +// default template +template +struct HammingComputer : HammingComputerDefault { + HammingComputer(const uint8_t* a, int code_size) + : HammingComputerDefault(a, code_size) {} +}; + +#define SPECIALIZED_HC(CODE_SIZE) \ + template <> \ + struct HammingComputer : HammingComputer##CODE_SIZE { \ + HammingComputer(const uint8_t* a) \ + : HammingComputer##CODE_SIZE(a, CODE_SIZE) {} \ + } + +SPECIALIZED_HC(4); +SPECIALIZED_HC(8); +SPECIALIZED_HC(16); +SPECIALIZED_HC(20); +SPECIALIZED_HC(32); +SPECIALIZED_HC(64); + +#undef SPECIALIZED_HC + +/*************************************************************************** + * Dispatching function that takes a code size and a consumer object + * the consumer object should contain a retun type t and a operation template + * function f() that to be called to perform the operation. + **************************************************************************/ + +template +typename Consumer::T dispatch_HammingComputer( + int code_size, + Consumer& consumer, + Types... args) { + switch (code_size) { +#define DISPATCH_HC(CODE_SIZE) \ + case CODE_SIZE: \ + return consumer.template f(args...); + DISPATCH_HC(4); + DISPATCH_HC(8); + DISPATCH_HC(16); + DISPATCH_HC(20); + DISPATCH_HC(32); + DISPATCH_HC(64); + default: + return consumer.template f(args...); + } +} + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/hamming_distance/neon-inl.h b/thirdparty/faiss/faiss/utils/hamming_distance/neon-inl.h new file mode 100644 index 000000000..4669c9140 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/hamming_distance/neon-inl.h @@ -0,0 +1,511 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_NEON_INL_H +#define HAMMING_NEON_INL_H + +// a specialized version of hamming is needed here, because both +// gcc, clang and msvc seem to generate suboptimal code sometimes. 
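A sketch of a "consumer" for dispatch_HammingComputer() above: the consumer exposes a result type T and a member template f() that the dispatcher instantiates with the HammingComputer matching the runtime code size. The consumer below, its name and the scan it performs are illustrative; only the Consumer::T / f() contract and the computer constructor/compute() come from the code above.

#include <cstddef>
#include <cstdint>

struct CountWithinRadius {
    using T = size_t; // result type required by dispatch_HammingComputer

    const uint8_t* query;
    int code_size;

    template <class HammingComputerT>
    T f(const uint8_t* codes, size_t n, int radius) {
        HammingComputerT hc(query, code_size);
        size_t count = 0;
        for (size_t i = 0; i < n; i++) {
            if (hc.compute(codes + i * (size_t)code_size) < radius) {
                count++;
            }
        }
        return count;
    }
};

// hypothetical usage:
//   CountWithinRadius consumer{query, code_size};
//   size_t hits = faiss::dispatch_HammingComputer(
//           code_size, consumer, codes, n, radius);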
+ +#ifdef __aarch64__ + +#include + +#include +#include +#include + +#include + +#include + +namespace faiss { + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb) { + constexpr size_t nwords256 = nbits / 256; + constexpr size_t nwords128 = (nbits - nwords256 * 256) / 128; + constexpr size_t nwords64 = + (nbits - nwords256 * 256 - nwords128 * 128) / 64; + + hamdis_t h = 0; + if (nwords256 > 0) { + for (size_t i = 0; i < nwords256; i++) { + h += hamming<256>(pa, pb); + pa += 4; + pb += 4; + } + } + + if (nwords128 > 0) { + h += hamming<128>(pa, pb); + pa += 2; + pb += 2; + } + + if (nwords64 > 0) { + h += hamming<64>(pa, pb); + } + + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + const uint8_t* pa8 = reinterpret_cast(pa); + const uint8_t* pb8 = reinterpret_cast(pb); + uint8x16_t or0 = veorq_u8(vld1q_u8(pa8), vld1q_u8(pb8)); + uint8x16_t c0 = vcntq_u8(or0); + auto dis = vaddvq_u8(c0); + return dis; +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + const uint8_t* pa8 = reinterpret_cast(pa); + const uint8_t* pb8 = reinterpret_cast(pb); + uint8x16_t or0 = veorq_u8(vld1q_u8(pa8), vld1q_u8(pb8)); + uint8x16_t or1 = veorq_u8(vld1q_u8(pa8 + 16), vld1q_u8(pb8 + 16)); + uint8x16_t c0 = vcntq_u8(or0); + uint8x16_t c1 = vcntq_u8(or1); + uint8x16_t ca = vpaddq_u8(c0, c1); + auto dis = vaddvq_u8(ca); + return dis; +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) { + const size_t nwords256 = nwords / 256; + const size_t nwords128 = (nwords - nwords256 * 256) / 128; + const size_t nwords64 = (nwords - nwords256 * 256 - nwords128 * 128) / 64; + + hamdis_t h = 0; + if (nwords256 > 0) { + for (size_t i = 0; i < nwords256; i++) { + h += hamming<256>(pa, pb); + pa += 4; + pb += 4; + } + } + + if (nwords128 > 0) { + h += hamming<128>(pa, pb); + pa += 2; + pb += 2; + } + + if (nwords64 > 0) { + h += hamming<64>(pa, pb); + } + + return h; +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int compute(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int compute(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint8x16_t a0; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a0 = vld1q_u8(a8); + } + + inline int compute(const uint8_t* b8) const { + uint8x16_t b0 = vld1q_u8(b8); + + uint8x16_t or0 = veorq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(or0); + auto dis = vaddvq_u8(c0); + return dis; + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. +struct HammingComputer20 { + uint8x16_t a0; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + + a0 = vld1q_u8(a8); + + const uint32_t* a = (uint32_t*)a8; + a2 = a[4]; + } + + inline int compute(const uint8_t* b8) const { + uint8x16_t b0 = vld1q_u8(b8); + + uint8x16_t or0 = veorq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(or0); + auto dis = vaddvq_u8(c0); + + const uint32_t* b = (uint32_t*)b8; + return dis + popcount64(b[4] ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint8x16_t a0; + uint8x16_t a1; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + a0 = vld1q_u8(a8); + a1 = vld1q_u8(a8 + 16); + } + + inline int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + uint8x16_t b0 = vld1q_u8(b8); + uint8x16_t b1 = vld1q_u8(b8 + 16); + + uint8x16_t or0 = veorq_u8(a0, b0); + uint8x16_t or1 = veorq_u8(a1, b1); + uint8x16_t c0 = vcntq_u8(or0); + uint8x16_t c1 = vcntq_u8(or1); + uint8x16_t ca = vpaddq_u8(c0, c1); + auto dis = vaddvq_u8(ca); + return dis; + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + HammingComputer32 hc0, hc1; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + hc0.set(a8, 32); + hc1.set(a8 + 32, 32); + } + + inline int compute(const uint8_t* b8) const { + return hc0.hamming(b8) + hc1.hamming(b8 + 32); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + 
HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + this->a8 = a8; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int compute(const uint8_t* b8) const { + int accu = 0; + + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + int i = 0, len = quotient8; + + int len256 = (quotient8 / 4) * 4; + for (; i < len256; i += 4) { + accu += ::faiss::hamming<256>(a64 + i, b64 + i); + len -= 4; + } + + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + default: + break; + } + } + + return accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. 
+ ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint8x8_t a0; + + GenHammingComputer8(const uint8_t* a8, int code_size) { + assert(code_size == 8); + a0 = vld1_u8(a8); + } + + inline int compute(const uint8_t* b8) const { + uint8x8_t b0 = vld1_u8(b8); + uint8x8_t reg = vceq_u8(a0, b0); + uint8x8_t c0 = vcnt_u8(reg); + return 8 - vaddv_u8(c0) / 8; + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct GenHammingComputer16 { + uint8x16_t a0; + + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a0 = vld1q_u8(a8); + } + + inline int compute(const uint8_t* b8) const { + uint8x16_t b0 = vld1q_u8(b8); + uint8x16_t reg = vceqq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(reg); + return 16 - vaddvq_u8(c0) / 8; + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + GenHammingComputer16 a0, a1; + + GenHammingComputer32(const uint8_t* a8, int code_size) + : a0(a8, 16), a1(a8 + 16, 16) { + assert(code_size == 32); + } + + inline int compute(const uint8_t* b8) const { + return a0.hamming(b8) + a1.hamming(b8 + 16); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int compute(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + + int n2 = (n / 2) * 2; + int i = 0; + for (; i < n2; i += 2) { + uint8x16_t a0 = vld1q_u8((const uint8_t*)(a + i)); + uint8x16_t b0 = vld1q_u8((const uint8_t*)(b + i)); + uint8x16_t reg = vceqq_u8(a0, b0); + uint8x16_t c0 = vcntq_u8(reg); + auto dis = 16 - vaddvq_u8(c0) / 8; + accu += dis; + } + + for (; i < n; i++) { + uint8x8_t a0 = vld1_u8((const uint8_t*)(a + i)); + uint8x8_t b0 = vld1_u8((const uint8_t*)(b + i)); + uint8x8_t reg = vceq_u8(a0, b0); + uint8x8_t c0 = vcnt_u8(reg); + auto dis = 8 - vaddv_u8(c0) / 8; + accu += dis; + } + + return accu; + } + + inline int get_code_size() { + return n * 8; + } +}; + +} // namespace faiss + +#endif + +#endif diff --git a/thirdparty/faiss/faiss/utils/jaccard-inl.h b/thirdparty/faiss/faiss/utils/jaccard-inl.h index f6e673580..f1d175d9b 100644 --- a/thirdparty/faiss/faiss/utils/jaccard-inl.h +++ b/thirdparty/faiss/faiss/utils/jaccard-inl.h @@ -17,8 +17,12 @@ #include +#include + namespace faiss { +// todo aguzhva: upgrade code + struct JaccardComputer8 { uint64_t a0; diff --git a/thirdparty/faiss/faiss/utils/ordered_key_value.h b/thirdparty/faiss/faiss/utils/ordered_key_value.h index ba3d0cbce..2f19f3a3f 100644 --- a/thirdparty/faiss/faiss/utils/ordered_key_value.h +++ b/thirdparty/faiss/faiss/utils/ordered_key_value.h @@ -46,6 +46,11 @@ struct CMin { inline static bool cmp(T a, T b) { return a < b; } + // Similar to cmp(), but also breaks ties + // by comparing the second pair of arguments. + inline static bool cmp2(T a1, T b1, TI a2, TI b2) { + return (a1 < b1) || ((a1 == b1) && (a2 < b2)); + } inline static T neutral() { return std::numeric_limits::lowest(); } @@ -64,6 +69,11 @@ struct CMax { inline static bool cmp(T a, T b) { return a > b; } + // Similar to cmp(), but also breaks ties + // by comparing the second pair of arguments. 
+ inline static bool cmp2(T a1, T b1, TI a2, TI b2) { + return (a1 > b1) || ((a1 == b1) && (a2 > b2)); + } inline static T neutral() { return std::numeric_limits::max(); } diff --git a/thirdparty/faiss/faiss/utils/partitioning.cpp b/thirdparty/faiss/faiss/utils/partitioning.cpp index 45e7dbce5..955bf2da9 100644 --- a/thirdparty/faiss/faiss/utils/partitioning.cpp +++ b/thirdparty/faiss/faiss/utils/partitioning.cpp @@ -821,7 +821,7 @@ template uint16_t partition_fuzzy>( * Histogram subroutines ******************************************************************/ -#ifdef __AVX2__ +#if defined(__AVX2__) || defined(__aarch64__) /// FIXME when MSB of uint16 is set // this code does not compile properly with GCC 7.4.0 @@ -837,7 +837,7 @@ simd32uint8 accu4to8(simd16uint16 a4) { simd16uint16 a8_0 = a4 & mask4; simd16uint16 a8_1 = (a4 >> 4) & mask4; - return simd32uint8(_mm256_hadd_epi16(a8_0.i, a8_1.i)); + return simd32uint8(hadd(a8_0, a8_1)); } simd16uint16 accu8to16(simd32uint8 a8) { @@ -846,10 +846,10 @@ simd16uint16 accu8to16(simd32uint8 a8) { simd16uint16 a8_0 = simd16uint16(a8) & mask8; simd16uint16 a8_1 = (simd16uint16(a8) >> 8) & mask8; - return simd16uint16(_mm256_hadd_epi16(a8_0.i, a8_1.i)); + return hadd(a8_0, a8_1); } -static const simd32uint8 shifts(_mm256_setr_epi8( +static const simd32uint8 shifts = simd32uint8::create< 1, 16, 0, @@ -881,12 +881,12 @@ static const simd32uint8 shifts(_mm256_setr_epi8( 0, 0, 4, - 64)); + 64>(); // 2-bit accumulator: we can add only up to 3 elements // on output we return 2*4-bit results // preproc returns either an index in 0..7 or 0xffff -// that yeilds a 0 when used in the table look-up +// that yields a 0 when used in the table look-up template void compute_accu2( const uint16_t*& data, @@ -941,7 +941,7 @@ simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) { simd16uint16 a16lo = accu8to16(a8lo); simd16uint16 a16hi = accu8to16(a8hi); - simd16uint16 a16 = simd16uint16(_mm256_hadd_epi16(a16lo.i, a16hi.i)); + simd16uint16 a16 = hadd(a16lo, a16hi); // the 2 lanes must still be combined return a16; @@ -951,7 +951,7 @@ simd16uint16 histogram_8(const uint16_t* data, Preproc pp, size_t n_in) { * 16 bins ************************************************************/ -static const simd32uint8 shifts2(_mm256_setr_epi8( +static const simd32uint8 shifts2 = simd32uint8::create< 1, 2, 4, @@ -959,7 +959,7 @@ static const simd32uint8 shifts2(_mm256_setr_epi8( 16, 32, 64, - (char)128, + 128, 1, 2, 4, @@ -967,7 +967,7 @@ static const simd32uint8 shifts2(_mm256_setr_epi8( 16, 32, 64, - (char)128, + 128, 1, 2, 4, @@ -975,7 +975,7 @@ static const simd32uint8 shifts2(_mm256_setr_epi8( 16, 32, 64, - (char)128, + 128, 1, 2, 4, @@ -983,19 +983,12 @@ static const simd32uint8 shifts2(_mm256_setr_epi8( 16, 32, 64, - (char)128)); + 128>(); simd32uint8 shiftr_16(simd32uint8 x, int n) { return simd32uint8(simd16uint16(x) >> n); } -inline simd32uint8 combine_2x2(simd32uint8 a, simd32uint8 b) { - __m256i a1b0 = _mm256_permute2f128_si256(a.i, b.i, 0x21); - __m256i a0b1 = _mm256_blend_epi32(a.i, b.i, 0xF0); - - return simd32uint8(a1b0) + simd32uint8(a0b1); -} - // 2-bit accumulator: we can add only up to 3 elements // on output we return 2*4-bit results template @@ -1022,7 +1015,7 @@ void compute_accu2_16( // contains 0s for out-of-bounds elements simd16uint16 lt8 = (v >> 3) == simd16uint16(0); - lt8.i = _mm256_xor_si256(lt8.i, _mm256_set1_epi16(0xff00)); + lt8 = lt8 ^ simd16uint16(0xff00); a1 = a1 & lt8; @@ -1040,11 +1033,15 @@ void compute_accu2_16( simd32uint8 
accu4to8_2(simd32uint8 a4_0, simd32uint8 a4_1) { simd32uint8 mask4(0x0f); - simd32uint8 a8_0 = combine_2x2(a4_0 & mask4, shiftr_16(a4_0, 4) & mask4); + simd16uint16 a8_0 = combine2x2( + (simd16uint16)(a4_0 & mask4), + (simd16uint16)(shiftr_16(a4_0, 4) & mask4)); - simd32uint8 a8_1 = combine_2x2(a4_1 & mask4, shiftr_16(a4_1, 4) & mask4); + simd16uint16 a8_1 = combine2x2( + (simd16uint16)(a4_1 & mask4), + (simd16uint16)(shiftr_16(a4_1, 4) & mask4)); - return simd32uint8(_mm256_hadd_epi16(a8_0.i, a8_1.i)); + return simd32uint8(hadd(a8_0, a8_1)); } template @@ -1083,10 +1080,9 @@ simd16uint16 histogram_16(const uint16_t* data, Preproc pp, size_t n_in) { simd16uint16 a16lo = accu8to16(a8lo); simd16uint16 a16hi = accu8to16(a8hi); - simd16uint16 a16 = simd16uint16(_mm256_hadd_epi16(a16lo.i, a16hi.i)); + simd16uint16 a16 = hadd(a16lo, a16hi); - __m256i perm32 = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); - a16.i = _mm256_permutevar8x32_epi32(a16.i, perm32); + a16 = simd16uint16{simd8uint32{a16}.unzip()}; return a16; } diff --git a/thirdparty/faiss/faiss/utils/prefetch.h b/thirdparty/faiss/faiss/utils/prefetch.h new file mode 100644 index 000000000..9549eb344 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/prefetch.h @@ -0,0 +1,77 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// prefetches + +#ifdef __AVX__ + +// AVX + +#include + +inline void prefetch_L1(const void* address) { + _mm_prefetch((const char*)address, _MM_HINT_T0); +} +inline void prefetch_L2(const void* address) { + _mm_prefetch((const char*)address, _MM_HINT_T1); +} +inline void prefetch_L3(const void* address) { + _mm_prefetch((const char*)address, _MM_HINT_T2); +} + +#elif defined(__aarch64__) + +// ARM64 + +#ifdef _MSC_VER + +// todo: arm on MSVC +inline void prefetch_L1(const void* address) {} +inline void prefetch_L2(const void* address) {} +inline void prefetch_L3(const void* address) {} + +#else +// arm on non-MSVC + +inline void prefetch_L1(const void* address) { + __builtin_prefetch(address, 0, 3); +} +inline void prefetch_L2(const void* address) { + __builtin_prefetch(address, 0, 2); +} +inline void prefetch_L3(const void* address) { + __builtin_prefetch(address, 0, 1); +} +#endif + +#else + +// a generic platform + +#ifdef _MSC_VER + +inline void prefetch_L1(const void* address) {} +inline void prefetch_L2(const void* address) {} +inline void prefetch_L3(const void* address) {} + +#else + +inline void prefetch_L1(const void* address) { + __builtin_prefetch(address, 0, 3); +} +inline void prefetch_L2(const void* address) { + __builtin_prefetch(address, 0, 2); +} +inline void prefetch_L3(const void* address) { + __builtin_prefetch(address, 0, 1); +} + +#endif + +#endif diff --git a/thirdparty/faiss/faiss/utils/quantize_lut.cpp b/thirdparty/faiss/faiss/utils/quantize_lut.cpp index 12ca579b6..642f601d7 100644 --- a/thirdparty/faiss/faiss/utils/quantize_lut.cpp +++ b/thirdparty/faiss/faiss/utils/quantize_lut.cpp @@ -284,6 +284,68 @@ void quantize_LUT_and_bias( *b_out = b; } +void aq_quantize_LUT_and_bias( + size_t nprobe, + size_t M, + size_t ksub, + const float* LUT, + const float* bias, + size_t M_norm, + int norm_scale, + uint8_t* LUTq, + size_t M2, + uint16_t* biasq, + float* a_out, + float* b_out) { + float a, b; + std::vector mins(M); + float max_span_LUT = -HUGE_VAL, max_span_dis; + float bias_min = tab_min(bias, nprobe); + float bias_max = 
tab_max(bias, nprobe); + max_span_dis = bias_max - bias_min; + b = 0; + for (int i = 0; i < M; i++) { + mins[i] = tab_min(LUT + i * ksub, ksub); + float span = tab_max(LUT + i * ksub, ksub) - mins[i]; + max_span_LUT = std::max(max_span_LUT, span); + max_span_dis += (i >= M - M_norm ? span * norm_scale : span); + b += mins[i]; + } + a = std::min(255 / max_span_LUT, 65535 / max_span_dis); + b += bias_min; + + for (int i = 0; i < M; i++) { + round_tab(LUT + i * ksub, ksub, a, mins[i], LUTq + i * ksub); + } + memset(LUTq + M * ksub, 0, ksub * (M2 - M)); + round_tab(bias, nprobe, a, bias_min, biasq); + + *a_out = a; + *b_out = b; +} + +float aq_estimate_norm_scale( + size_t M, + size_t ksub, + size_t M_norm, + const float* LUT) { + float max_span_LUT = -HUGE_VAL; + for (int i = 0; i < M - M_norm; i++) { + float min = tab_min(LUT + i * ksub, ksub); + float span = tab_max(LUT + i * ksub, ksub) - min; + max_span_LUT = std::max(max_span_LUT, span); + } + + float max_span_LUT_norm = -HUGE_VAL; + for (int i = M - M_norm; i < M; i++) { + float min = tab_min(LUT + i * ksub, ksub); + float span = tab_max(LUT + i * ksub, ksub) - min; + max_span_LUT_norm = std::max(max_span_LUT_norm, span); + } + + return max_span_LUT_norm / max_span_LUT; +} + } // namespace quantize_lut } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/quantize_lut.h b/thirdparty/faiss/faiss/utils/quantize_lut.h index 332ebdbfa..b7d4fc42f 100644 --- a/thirdparty/faiss/faiss/utils/quantize_lut.h +++ b/thirdparty/faiss/faiss/utils/quantize_lut.h @@ -77,6 +77,26 @@ void quantize_LUT_and_bias( float* a_out = nullptr, float* b_out = nullptr); +void aq_quantize_LUT_and_bias( + size_t nprobe, + size_t M, + size_t ksub, + const float* LUT, + const float* bias, + size_t M_norm, + int norm_scale, + uint8_t* LUTq, + size_t M2, + uint16_t* biasq, + float* a_out, + float* b_out); + +float aq_estimate_norm_scale( + size_t M, + size_t ksub, + size_t M_norm, + const float* LUT); + } // namespace quantize_lut } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/random.cpp b/thirdparty/faiss/faiss/utils/random.cpp index df3e117ef..9ab8d0adb 100644 --- a/thirdparty/faiss/faiss/utils/random.cpp +++ b/thirdparty/faiss/faiss/utils/random.cpp @@ -9,6 +9,23 @@ #include +extern "C" { +int sgemm_( + const char* transa, + const char* transb, + FINTEGER* m, + FINTEGER* n, + FINTEGER* k, + const float* alpha, + const float* a, + FINTEGER* lda, + const float* b, + FINTEGER* ldb, + float* beta, + float* c, + FINTEGER* ldc); +} + namespace faiss { /************************************************** @@ -165,4 +182,40 @@ void byte_rand(uint8_t* x, size_t n, int64_t seed) { } } +void rand_smooth_vectors(size_t n, size_t d, float* x, int64_t seed) { + size_t d1 = 10; + std::vector x1(n * d1); + float_randn(x1.data(), x1.size(), seed); + std::vector rot(d1 * d); + float_rand(rot.data(), rot.size(), seed + 1); + + { // + FINTEGER di = d, d1i = d1, ni = n; + float one = 1.0, zero = 0.0; + sgemm_("Not transposed", + "Not transposed", // natural order + &di, + &ni, + &d1i, + &one, + rot.data(), + &di, // rotation matrix + x1.data(), + &d1i, // second term + &zero, + x, + &di); + } + + std::vector scales(d); + float_rand(scales.data(), d, seed + 2); + +#pragma omp parallel for if (n * d > 10000) + for (int64_t i = 0; i < n; i++) { + for (size_t j = 0; j < d; j++) { + x[i * d + j] = sinf(x[i * d + j] * (scales[j] * 4 + 0.1)); + } + } +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/random.h b/thirdparty/faiss/faiss/utils/random.h index 
a3c9eae57..8b4286894 100644 --- a/thirdparty/faiss/faiss/utils/random.h +++ b/thirdparty/faiss/faiss/utils/random.h @@ -54,4 +54,9 @@ void int64_rand_max(int64_t* x, size_t n, uint64_t max, int64_t seed); /* random permutation */ void rand_perm(int* perm, size_t n, int64_t seed); +/* Random set of vectors with intrinsic dimensionality 10 that is harder to + * index than a subspace of dim 10 but easier than uniform data in dimension d + * */ +void rand_smooth_vectors(size_t n, size_t d, float* x, int64_t seed); + } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/simdlib.h b/thirdparty/faiss/faiss/utils/simdlib.h index 21133d6a4..27e9cc59f 100644 --- a/thirdparty/faiss/faiss/utils/simdlib.h +++ b/thirdparty/faiss/faiss/utils/simdlib.h @@ -19,6 +19,7 @@ #include #elif defined(__aarch64__) + #include #else diff --git a/thirdparty/faiss/faiss/utils/simdlib_avx2.h b/thirdparty/faiss/faiss/utils/simdlib_avx2.h index dc975a483..cd4cde1e1 100644 --- a/thirdparty/faiss/faiss/utils/simdlib_avx2.h +++ b/thirdparty/faiss/faiss/utils/simdlib_avx2.h @@ -70,6 +70,13 @@ struct simd256bit { bin(bits); return std::string(bits); } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd256bit other) const { + const __m256i pcmp = _mm256_cmpeq_epi32(i, other.i); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); + } }; /// vector of 16 elements in uint16 @@ -86,6 +93,41 @@ struct simd16uint16 : simd256bit { explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {} + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) + : simd256bit(_mm256_setr_epi16( + u0, + u1, + u2, + u3, + u4, + u5, + u6, + u7, + u8, + u9, + u10, + u11, + u12, + u13, + u14, + u15)) {} + std::string elements_to_string(const char* fmt) const { uint16_t bytes[16]; storeu((void*)bytes); @@ -111,6 +153,10 @@ struct simd16uint16 : simd256bit { i = _mm256_set1_epi16((short)x); } + simd16uint16 operator*(const simd16uint16& other) const { + return simd16uint16(_mm256_mullo_epi16(i, other.i)); + } + // shift must be known at compile time simd16uint16 operator>>(const int shift) const { return simd16uint16(_mm256_srli_epi16(i, shift)); @@ -147,9 +193,13 @@ struct simd16uint16 : simd256bit { return simd16uint16(_mm256_or_si256(i, other.i)); } + simd16uint16 operator^(simd256bit other) const { + return simd16uint16(_mm256_xor_si256(i, other.i)); + } + // returns binary masks - simd16uint16 operator==(simd256bit other) const { - return simd16uint16(_mm256_cmpeq_epi16(i, other.i)); + friend simd16uint16 operator==(const simd256bit lhs, const simd256bit rhs) { + return simd16uint16(_mm256_cmpeq_epi16(lhs.i, rhs.i)); } simd16uint16 operator~() const { @@ -251,6 +301,45 @@ inline uint32_t cmp_le32(simd16uint16 d0, simd16uint16 d1, simd16uint16 thr) { return ge; } +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + return simd16uint16(_mm256_hadd_epi16(a.i, b.i)); +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? 
candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +// +// Works in i16 mode in order to save instructions. One may +// switch from i16 to u16. +inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + // there's no lt instruction, so we'll need to emulate one + __m256i comparison = _mm256_cmpgt_epi16(currentValues.i, candidateValues.i); + comparison = _mm256_andnot_si256(comparison, _mm256_set1_epi16(-1)); + + minValues.i = _mm256_min_epi16(candidateValues.i, currentValues.i); + minIndices.i = _mm256_blendv_epi8( + candidateIndices.i, currentIndices.i, comparison); + maxValues.i = _mm256_max_epi16(candidateValues.i, currentValues.i); + maxIndices.i = _mm256_blendv_epi8( + currentIndices.i, candidateIndices.i, comparison); +} + // vector of 32 unsigned 8-bit integers struct simd32uint8 : simd256bit { simd32uint8() {} @@ -261,6 +350,75 @@ struct simd32uint8 : simd256bit { explicit simd32uint8(uint8_t x) : simd256bit(_mm256_set1_epi8(x)) {} + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + return simd32uint8(_mm256_setr_epi8( + (char)_0, + (char)_1, + (char)_2, + (char)_3, + (char)_4, + (char)_5, + (char)_6, + (char)_7, + (char)_8, + (char)_9, + (char)_10, + (char)_11, + (char)_12, + (char)_13, + (char)_14, + (char)_15, + (char)_16, + (char)_17, + (char)_18, + (char)_19, + (char)_20, + (char)_21, + (char)_22, + (char)_23, + (char)_24, + (char)_25, + (char)_26, + (char)_27, + (char)_28, + (char)_29, + (char)_30, + (char)_31)); + } + explicit simd32uint8(simd256bit x) : simd256bit(x) {} explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {} @@ -355,6 +513,40 @@ struct simd8uint32 : simd256bit { explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {} + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) + : simd256bit(_mm256_setr_epi32(u0, u1, u2, u3, u4, u5, u6, u7)) {} + + simd8uint32 operator+(simd8uint32 other) const { + return simd8uint32(_mm256_add_epi32(i, other.i)); + } + + simd8uint32 operator-(simd8uint32 other) const { + return simd8uint32(_mm256_sub_epi32(i, other.i)); + } + + simd8uint32& operator+=(const simd8uint32& other) { + i = _mm256_add_epi32(i, other.i); + return *this; + } + + bool operator==(simd8uint32 other) const { + const __m256i pcmp = _mm256_cmpeq_epi32(i, other.i); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } + std::string elements_to_string(const char* fmt) const { uint32_t bytes[8]; storeu((void*)bytes); @@ -379,8 +571,49 @@ struct simd8uint32 : simd256bit { 
void set1(uint32_t x) { i = _mm256_set1_epi32((int)x); } + + simd8uint32 unzip() const { + return simd8uint32(_mm256_permutevar8x32_epi32( + i, _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7))); + } }; +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + // there's no lt instruction, so we'll need to emulate one + __m256i comparison = _mm256_cmpgt_epi32(currentValues.i, candidateValues.i); + comparison = _mm256_andnot_si256(comparison, _mm256_set1_epi32(-1)); + + minValues.i = _mm256_min_epi32(candidateValues.i, currentValues.i); + minIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(currentIndices.i), + _mm256_castsi256_ps(comparison))); + maxValues.i = _mm256_max_epi32(candidateValues.i, currentValues.i); + maxIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(currentIndices.i), + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(comparison))); +} + struct simd8float32 : simd256bit { simd8float32() {} @@ -390,7 +623,18 @@ struct simd8float32 : simd256bit { explicit simd8float32(float x) : simd256bit(_mm256_set1_ps(x)) {} - explicit simd8float32(const float* x) : simd256bit(_mm256_load_ps(x)) {} + explicit simd8float32(const float* x) : simd256bit(_mm256_loadu_ps(x)) {} + + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) + : simd256bit(_mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7)) {} simd8float32 operator*(simd8float32 other) const { return simd8float32(_mm256_mul_ps(f, other.f)); @@ -404,6 +648,22 @@ struct simd8float32 : simd256bit { return simd8float32(_mm256_sub_ps(f, other.f)); } + simd8float32& operator+=(const simd8float32& other) { + f = _mm256_add_ps(f, other.f); + return *this; + } + + bool operator==(simd8float32 other) const { + const __m256i pcmp = + _mm256_castps_si256(_mm256_cmp_ps(f, other.f, _CMP_EQ_OQ)); + unsigned bitmask = _mm256_movemask_epi8(pcmp); + return (bitmask == 0xffffffffU); + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + std::string tostring() const { float tab[8]; storeu((void*)tab); @@ -435,6 +695,85 @@ inline simd8float32 fmadd(simd8float32 a, simd8float32 b, simd8float32 c) { return simd8float32(_mm256_fmadd_ps(a.f, b.f, c.f)); } +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 
8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. +// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + const __m256 comparison = + _mm256_cmp_ps(lowestValues.f, candidateValues.f, _CMP_LE_OS); + lowestValues.f = _mm256_min_ps(candidateValues.f, lowestValues.f); + lowestIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(lowestIndices.i), + comparison)); +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + const __m256 comparison = + _mm256_cmp_ps(currentValues.f, candidateValues.f, _CMP_LE_OS); + minValues.f = _mm256_min_ps(candidateValues.f, currentValues.f); + minIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(candidateIndices.i), + _mm256_castsi256_ps(currentIndices.i), + comparison)); + maxValues.f = _mm256_max_ps(candidateValues.f, currentValues.f); + maxIndices.i = _mm256_castps_si256(_mm256_blendv_ps( + _mm256_castsi256_ps(currentIndices.i), + _mm256_castsi256_ps(candidateIndices.i), + comparison)); +} + namespace { // get even float32's of a and b, interleaved @@ -725,4 +1064,6 @@ static int and_popcnt_AVX2_lookup( return result; } +#undef DECLARE_LOOKUP + } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/simdlib_emulated.h b/thirdparty/faiss/faiss/utils/simdlib_emulated.h index 7ab52a5b7..f9cfb3b34 100644 --- a/thirdparty/faiss/faiss/utils/simdlib_emulated.h +++ b/thirdparty/faiss/faiss/utils/simdlib_emulated.h @@ -57,6 +57,17 @@ struct simd256bit { bin(bits); return std::string(bits); } + + // Checks whether the other holds exactly the same bytes. 
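A sketch of how cmplt_and_blend_inplace() above can drive a running argmin over a float array, assuming n is a multiple of 8 and the usual faiss include path; the final lane reduction and all local names are illustrative, while the simd8float32/simd8uint32 calls are the ones defined in this header.

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <utility>

#include <faiss/utils/simdlib.h>

std::pair<float, uint32_t> argmin_simd8(const float* values, size_t n) {
    using namespace faiss;
    simd8float32 lowestValues(HUGE_VALF);
    simd8uint32 lowestIndices(0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u);
    simd8uint32 indices = lowestIndices;
    const simd8uint32 step(8u, 8u, 8u, 8u, 8u, 8u, 8u, 8u);

    for (size_t i = 0; i < n; i += 8) {
        simd8float32 candidates(values + i);
        // keeps, per lane, the smaller value and its index
        cmplt_and_blend_inplace(candidates, indices, lowestValues, lowestIndices);
        indices += step;
    }

    // reduce the 8 lanes with plain scalar code
    float v[8];
    uint32_t idx[8];
    lowestValues.storeu((void*)v);
    lowestIndices.storeu((void*)idx);
    float best = v[0];
    uint32_t best_i = idx[0];
    for (int j = 1; j < 8; j++) {
        if (v[j] < best) {
            best = v[j];
            best_i = idx[j];
        }
    }
    return {best, best_i};
}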
+ bool is_same_as(simd256bit other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } }; /// vector of 16 elements in uint16 @@ -75,6 +86,41 @@ struct simd16uint16 : simd256bit { explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {} + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) { + this->u16[0] = u0; + this->u16[1] = u1; + this->u16[2] = u2; + this->u16[3] = u3; + this->u16[4] = u4; + this->u16[5] = u5; + this->u16[6] = u6; + this->u16[7] = u7; + this->u16[8] = u8; + this->u16[9] = u9; + this->u16[10] = u10; + this->u16[11] = u11; + this->u16[12] = u12; + this->u16[13] = u13; + this->u16[14] = u14; + this->u16[15] = u15; + } + std::string elements_to_string(const char* fmt) const { char res[1000], *ptr = res; for (int i = 0; i < 16; i++) { @@ -120,6 +166,11 @@ struct simd16uint16 : simd256bit { } } + simd16uint16 operator*(const simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a * b; }); + } + // shift must be known at compile time simd16uint16 operator>>(const int shift) const { return unary_func(*this, [shift](uint16_t a) { return a >> shift; }); @@ -164,6 +215,13 @@ struct simd16uint16 : simd256bit { }); } + simd16uint16 operator^(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a ^ b; + }); + } + // returns binary masks simd16uint16 operator==(const simd16uint16& other) const { return binary_func(*this, other, [](uint16_t a, uint16_t b) { @@ -283,6 +341,62 @@ inline uint32_t cmp_le32( return gem; } +// hadd does not cross lanes +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + simd16uint16 c; + c.u16[0] = a.u16[0] + a.u16[1]; + c.u16[1] = a.u16[2] + a.u16[3]; + c.u16[2] = a.u16[4] + a.u16[5]; + c.u16[3] = a.u16[6] + a.u16[7]; + c.u16[4] = b.u16[0] + b.u16[1]; + c.u16[5] = b.u16[2] + b.u16[3]; + c.u16[6] = b.u16[4] + b.u16[5]; + c.u16[7] = b.u16[6] + b.u16[7]; + + c.u16[8] = a.u16[8] + a.u16[9]; + c.u16[9] = a.u16[10] + a.u16[11]; + c.u16[10] = a.u16[12] + a.u16[13]; + c.u16[11] = a.u16[14] + a.u16[15]; + c.u16[12] = b.u16[8] + b.u16[9]; + c.u16[13] = b.u16[10] + b.u16[11]; + c.u16[14] = b.u16[12] + b.u16[13]; + c.u16[15] = b.u16[14] + b.u16[15]; + + return c; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. 
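A small usage sketch of cmplt_min_max_fast(): each lane of (minValues, minIndices) receives the smaller of the candidate/current values together with its index, and (maxValues, maxIndices) the larger one, which is the kind of compare-exchange step that vectorized selection networks are built from. The local names and values are illustrative; the simd16uint16 constructor and the function signature are the ones in this header.

#include <faiss/utils/simdlib.h>

void compare_exchange_demo() {
    using namespace faiss;
    // candidate values/indices
    simd16uint16 va(3, 9, 1, 7, 5, 0, 2, 8, 4, 6, 11, 10, 13, 12, 15, 14);
    simd16uint16 ia(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // current values/indices
    simd16uint16 vb(8, 2, 6, 0, 9, 3, 7, 1, 5, 4, 10, 11, 12, 13, 14, 15);
    simd16uint16 ib(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

    simd16uint16 vmin, imin, vmax, imax;
    cmplt_min_max_fast(va, ia, vb, ib, vmin, imin, vmax, imax);
    // lane 0: min(3, 8) == 3 comes from the candidate side, so imin lane 0
    // holds 0 (from ia) and imax lane 0 holds 16 (from ib).
}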
+inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + for (size_t i = 0; i < 16; i++) { + bool flag = (candidateValues.u16[i] < currentValues.u16[i]); + minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i]; + minIndices.u16[i] = + flag ? candidateIndices.u16[i] : currentIndices.u16[i]; + maxValues.u16[i] = + !flag ? candidateValues.u16[i] : currentValues.u16[i]; + maxIndices.u16[i] = + !flag ? candidateIndices.u16[i] : currentIndices.u16[i]; + } +} + // vector of 32 unsigned 8-bit integers struct simd32uint8 : simd256bit { simd32uint8() {} @@ -294,6 +408,75 @@ struct simd32uint8 : simd256bit { explicit simd32uint8(uint8_t x) { set1(x); } + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + simd32uint8 ret; + ret.u8[0] = _0; + ret.u8[1] = _1; + ret.u8[2] = _2; + ret.u8[3] = _3; + ret.u8[4] = _4; + ret.u8[5] = _5; + ret.u8[6] = _6; + ret.u8[7] = _7; + ret.u8[8] = _8; + ret.u8[9] = _9; + ret.u8[10] = _10; + ret.u8[11] = _11; + ret.u8[12] = _12; + ret.u8[13] = _13; + ret.u8[14] = _14; + ret.u8[15] = _15; + ret.u8[16] = _16; + ret.u8[17] = _17; + ret.u8[18] = _18; + ret.u8[19] = _19; + ret.u8[20] = _20; + ret.u8[21] = _21; + ret.u8[22] = _22; + ret.u8[23] = _23; + ret.u8[24] = _24; + ret.u8[25] = _25; + ret.u8[26] = _26; + ret.u8[27] = _27; + ret.u8[28] = _28; + ret.u8[29] = _29; + ret.u8[30] = _30; + ret.u8[31] = _31; + return ret; + } explicit simd32uint8(const simd256bit& x) : simd256bit(x) {} @@ -433,7 +616,63 @@ struct simd8uint32 : simd256bit { explicit simd8uint32(const simd256bit& x) : simd256bit(x) {} - explicit simd8uint32(const uint8_t* x) : simd256bit((const void*)x) {} + explicit simd8uint32(const uint32_t* x) : simd256bit((const void*)x) {} + + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) { + u32[0] = u0; + u32[1] = u1; + u32[2] = u2; + u32[3] = u3; + u32[4] = u4; + u32[5] = u5; + u32[6] = u6; + u32[7] = u7; + } + + simd8uint32 operator+(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] + other.u32[i]; + } + return result; + } + + simd8uint32 operator-(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] - other.u32[i]; + } + return result; + } + + simd8uint32& operator+=(const simd8uint32& other) { + for (int i = 0; i < 8; i++) { + u32[i] += other.u32[i]; + } + return *this; + } + + bool operator==(simd8uint32 other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } std::string elements_to_string(const char* fmt) const { char res[1000], *ptr = res; @@ -458,8 +697,46 @@ struct simd8uint32 : simd256bit { u32[i] = x; } } + 
+ simd8uint32 unzip() const { + const uint32_t ret[] = { + u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]}; + return simd8uint32{ret}; + } }; +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.u32[i] < currentValues.u32[i]); + minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.u32[i] = + !flag ? candidateValues.u32[i] : currentValues.u32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + struct simd8float32 : simd256bit { simd8float32() {} @@ -479,6 +756,25 @@ struct simd8float32 : simd256bit { } } + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) { + f32[0] = f0; + f32[1] = f1; + f32[2] = f2; + f32[3] = f3; + f32[4] = f4; + f32[5] = f5; + f32[6] = f6; + f32[7] = f7; + } + template static simd8float32 binary_func( const simd8float32& a, @@ -506,6 +802,28 @@ struct simd8float32 : simd256bit { *this, other, [](float a, float b) { return a - b; }); } + simd8float32& operator+=(const simd8float32& other) { + for (size_t i = 0; i < 8; i++) { + f32[i] += other.f32[i]; + } + + return *this; + } + + bool operator==(simd8float32 other) const { + for (size_t i = 0; i < 8; i++) { + if (f32[i] != other.f32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + std::string tostring() const { char res[1000], *ptr = res; for (int i = 0; i < 8; i++) { @@ -645,6 +963,83 @@ simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) { return c; } +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. 
+// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + for (size_t j = 0; j < 8; j++) { + bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]); + if (comparison) { + lowestValues.f32[j] = candidateValues.f32[j]; + lowestIndices.u32[j] = candidateIndices.u32[j]; + } + } +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.f32[i] < currentValues.f32[i]); + minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.f32[i] = + !flag ? candidateValues.f32[i] : currentValues.f32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + } // namespace } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/simdlib_neon.h b/thirdparty/faiss/faiss/utils/simdlib_neon.h index 8822e3e70..656a56121 100644 --- a/thirdparty/faiss/faiss/utils/simdlib_neon.h +++ b/thirdparty/faiss/faiss/utils/simdlib_neon.h @@ -17,7 +17,9 @@ #include #include + #include + namespace faiss { namespace detail { @@ -88,6 +90,23 @@ static inline float32x4x2_t reinterpret_f32(const float32x4x2_t& v) { return v; } +// Surprisingly, vdupq_n_u16 has the type of +// uint16x8_t (std::uint32_t) , and vdupq_n_u8 also has +// uint8x16_t (std::uint32_t) on **some environments**. +// We want argument type as same as the type of element +// of result vector type (std::uint16_t for uint16x8_t, +// and std::uint8_t for uint8x16_t) instead of +// std::uint32_t due to using set1 function templates, +// so let's fix the argument type here and use these +// overload below. 
+static inline ::uint16x8_t vdupq_n_u16(std::uint16_t v) { + return ::vdupq_n_u16(v); +} + +static inline ::uint8x16_t vdupq_n_u8(std::uint8_t v) { + return ::vdupq_n_u8(v); +} + template < typename T, typename U = decltype(reinterpret_u8(std::declval().data))> @@ -119,11 +138,25 @@ static inline std::string bin(const S& simd) { return std::string(bits); } -template -static inline void set1(D& d, F&& f, T t) { - const auto v = f(t); - d.val[0] = v; - d.val[1] = v; +template +using remove_cv_ref_t = + typename std::remove_reference::type>::type; + +template +struct set1_impl { + D& d; + T t; + template ().val[0])> (*F)(T)> + inline void call() { + const auto v = F(t); + d.val[0] = v; + d.val[1] = v; + } +}; + +template +static inline set1_impl, T> set1(D& d, T t) { + return {d, t}; } template @@ -135,27 +168,64 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) { simd.store(bytes); char res[1000], *ptr = res; for (size_t i = 0; i < N; ++i) { - ptr += snprintf(ptr, 1000, fmt, bytes[i]); + ptr += sprintf(ptr, fmt, bytes[i]); } // strip last , ptr[-1] = 0; return std::string(res); } -template -static inline T unary_func(const T& a, F&& f) { - T t; - t.val[0] = f(a.val[0]); - t.val[1] = f(a.val[1]); - return t; +template +struct unary_func_impl { + const U& a; + using Telem = remove_cv_ref_t().val[0])>; + using Uelem = remove_cv_ref_t().val[0])>; + template + inline T call() { + T t; + t.val[0] = F(a.val[0]); + t.val[1] = F(a.val[1]); + return t; + } +}; + +template +static inline unary_func_impl, remove_cv_ref_t> unary_func( + const T& a) { + return {a}; } -template -static inline T binary_func(const T& a, const T& b, F&& f) { - T t; - t.val[0] = f(a.val[0], b.val[0]); - t.val[1] = f(a.val[1], b.val[1]); - return t; +template +static inline unary_func_impl, remove_cv_ref_t> unary_func( + const U& a) { + return {a}; +} + +template +struct binary_func_impl { + const U& a; + const U& b; + using Telem = remove_cv_ref_t().val[0])>; + using Uelem = remove_cv_ref_t().val[0])>; + template + inline T call() { + T t; + t.val[0] = F(a.val[0], b.val[0]); + t.val[1] = F(a.val[1], b.val[1]); + return t; + } +}; + +template +static inline binary_func_impl, remove_cv_ref_t> +binary_func(const T& a, const T& b) { + return {a, b}; +} + +template +static inline binary_func_impl, remove_cv_ref_t> +binary_func(const U& a, const U& b) { + return {a, b}; } static inline uint16_t vmovmask_u8(const uint8x16_t& v) { @@ -172,8 +242,8 @@ static inline uint32_t cmp_xe32( const uint16x8x2_t& d0, const uint16x8x2_t& d1, const uint16x8x2_t& thr) { - const auto d0_thr = detail::simdlib::binary_func(d0, thr, F); - const auto d1_thr = detail::simdlib::binary_func(d1, thr, F); + const auto d0_thr = detail::simdlib::binary_func(d0, thr).call(); + const auto d1_thr = detail::simdlib::binary_func(d1, thr).call(); const auto d0_mask = vmovmask_u8( vmovn_high_u16(vmovn_u16(d0_thr.val[0]), d0_thr.val[1])); const auto d1_mask = vmovmask_u8( @@ -207,6 +277,44 @@ struct simd16uint16 { explicit simd16uint16(const uint16x8x2_t& v) : data{v} {} + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) { + uint16_t temp[16] = { + u0, + u1, + u2, + u3, + u4, + u5, + u6, + u7, + u8, + u9, + u10, + u11, + u12, + u13, + u14, + u15}; + data.val[0] = vld1q_u16(temp); + data.val[1] = vld1q_u16(temp + 8); + } + 
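[Editorial aside, not part of the patch: the set1_impl / unary_func_impl / binary_func_impl helpers above turn the NEON intrinsic into a non-type template parameter of call<>(), replacing the previous run-time function-pointer argument, so the compiler can resolve and inline the intrinsic directly. A stripped-down, stand-alone analogue of the pattern (names are illustrative only):

    #include <cstdint>

    // The operation F is a template parameter of call(), not a run-time
    // argument, so each instantiation compiles to a direct call.
    template <typename T>
    struct binary_op_impl {
        T a, b;
        template <T (*F)(T, T)>
        T call() const {
            return F(a, b);
        }
    };

    inline std::uint32_t add_u32(std::uint32_t x, std::uint32_t y) {
        return x + y;
    }

    // usage: binary_op_impl<std::uint32_t>{2, 3}.call<&add_u32>() == 5
]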
template < typename T, typename std::enable_if< @@ -219,7 +327,8 @@ struct simd16uint16 { : data{vld1q_u16(x), vld1q_u16(x + 8)} {} void clear() { - detail::simdlib::set1(data, &vdupq_n_u16, static_cast(0)); + detail::simdlib::set1(data, static_cast(0)) + .call<&detail::simdlib::vdupq_n_u16>(); } void storeu(uint16_t* ptr) const { @@ -257,7 +366,12 @@ struct simd16uint16 { } void set1(uint16_t x) { - detail::simdlib::set1(data, &vdupq_n_u16, x); + detail::simdlib::set1(data, x).call<&detail::simdlib::vdupq_n_u16>(); + } + + simd16uint16 operator*(const simd16uint16& other) const { + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vmulq_u16>()}; } // shift must be known at compile time @@ -266,50 +380,56 @@ struct simd16uint16 { case 0: return *this; case 1: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<1>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 2: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<2>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 3: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<3>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 4: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<4>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 5: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<5>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 6: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<6>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 7: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<7>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 8: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<8>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 9: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<9>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 10: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<10>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 11: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<11>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 12: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<12>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 13: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<13>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 14: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<14>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 15: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshrq<15>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; default: FAISS_THROW_FMT("Invalid shift %d", shift); } @@ -321,50 +441,56 @@ struct simd16uint16 { case 0: return *this; case 1: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<1>)}; + return 
simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 2: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<2>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 3: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<3>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 4: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<4>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 5: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<5>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 6: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<6>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 7: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<7>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 8: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<8>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 9: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<9>)}; + return simd16uint16{detail::simdlib::unary_func(data) + .call>()}; case 10: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<10>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 11: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<11>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 12: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<12>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 13: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<13>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 14: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<14>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; case 15: - return simd16uint16{detail::simdlib::unary_func( - data, detail::simdlib::vshlq<15>)}; + return simd16uint16{ + detail::simdlib::unary_func(data) + .call>()}; default: FAISS_THROW_FMT("Invalid shift %d", shift); } @@ -381,13 +507,13 @@ struct simd16uint16 { } simd16uint16 operator+(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vaddq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_u16>()}; } simd16uint16 operator-(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vsubq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vsubq_u16>()}; } template < @@ -396,10 +522,10 @@ struct simd16uint16 { detail::simdlib::is_simd256bit::value, std::nullptr_t>::type = nullptr> simd16uint16 operator&(const T& other) const { - return simd16uint16{detail::simdlib::binary_func( - data, - detail::simdlib::reinterpret_u16(other.data), - &vandq_u16)}; + return simd16uint16{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u16(other.data)) + .template call<&vandq_u16>()}; } template < @@ -408,20 +534,43 @@ struct simd16uint16 { detail::simdlib::is_simd256bit::value, std::nullptr_t>::type = nullptr> 
simd16uint16 operator|(const T& other) const { - return simd16uint16{detail::simdlib::binary_func( - data, - detail::simdlib::reinterpret_u16(other.data), - &vorrq_u16)}; + return simd16uint16{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u16(other.data)) + .template call<&vorrq_u16>()}; + } + + template < + typename T, + typename std::enable_if< + detail::simdlib::is_simd256bit::value, + std::nullptr_t>::type = nullptr> + simd16uint16 operator^(const T& other) const { + return simd16uint16{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u16(other.data)) + .template call<&veorq_u16>()}; } // returns binary masks simd16uint16 operator==(const simd16uint16& other) const { - return simd16uint16{ - detail::simdlib::binary_func(data, other.data, &vceqq_u16)}; + return simd16uint16{detail::simdlib::binary_func(data, other.data) + .call<&vceqq_u16>()}; + } + + // Checks whether the other holds exactly the same bytes. + template + bool is_same_as(T other) const { + const auto o = detail::simdlib::reinterpret_u16(other.data); + const auto equals = detail::simdlib::binary_func(data, o) + .template call<&vceqq_u16>(); + const auto equal = vandq_u16(equals.val[0], equals.val[1]); + return vminvq_u16(equal) == 0xffffu; } simd16uint16 operator~() const { - return simd16uint16{detail::simdlib::unary_func(data, &vmvnq_u16)}; + return simd16uint16{ + detail::simdlib::unary_func(data).call<&vmvnq_u16>()}; } // get scalar at index 0 @@ -432,8 +581,8 @@ struct simd16uint16 { // mask of elements where this >= thresh // 2 bit per component: 16 * 2 = 32 bit uint32_t ge_mask(const simd16uint16& thresh) const { - const auto input = - detail::simdlib::binary_func(data, thresh.data, &vcgeq_u16); + const auto input = detail::simdlib::binary_func(data, thresh.data) + .call<&vcgeq_u16>(); const auto vmovmask_u16 = [](uint16x8_t v) -> uint16_t { uint16_t d[8]; const auto v2 = vreinterpretq_u32_u16(vshrq_n_u16(v, 14)); @@ -466,23 +615,25 @@ struct simd16uint16 { } void accu_min(const simd16uint16& incoming) { - data = detail::simdlib::binary_func(incoming.data, data, &vminq_u16); + data = detail::simdlib::binary_func(incoming.data, data) + .call<&vminq_u16>(); } void accu_max(const simd16uint16& incoming) { - data = detail::simdlib::binary_func(incoming.data, data, &vmaxq_u16); + data = detail::simdlib::binary_func(incoming.data, data) + .call<&vmaxq_u16>(); } }; // not really a std::min because it returns an elementwise min inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) { return simd16uint16{ - detail::simdlib::binary_func(av.data, bv.data, &vminq_u16)}; + detail::simdlib::binary_func(av.data, bv.data).call<&vminq_u16>()}; } inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) { return simd16uint16{ - detail::simdlib::binary_func(av.data, bv.data, &vmaxq_u16)}; + detail::simdlib::binary_func(av.data, bv.data).call<&vmaxq_u16>()}; } // decompose in 128-lanes: a = (a0, a1), b = (b0, b1) @@ -510,6 +661,60 @@ inline uint32_t cmp_le32( return detail::simdlib::cmp_xe32<&vcleq_u16>(d0.data, d1.data, thr.data); } +// hadd does not cross lanes +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + return simd16uint16{ + detail::simdlib::binary_func(a.data, b.data).call<&vpaddq_u16>()}; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? 
candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + const uint16x8x2_t comparison = + detail::simdlib::binary_func( + candidateValues.data, currentValues.data) + .call<&vcltq_u16>(); + + minValues = min(candidateValues, currentValues); + minIndices.data = uint16x8x2_t{ + vbslq_u16( + comparison.val[0], + candidateIndices.data.val[0], + currentIndices.data.val[0]), + vbslq_u16( + comparison.val[1], + candidateIndices.data.val[1], + currentIndices.data.val[1])}; + + maxValues = max(candidateValues, currentValues); + maxIndices.data = uint16x8x2_t{ + vbslq_u16( + comparison.val[0], + currentIndices.data.val[0], + candidateIndices.data.val[0]), + vbslq_u16( + comparison.val[1], + currentIndices.data.val[1], + candidateIndices.data.val[1])}; +} + // vector of 32 unsigned 8-bit integers struct simd32uint8 { uint8x16x2_t data; @@ -522,6 +727,47 @@ struct simd32uint8 { explicit simd32uint8(const uint8x16x2_t& v) : data{v} {} + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + constexpr uint8_t ds[32] = {_0, _1, _2, _3, _4, _5, _6, _7, + _8, _9, _10, _11, _12, _13, _14, _15, + _16, _17, _18, _19, _20, _21, _22, _23, + _24, _25, _26, _27, _28, _29, _30, _31}; + return simd32uint8{ds}; + } + template < typename T, typename std::enable_if< @@ -534,7 +780,8 @@ struct simd32uint8 { : data{vld1q_u8(x), vld1q_u8(x + 16)} {} void clear() { - detail::simdlib::set1(data, &vdupq_n_u8, static_cast(0)); + detail::simdlib::set1(data, static_cast(0)) + .call<&detail::simdlib::vdupq_n_u8>(); } void storeu(uint8_t* ptr) const { @@ -577,7 +824,7 @@ struct simd32uint8 { } void set1(uint8_t x) { - detail::simdlib::set1(data, &vdupq_n_u8, x); + detail::simdlib::set1(data, x).call<&detail::simdlib::vdupq_n_u8>(); } template < @@ -586,19 +833,21 @@ struct simd32uint8 { detail::simdlib::is_simd256bit::value, std::nullptr_t>::type = nullptr> simd32uint8 operator&(const T& other) const { - return simd32uint8{detail::simdlib::binary_func( - data, detail::simdlib::reinterpret_u8(other.data), &vandq_u8)}; + return simd32uint8{ + detail::simdlib::binary_func( + data, detail::simdlib::reinterpret_u8(other.data)) + .template call<&vandq_u8>()}; } simd32uint8 operator+(const simd32uint8& other) const { - return simd32uint8{ - detail::simdlib::binary_func(data, other.data, &vaddq_u8)}; + return simd32uint8{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_u8>()}; } // The very important operation that everything relies on simd32uint8 lookup_2_lanes(const simd32uint8& 
idx) const { - return simd32uint8{ - detail::simdlib::binary_func(data, idx.data, &vqtbl1q_u8)}; + return simd32uint8{detail::simdlib::binary_func(data, idx.data) + .call<&vqtbl1q_u8>()}; } simd32uint8 operator+=(const simd32uint8& other) { @@ -613,6 +862,16 @@ struct simd32uint8 { vst1q_u8(tab, data.val[high]); return tab[i - high * 16]; } + + // Checks whether the other holds exactly the same bytes. + template + bool is_same_as(T other) const { + const auto o = detail::simdlib::reinterpret_u8(other.data); + const auto equals = detail::simdlib::binary_func(data, o) + .template call<&vceqq_u8>(); + const auto equal = vandq_u8(equals.val[0], equals.val[1]); + return vminvq_u8(equal) == 0xffu; + } }; // convert with saturation @@ -641,8 +900,8 @@ inline simd32uint8 blendv( const uint8x16x2_t msb_mask = { vtstq_u8(mask.data.val[0], msb), vtstq_u8(mask.data.val[1], msb)}; const uint8x16x2_t selected = { - vbslq_u8(msb_mask.val[0], a.data.val[0], b.data.val[0]), - vbslq_u8(msb_mask.val[1], a.data.val[1], b.data.val[1])}; + vbslq_u8(msb_mask.val[0], b.data.val[0], a.data.val[0]), + vbslq_u8(msb_mask.val[1], b.data.val[1], a.data.val[1])}; return simd32uint8{selected}; } @@ -666,8 +925,63 @@ struct simd8uint32 { explicit simd8uint32(const uint8_t* x) : simd8uint32(simd32uint8(x)) {} + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) { + uint32_t temp[8] = {u0, u1, u2, u3, u4, u5, u6, u7}; + data.val[0] = vld1q_u32(temp); + data.val[1] = vld1q_u32(temp + 4); + } + + simd8uint32 operator+(simd8uint32 other) const { + return simd8uint32{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_u32>()}; + } + + simd8uint32 operator-(simd8uint32 other) const { + return simd8uint32{detail::simdlib::binary_func(data, other.data) + .call<&vsubq_u32>()}; + } + + simd8uint32& operator+=(const simd8uint32& other) { + data.val[0] = vaddq_u32(data.val[0], other.data.val[0]); + data.val[1] = vaddq_u32(data.val[1], other.data.val[1]); + return *this; + } + + simd8uint32 operator==(simd8uint32 other) const { + return simd8uint32{detail::simdlib::binary_func(data, other.data) + .call<&vceqq_u32>()}; + } + + simd8uint32 operator~() const { + return simd8uint32{ + detail::simdlib::unary_func(data).call<&vmvnq_u32>()}; + } + + simd8uint32 operator!=(simd8uint32 other) const { + return ~(*this == other); + } + + // Checks whether the other holds exactly the same bytes. + template + bool is_same_as(T other) const { + const auto o = detail::simdlib::reinterpret_u32(other.data); + const auto equals = detail::simdlib::binary_func(data, o) + .template call<&vceqq_u32>(); + const auto equal = vandq_u32(equals.val[0], equals.val[1]); + return vminvq_u32(equal) == 0xffffffffu; + } + void clear() { - detail::simdlib::set1(data, &vdupq_n_u32, static_cast(0)); + detail::simdlib::set1(data, static_cast(0)) + .call<&vdupq_n_u32>(); } void storeu(uint32_t* ptr) const { @@ -705,10 +1019,68 @@ struct simd8uint32 { } void set1(uint32_t x) { - detail::simdlib::set1(data, &vdupq_n_u32, x); + detail::simdlib::set1(data, x).call<&vdupq_n_u32>(); + } + + simd8uint32 unzip() const { + return simd8uint32{uint32x4x2_t{ + vuzp1q_u32(data.val[0], data.val[1]), + vuzp2q_u32(data.val[0], data.val[1])}}; } }; +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? 
candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + const uint32x4x2_t comparison = + detail::simdlib::binary_func( + candidateValues.data, currentValues.data) + .call<&vcltq_u32>(); + + minValues.data = detail::simdlib::binary_func( + candidateValues.data, currentValues.data) + .call<&vminq_u32>(); + minIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + candidateIndices.data.val[0], + currentIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + candidateIndices.data.val[1], + currentIndices.data.val[1])}; + + maxValues.data = detail::simdlib::binary_func( + candidateValues.data, currentValues.data) + .call<&vmaxq_u32>(); + maxIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + currentIndices.data.val[0], + candidateIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + currentIndices.data.val[1], + candidateIndices.data.val[1])}; +} + struct simd8float32 { float32x4x2_t data; @@ -729,8 +1101,22 @@ struct simd8float32 { explicit simd8float32(const float* x) : data{vld1q_f32(x), vld1q_f32(x + 4)} {} + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) { + float temp[8] = {f0, f1, f2, f3, f4, f5, f6, f7}; + data.val[0] = vld1q_f32(temp); + data.val[1] = vld1q_f32(temp + 4); + } + void clear() { - detail::simdlib::set1(data, &vdupq_n_f32, 0.f); + detail::simdlib::set1(data, 0.f).call<&vdupq_n_f32>(); } void storeu(float* ptr) const { @@ -756,18 +1142,47 @@ struct simd8float32 { } simd8float32 operator*(const simd8float32& other) const { - return simd8float32{ - detail::simdlib::binary_func(data, other.data, &vmulq_f32)}; + return simd8float32{detail::simdlib::binary_func(data, other.data) + .call<&vmulq_f32>()}; } simd8float32 operator+(const simd8float32& other) const { - return simd8float32{ - detail::simdlib::binary_func(data, other.data, &vaddq_f32)}; + return simd8float32{detail::simdlib::binary_func(data, other.data) + .call<&vaddq_f32>()}; } simd8float32 operator-(const simd8float32& other) const { - return simd8float32{ - detail::simdlib::binary_func(data, other.data, &vsubq_f32)}; + return simd8float32{detail::simdlib::binary_func(data, other.data) + .call<&vsubq_f32>()}; + } + + simd8float32& operator+=(const simd8float32& other) { + // In this context, it is more compiler friendly to write intrinsics + // directly instead of using binary_func + data.val[0] = vaddq_f32(data.val[0], other.data.val[0]); + data.val[1] = vaddq_f32(data.val[1], other.data.val[1]); + return *this; + } + + simd8uint32 operator==(simd8float32 other) const { + return simd8uint32{ + detail::simdlib::binary_func<::uint32x4x2_t>(data, other.data) + .call<&vceqq_f32>()}; + } + + simd8uint32 operator!=(simd8float32 other) const { + return ~(*this == other); + } + + // Checks whether the other holds exactly the same bytes. 
+ template + bool is_same_as(T other) const { + const auto o = detail::simdlib::reinterpret_f32(other.data); + const auto equals = + detail::simdlib::binary_func<::uint32x4x2_t>(data, o) + .template call<&vceqq_f32>(); + const auto equal = vandq_u32(equals.val[0], equals.val[1]); + return vminvq_u32(equal) == 0xffffffffu; } std::string tostring() const { @@ -778,17 +1193,17 @@ struct simd8float32 { // hadd does not cross lanes inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) { return simd8float32{ - detail::simdlib::binary_func(a.data, b.data, &vpaddq_f32)}; + detail::simdlib::binary_func(a.data, b.data).call<&vpaddq_f32>()}; } inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) { return simd8float32{ - detail::simdlib::binary_func(a.data, b.data, &vzip1q_f32)}; + detail::simdlib::binary_func(a.data, b.data).call<&vzip1q_f32>()}; } inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) { return simd8float32{ - detail::simdlib::binary_func(a.data, b.data, &vzip2q_f32)}; + detail::simdlib::binary_func(a.data, b.data).call<&vzip2q_f32>()}; } // compute a * b + c @@ -801,20 +1216,130 @@ inline simd8float32 fmadd( vfmaq_f32(c.data.val[1], a.data.val[1], b.data.val[1])}}; } +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. +// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + const auto comparison = detail::simdlib::binary_func<::uint32x4x2_t>( + candidateValues.data, lowestValues.data) + .call<&vcltq_f32>(); + + lowestValues.data = float32x4x2_t{ + vbslq_f32( + comparison.val[0], + candidateValues.data.val[0], + lowestValues.data.val[0]), + vbslq_f32( + comparison.val[1], + candidateValues.data.val[1], + lowestValues.data.val[1])}; + lowestIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + candidateIndices.data.val[0], + lowestIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + candidateIndices.data.val[1], + lowestIndices.data.val[1])}; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? 
candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + const uint32x4x2_t comparison = + detail::simdlib::binary_func<::uint32x4x2_t>( + candidateValues.data, currentValues.data) + .call<&vcltq_f32>(); + + minValues.data = detail::simdlib::binary_func( + candidateValues.data, currentValues.data) + .call<&vminq_f32>(); + minIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + candidateIndices.data.val[0], + currentIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + candidateIndices.data.val[1], + currentIndices.data.val[1])}; + + maxValues.data = detail::simdlib::binary_func( + candidateValues.data, currentValues.data) + .call<&vmaxq_f32>(); + maxIndices.data = uint32x4x2_t{ + vbslq_u32( + comparison.val[0], + currentIndices.data.val[0], + candidateIndices.data.val[0]), + vbslq_u32( + comparison.val[1], + currentIndices.data.val[1], + candidateIndices.data.val[1])}; +} + namespace { // get even float32's of a and b, interleaved simd8float32 geteven(const simd8float32& a, const simd8float32& b) { - return simd8float32{float32x4x2_t{ - vuzp1q_f32(a.data.val[0], b.data.val[0]), - vuzp1q_f32(a.data.val[1], b.data.val[1])}}; + return simd8float32{ + detail::simdlib::binary_func(a.data, b.data).call<&vuzp1q_f32>()}; } // get odd float32's of a and b, interleaved simd8float32 getodd(const simd8float32& a, const simd8float32& b) { - return simd8float32{float32x4x2_t{ - vuzp2q_f32(a.data.val[0], b.data.val[0]), - vuzp2q_f32(a.data.val[1], b.data.val[1])}}; + return simd8float32{ + detail::simdlib::binary_func(a.data, b.data).call<&vuzp2q_f32>()}; } // 3 cycles diff --git a/thirdparty/faiss/faiss/utils/sorting.cpp b/thirdparty/faiss/faiss/utils/sorting.cpp new file mode 100644 index 000000000..67dd51bf7 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/sorting.cpp @@ -0,0 +1,832 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// -*- c++ -*- + +#include + +#include +#include + +#include +#include + +namespace faiss { + +/***************************************************************************** + * Argsort + ****************************************************************************/ + +namespace { +struct ArgsortComparator { + const float* vals; + bool operator()(const size_t a, const size_t b) const { + return vals[a] < vals[b]; + } +}; + +struct SegmentS { + size_t i0; // begin pointer in the permutation array + size_t i1; // end + size_t len() const { + return i1 - i0; + } +}; + +// see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge +// extended to > 1 merge thread + +// merges 2 ranges that should be consecutive on the source into +// the union of the two on the destination +template +void parallel_merge( + const T* src, + T* dst, + SegmentS& s1, + SegmentS& s2, + int nt, + const ArgsortComparator& comp) { + if (s2.len() > s1.len()) { // make sure that s1 larger than s2 + std::swap(s1, s2); + } + + // compute sub-ranges for each thread + std::vector s1s(nt), s2s(nt), sws(nt); + s2s[0].i0 = s2.i0; + s2s[nt - 1].i1 = s2.i1; + + // not sure parallel actually helps here +#pragma omp parallel for num_threads(nt) + for (int t = 0; t < nt; t++) { + s1s[t].i0 = s1.i0 + s1.len() * t / nt; + s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; + + if (t + 1 < nt) { + T pivot = src[s1s[t].i1]; + size_t i0 = s2.i0, i1 = s2.i1; + while (i0 + 1 < i1) { + size_t imed = (i1 + i0) / 2; + if (comp(pivot, src[imed])) { + i1 = imed; + } else { + i0 = imed; + } + } + s2s[t].i1 = s2s[t + 1].i0 = i1; + } + } + s1.i0 = std::min(s1.i0, s2.i0); + s1.i1 = std::max(s1.i1, s2.i1); + s2 = s1; + sws[0].i0 = s1.i0; + for (int t = 0; t < nt; t++) { + sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len(); + if (t + 1 < nt) { + sws[t + 1].i0 = sws[t].i1; + } + } + assert(sws[nt - 1].i1 == s1.i1); + + // do the actual merging +#pragma omp parallel for num_threads(nt) + for (int t = 0; t < nt; t++) { + SegmentS sw = sws[t]; + SegmentS s1t = s1s[t]; + SegmentS s2t = s2s[t]; + if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) { + for (;;) { + // assert (sw.len() == s1t.len() + s2t.len()); + if (comp(src[s1t.i0], src[s2t.i0])) { + dst[sw.i0++] = src[s1t.i0++]; + if (s1t.i0 == s1t.i1) { + break; + } + } else { + dst[sw.i0++] = src[s2t.i0++]; + if (s2t.i0 == s2t.i1) { + break; + } + } + } + } + if (s1t.len() > 0) { + assert(s1t.len() == sw.len()); + memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0])); + } else if (s2t.len() > 0) { + assert(s2t.len() == sw.len()); + memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0])); + } + } +} + +}; // namespace + +void fvec_argsort(size_t n, const float* vals, size_t* perm) { + for (size_t i = 0; i < n; i++) { + perm[i] = i; + } + ArgsortComparator comp = {vals}; + std::sort(perm, perm + n, comp); +} + +void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { + size_t* perm2 = new size_t[n]; + // 2 result tables, during merging, flip between them + size_t *permB = perm2, *permA = perm; + + int nt = omp_get_max_threads(); + { // prepare correct permutation so that the result ends in perm + // at final iteration + int nseg = nt; + while (nseg > 1) { + nseg = (nseg + 1) / 2; + std::swap(permA, permB); + } + } + +#pragma omp parallel + for (size_t i = 0; i < n; i++) { + permA[i] = i; + } + + ArgsortComparator comp = {vals}; + + std::vector segs(nt); + + // independent sorts +#pragma omp parallel for + for (int t = 0; t < nt; t++) { + size_t i0 = t * n / nt; + size_t i1 = (t + 
1) * n / nt; + SegmentS seg = {i0, i1}; + std::sort(permA + seg.i0, permA + seg.i1, comp); + segs[t] = seg; + } + int prev_nested = omp_get_nested(); + omp_set_nested(1); + + int nseg = nt; + while (nseg > 1) { + int nseg1 = (nseg + 1) / 2; + int sub_nt = nseg % 2 == 0 ? nt : nt - 1; + int sub_nseg1 = nseg / 2; + +#pragma omp parallel for num_threads(nseg1) + for (int s = 0; s < nseg; s += 2) { + if (s + 1 == nseg) { // otherwise isolated segment + memcpy(permB + segs[s].i0, + permA + segs[s].i0, + segs[s].len() * sizeof(size_t)); + } else { + int t0 = s * sub_nt / sub_nseg1; + int t1 = (s + 1) * sub_nt / sub_nseg1; + printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0); + parallel_merge( + permA, permB, segs[s], segs[s + 1], t1 - t0, comp); + } + } + for (int s = 0; s < nseg; s += 2) { + segs[s / 2] = segs[s]; + } + nseg = nseg1; + std::swap(permA, permB); + } + assert(permA == perm); + omp_set_nested(prev_nested); + delete[] perm2; +} + +/***************************************************************************** + * Bucket sort + ****************************************************************************/ + +// extern symbol in the .h +int bucket_sort_verbose = 0; + +namespace { + +void bucket_sort_ref( + size_t nval, + const uint64_t* vals, + uint64_t vmax, + int64_t* lims, + int64_t* perm) { + double t0 = getmillisecs(); + memset(lims, 0, sizeof(*lims) * (vmax + 1)); + for (size_t i = 0; i < nval; i++) { + FAISS_THROW_IF_NOT(vals[i] < vmax); + lims[vals[i] + 1]++; + } + double t1 = getmillisecs(); + // compute cumulative sum + for (size_t i = 0; i < vmax; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[vmax] == nval); + double t2 = getmillisecs(); + // populate buckets + for (size_t i = 0; i < nval; i++) { + perm[lims[vals[i]]++] = i; + } + double t3 = getmillisecs(); + // reset pointers + for (size_t i = vmax; i > 0; i--) { + lims[i] = lims[i - 1]; + } + lims[0] = 0; + double t4 = getmillisecs(); + if (bucket_sort_verbose) { + printf("times %.3f %.3f %.3f %.3f\n", + t1 - t0, + t2 - t1, + t3 - t2, + t4 - t3); + } +} + +void bucket_sort_parallel( + size_t nval, + const uint64_t* vals, + uint64_t vmax, + int64_t* lims, + int64_t* perm, + int nt_in) { + memset(lims, 0, sizeof(*lims) * (vmax + 1)); +#pragma omp parallel num_threads(nt_in) + { + int nt = omp_get_num_threads(); // might be different from nt_in + int rank = omp_get_thread_num(); + std::vector local_lims(vmax + 1); + + // range of indices handled by this thread + size_t i0 = nval * rank / nt; + size_t i1 = nval * (rank + 1) / nt; + + // build histogram in local lims + double t0 = getmillisecs(); + for (size_t i = i0; i < i1; i++) { + local_lims[vals[i]]++; + } +#pragma omp critical + { // accumulate histograms (not shifted indices to prepare cumsum) + for (size_t i = 0; i < vmax; i++) { + lims[i + 1] += local_lims[i]; + } + } +#pragma omp barrier + + double t1 = getmillisecs(); +#pragma omp master + { + // compute cumulative sum + for (size_t i = 0; i < vmax; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[vmax] == nval); + } +#pragma omp barrier + +#pragma omp critical + { // current thread grabs a slot in the buckets + for (size_t i = 0; i < vmax; i++) { + size_t nv = local_lims[i]; + local_lims[i] = lims[i]; // where we should start writing + lims[i] += nv; + } + } + + double t2 = getmillisecs(); +#pragma omp barrier + { // populate buckets, this is the slowest operation + for (size_t i = i0; i < i1; i++) { + perm[local_lims[vals[i]]++] = i; + } + } +#pragma omp barrier + double t3 = 
getmillisecs(); + +#pragma omp master + { // shift back lims + for (size_t i = vmax; i > 0; i--) { + lims[i] = lims[i - 1]; + } + lims[0] = 0; + double t4 = getmillisecs(); + if (bucket_sort_verbose) { + printf("times %.3f %.3f %.3f %.3f\n", + t1 - t0, + t2 - t1, + t3 - t2, + t4 - t3); + } + } + } +} + +/*********************************************** + * in-place bucket sort + */ + +template +void bucket_sort_inplace_ref( + size_t nrow, + size_t ncol, + TI* vals, + TI nbucket, + int64_t* lims) { + double t0 = getmillisecs(); + size_t nval = nrow * ncol; + FAISS_THROW_IF_NOT( + nbucket < nval); // unclear what would happen in this case... + + memset(lims, 0, sizeof(*lims) * (nbucket + 1)); + for (size_t i = 0; i < nval; i++) { + FAISS_THROW_IF_NOT(vals[i] < nbucket); + lims[vals[i] + 1]++; + } + double t1 = getmillisecs(); + // compute cumulative sum + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[nbucket] == nval); + double t2 = getmillisecs(); + + std::vector ptrs(nbucket); + for (size_t i = 0; i < nbucket; i++) { + ptrs[i] = lims[i]; + } + + // find loops in the permutation and follow them + TI row = -1; + TI init_bucket_no = 0, bucket_no = 0; + for (;;) { + size_t idx = ptrs[bucket_no]; + if (row >= 0) { + ptrs[bucket_no] += 1; + } + assert(idx < lims[bucket_no + 1]); + TI next_bucket_no = vals[idx]; + vals[idx] = row; + if (next_bucket_no != -1) { + row = idx / ncol; + bucket_no = next_bucket_no; + } else { + // start new loop + for (; init_bucket_no < nbucket; init_bucket_no++) { + if (ptrs[init_bucket_no] < lims[init_bucket_no + 1]) { + break; + } + } + if (init_bucket_no == nbucket) { // we're done + break; + } + bucket_no = init_bucket_no; + row = -1; + } + } + + for (size_t i = 0; i < nbucket; i++) { + assert(ptrs[i] == lims[i + 1]); + } + double t3 = getmillisecs(); + if (bucket_sort_verbose) { + printf("times %.3f %.3f %.3f\n", t1 - t0, t2 - t1, t3 - t2); + } +} + +// collects row numbers to write into buckets +template +struct ToWrite { + TI nbucket; + std::vector buckets; + std::vector rows; + std::vector lims; + + explicit ToWrite(TI nbucket) : nbucket(nbucket) { + lims.resize(nbucket + 1); + } + + /// add one element (row) to write in bucket b + void add(TI row, TI b) { + assert(b >= 0 && b < nbucket); + rows.push_back(row); + buckets.push_back(b); + } + + void bucket_sort() { + FAISS_THROW_IF_NOT(buckets.size() == rows.size()); + lims.resize(nbucket + 1); + memset(lims.data(), 0, sizeof(lims[0]) * (nbucket + 1)); + + for (size_t i = 0; i < buckets.size(); i++) { + assert(buckets[i] >= 0 && buckets[i] < nbucket); + lims[buckets[i] + 1]++; + } + // compute cumulative sum + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[nbucket] == buckets.size()); + + // could also do a circular perm... 
+ std::vector new_rows(rows.size()); + std::vector ptrs = lims; + for (size_t i = 0; i < buckets.size(); i++) { + TI b = buckets[i]; + assert(ptrs[b] < lims[b + 1]); + new_rows[ptrs[b]++] = rows[i]; + } + buckets.resize(0); + std::swap(rows, new_rows); + } + + void swap(ToWrite& other) { + assert(nbucket == other.nbucket); + buckets.swap(other.buckets); + rows.swap(other.rows); + lims.swap(other.lims); + } +}; + +template +void bucket_sort_inplace_parallel( + size_t nrow, + size_t ncol, + TI* vals, + TI nbucket, + int64_t* lims, + int nt_in) { + int verbose = bucket_sort_verbose; + memset(lims, 0, sizeof(*lims) * (nbucket + 1)); + std::vector> all_to_write; + size_t nval = nrow * ncol; + FAISS_THROW_IF_NOT( + nbucket < nval); // unclear what would happen in this case... + + // try to keep size of all_to_write < 5GiB + // but we need at least one element per bucket + size_t init_to_write = std::max( + size_t(nbucket), + std::min(nval / 10, ((size_t)5 << 30) / (sizeof(TI) * 3 * nt_in))); + if (verbose > 0) { + printf("init_to_write=%zd\n", init_to_write); + } + + std::vector ptrs(nbucket); // ptrs is shared across all threads + std::vector did_wrap( + nbucket); // DON'T use std::vector that cannot be accessed + // safely from multiple threads!!! + +#pragma omp parallel num_threads(nt_in) + { + int nt = omp_get_num_threads(); // might be different from nt_in (?) + int rank = omp_get_thread_num(); + std::vector local_lims(nbucket + 1); + + // range of indices handled by this thread + size_t i0 = nval * rank / nt; + size_t i1 = nval * (rank + 1) / nt; + + // build histogram in local lims + for (size_t i = i0; i < i1; i++) { + local_lims[vals[i]]++; + } +#pragma omp critical + { // accumulate histograms (not shifted indices to prepare cumsum) + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += local_lims[i]; + } + all_to_write.push_back(ToWrite(nbucket)); + } + +#pragma omp barrier + // this thread's things to write + ToWrite& to_write = all_to_write[rank]; + +#pragma omp master + { + // compute cumulative sum + for (size_t i = 0; i < nbucket; i++) { + lims[i + 1] += lims[i]; + } + FAISS_THROW_IF_NOT(lims[nbucket] == nval); + // at this point lims is final (read only!) + + memcpy(ptrs.data(), lims, sizeof(lims[0]) * nbucket); + + // initial values to write (we write -1s to get the process running) + // make sure at least one element per bucket + size_t written = 0; + for (TI b = 0; b < nbucket; b++) { + size_t l0 = lims[b], l1 = lims[b + 1]; + size_t target_to_write = l1 * init_to_write / nval; + do { + if (l0 == l1) { + break; + } + to_write.add(-1, b); + l0++; + written++; + } while (written < target_to_write); + } + + to_write.bucket_sort(); + } + + // this thread writes only buckets b0:b1 + size_t b0 = (rank * nbucket + nt - 1) / nt; + size_t b1 = ((rank + 1) * nbucket + nt - 1) / nt; + + // in this loop, we write elements collected in the previous round + // and collect the elements that are overwritten for the next round + size_t tot_written = 0; + int round = 0; + for (;;) { +#pragma omp barrier + + size_t n_to_write = 0; + for (const ToWrite& to_write_2 : all_to_write) { + n_to_write += to_write_2.lims.back(); + } + + tot_written += n_to_write; + // assert(tot_written <= nval); + +#pragma omp master + { + if (verbose >= 1) { + printf("ROUND %d n_to_write=%zd\n", round, n_to_write); + } + if (verbose > 2) { + for (size_t b = 0; b < nbucket; b++) { + printf(" b=%zd [", b); + for (size_t i = lims[b]; i < lims[b + 1]; i++) { + printf(" %s%d", + ptrs[b] == i ? 
">" : "", + int(vals[i])); + } + printf(" %s] %s\n", + ptrs[b] == lims[b + 1] ? ">" : "", + did_wrap[b] ? "w" : ""); + } + printf("To write\n"); + for (size_t b = 0; b < nbucket; b++) { + printf(" b=%zd ", b); + const char* sep = "["; + for (const ToWrite& to_write_2 : all_to_write) { + printf("%s", sep); + sep = " |"; + size_t l0 = to_write_2.lims[b]; + size_t l1 = to_write_2.lims[b + 1]; + for (size_t i = l0; i < l1; i++) { + printf(" %d", int(to_write_2.rows[i])); + } + } + printf(" ]\n"); + } + } + } + if (n_to_write == 0) { + break; + } + round++; + +#pragma omp barrier + + ToWrite next_to_write(nbucket); + + for (size_t b = b0; b < b1; b++) { + for (const ToWrite& to_write_2 : all_to_write) { + size_t l0 = to_write_2.lims[b]; + size_t l1 = to_write_2.lims[b + 1]; + for (size_t i = l0; i < l1; i++) { + TI row = to_write_2.rows[i]; + size_t idx = ptrs[b]; + if (verbose > 2) { + printf(" bucket %d (rank %d) idx %zd\n", + int(row), + rank, + idx); + } + if (idx < lims[b + 1]) { + ptrs[b]++; + } else { + // wrapping around + assert(!did_wrap[b]); + did_wrap[b] = true; + idx = lims[b]; + ptrs[b] = idx + 1; + } + + // check if we need to remember the overwritten number + if (vals[idx] >= 0) { + TI new_row = idx / ncol; + next_to_write.add(new_row, vals[idx]); + if (verbose > 2) { + printf(" new_row=%d\n", int(new_row)); + } + } else { + assert(did_wrap[b]); + } + + vals[idx] = row; + } + } + } + next_to_write.bucket_sort(); +#pragma omp barrier + all_to_write[rank].swap(next_to_write); + } + } +} + +} // anonymous namespace + +void bucket_sort( + size_t nval, + const uint64_t* vals, + uint64_t vmax, + int64_t* lims, + int64_t* perm, + int nt) { + if (nt == 0) { + bucket_sort_ref(nval, vals, vmax, lims, perm); + } else { + bucket_sort_parallel(nval, vals, vmax, lims, perm, nt); + } +} + +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int32_t* vals, + int32_t vmax, + int64_t* lims, + int nt) { + if (nt == 0) { + bucket_sort_inplace_ref(nrow, ncol, vals, vmax, lims); + } else { + bucket_sort_inplace_parallel(nrow, ncol, vals, vmax, lims, nt); + } +} + +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int64_t* vals, + int64_t vmax, + int64_t* lims, + int nt) { + if (nt == 0) { + bucket_sort_inplace_ref(nrow, ncol, vals, vmax, lims); + } else { + bucket_sort_inplace_parallel(nrow, ncol, vals, vmax, lims, nt); + } +} + +/** Hashtable implementation for int64 -> int64 with external storage + * implemented for speed and parallel processing. + */ + +namespace { + +int log2_capacity_to_log2_nbucket(int log2_capacity) { + return log2_capacity < 12 ? 0 + : log2_capacity < 20 ? 
log2_capacity - 12 + : 10; +} + +// https://bigprimes.org/ +int64_t bigprime = 8955327411143; + +inline int64_t hash_function(int64_t x) { + return (x * 1000003) % bigprime; +} + +} // anonymous namespace + +void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab) { + size_t capacity = (size_t)1 << log2_capacity; +#pragma omp parallel for + for (int64_t i = 0; i < capacity; i++) { + tab[2 * i] = -1; + tab[2 * i + 1] = -1; + } +} + +void hashtable_int64_to_int64_add( + int log2_capacity, + int64_t* tab, + size_t n, + const int64_t* keys, + const int64_t* vals) { + size_t capacity = (size_t)1 << log2_capacity; + std::vector hk(n); + std::vector bucket_no(n); + int64_t mask = capacity - 1; + int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity); + size_t nbucket = (size_t)1 << log2_nbucket; + +#pragma omp parallel for + for (int64_t i = 0; i < n; i++) { + hk[i] = hash_function(keys[i]) & mask; + bucket_no[i] = hk[i] >> (log2_capacity - log2_nbucket); + } + + std::vector lims(nbucket + 1); + std::vector perm(n); + bucket_sort( + n, + bucket_no.data(), + nbucket, + lims.data(), + perm.data(), + omp_get_max_threads()); + + int num_errors = 0; +#pragma omp parallel for reduction(+ : num_errors) + for (int64_t bucket = 0; bucket < nbucket; bucket++) { + size_t k0 = bucket << (log2_capacity - log2_nbucket); + size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket); + + for (size_t i = lims[bucket]; i < lims[bucket + 1]; i++) { + int64_t j = perm[i]; + assert(bucket_no[j] == bucket); + assert(hk[j] >= k0 && hk[j] < k1); + size_t slot = hk[j]; + for (;;) { + if (tab[slot * 2] == -1) { // found! + tab[slot * 2] = keys[j]; + tab[slot * 2 + 1] = vals[j]; + break; + } else if (tab[slot * 2] == keys[j]) { // overwrite! + tab[slot * 2 + 1] = vals[j]; + break; + } + slot++; + if (slot == k1) { + slot = k0; + } + if (slot == hk[j]) { // no free slot left in bucket + num_errors++; + break; + } + } + if (num_errors > 0) { + break; + } + } + } + FAISS_THROW_IF_NOT_MSG(num_errors == 0, "hashtable capacity exhausted"); +} + +void hashtable_int64_to_int64_lookup( + int log2_capacity, + const int64_t* tab, + size_t n, + const int64_t* keys, + int64_t* vals) { + size_t capacity = (size_t)1 << log2_capacity; + std::vector hk(n), bucket_no(n); + int64_t mask = capacity - 1; + int log2_nbucket = log2_capacity_to_log2_nbucket(log2_capacity); + size_t nbucket = (size_t)1 << log2_nbucket; + +#pragma omp parallel for + for (int64_t i = 0; i < n; i++) { + int64_t k = keys[i]; + int64_t hk = hash_function(k) & mask; + size_t slot = hk; + + if (tab[2 * slot] == -1) { // not in table + vals[i] = -1; + } else if (tab[2 * slot] == k) { // found! + vals[i] = tab[2 * slot + 1]; + } else { // need to search in [k0, k1) + size_t bucket = hk >> (log2_capacity - log2_nbucket); + size_t k0 = bucket << (log2_capacity - log2_nbucket); + size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket); + for (;;) { + if (tab[slot * 2] == k) { // found! + vals[i] = tab[2 * slot + 1]; + break; + } + slot++; + if (slot == k1) { + slot = k0; + } + if (slot == hk) { // bucket is full and not found + vals[i] = -1; + break; + } + } + } + } +} + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/sorting.h b/thirdparty/faiss/faiss/utils/sorting.h new file mode 100644 index 000000000..50dadad47 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/sorting.h @@ -0,0 +1,101 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +namespace faiss { + +/** Indirect sort of a floating-point array + * + * @param n size of the array + * @param vals array to sort, size n + * @param perm output: permutation of [0..n-1], st. + * vals[perm[i + 1]] >= vals[perm[i]] + */ +void fvec_argsort(size_t n, const float* vals, size_t* perm); + +/** Same as fvec_argsort, parallelized */ +void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm); + +/// increase verbosity of the bucket_sort functions +FAISS_API extern int bucket_sort_verbose; + +/** Bucket sort of a list of values + * + * @param vals values to sort, size nval, max value nbucket - 1 + * @param lims output limits of buckets, size nbucket + 1 + * @param perm output buckets, the elements of bucket + * i are in perm[lims[i]:lims[i + 1]] + * @param nt number of threads (0 = pure sequential code) + */ +void bucket_sort( + size_t nval, + const uint64_t* vals, + uint64_t nbucket, + int64_t* lims, + int64_t* perm, + int nt = 0); + +/** in-place bucket sort (with attention to memory=>int32) + * on input the values are in a nrow * col matrix + * we want to store the row numbers in the output. + * + * @param vals positive values to sort, size nrow * ncol, + * max value nbucket - 1 + * @param lims output limits of buckets, size nbucket + 1 + * @param nt number of threads (0 = pure sequential code) + */ +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int32_t* vals, + int32_t nbucket, + int64_t* lims, + int nt = 0); + +/// same with int64 elements +void matrix_bucket_sort_inplace( + size_t nrow, + size_t ncol, + int64_t* vals, + int64_t nbucket, + int64_t* lims, + int nt = 0); + +/** Hashtable implementation for int64 -> int64 with external storage + * implemented for fast batch add and lookup. + * + * tab is of size 2 * (1 << log2_capacity) + * n is the number of elements to add or search + * + * adding several values in a same batch: an arbitrary one gets added + * in different batches: the newer batch overwrites. + * raises an exception if capacity is exhausted. + */ + +void hashtable_int64_to_int64_init(int log2_capacity, int64_t* tab); + +void hashtable_int64_to_int64_add( + int log2_capacity, + int64_t* tab, + size_t n, + const int64_t* keys, + const int64_t* vals); + +void hashtable_int64_to_int64_lookup( + int log2_capacity, + const int64_t* tab, + size_t n, + const int64_t* keys, + int64_t* vals); + +} // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/transpose/transpose-avx2-inl.h b/thirdparty/faiss/faiss/utils/transpose/transpose-avx2-inl.h new file mode 100644 index 000000000..4b6798429 --- /dev/null +++ b/thirdparty/faiss/faiss/utils/transpose/transpose-avx2-inl.h @@ -0,0 +1,165 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +// This file contains transposing kernels for AVX2 for +// tiny float/int32 matrices, such as 8x2. 
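[Editorial aside, not part of the patch: a usage sketch for the 8x2 kernel defined just below. The wrapper name transpose_8x2_example is hypothetical; it assumes this header is included and AVX2 is enabled.

    #include <immintrin.h>

    // Transpose an 8x2 row-major float matrix (in[16]) into 2x8 (out[16]).
    inline void transpose_8x2_example(const float* in, float* out) {
        const __m256 i0 = _mm256_loadu_ps(in);      // 00 01 10 11 20 21 30 31
        const __m256 i1 = _mm256_loadu_ps(in + 8);  // 40 41 50 51 60 61 70 71
        __m256 o0, o1;
        faiss::transpose_8x2(i0, i1, o0, o1);
        _mm256_storeu_ps(out, o0);      // 00 10 20 30 40 50 60 70 (column 0)
        _mm256_storeu_ps(out + 8, o1);  // 01 11 21 31 41 51 61 71 (column 1)
    }
]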
+ +#ifdef __AVX2__ + +#include + +namespace faiss { + +// 8x2 -> 2x8 +inline void transpose_8x2( + const __m256 i0, + const __m256 i1, + __m256& o0, + __m256& o1) { + // say, we have the following as in input: + // i0: 00 01 10 11 20 21 30 31 + // i1: 40 41 50 51 60 61 70 71 + + // 00 01 10 11 40 41 50 51 + const __m256 r0 = _mm256_permute2f128_ps(i0, i1, _MM_SHUFFLE(0, 2, 0, 0)); + // 20 21 30 31 60 61 70 71 + const __m256 r1 = _mm256_permute2f128_ps(i0, i1, _MM_SHUFFLE(0, 3, 0, 1)); + + // 00 10 20 30 40 50 60 70 + o0 = _mm256_shuffle_ps(r0, r1, _MM_SHUFFLE(2, 0, 2, 0)); + // 01 11 21 31 41 51 61 71 + o1 = _mm256_shuffle_ps(r0, r1, _MM_SHUFFLE(3, 1, 3, 1)); +} + +// 8x4 -> 4x8 +inline void transpose_8x4( + const __m256 i0, + const __m256 i1, + const __m256 i2, + const __m256 i3, + __m256& o0, + __m256& o1, + __m256& o2, + __m256& o3) { + // say, we have the following as an input: + // i0: 00 01 02 03 10 11 12 13 + // i1: 20 21 22 23 30 31 32 33 + // i2: 40 41 42 43 50 51 52 53 + // i3: 60 61 62 63 70 71 72 73 + + // 00 01 02 03 40 41 42 43 + const __m256 r0 = _mm256_permute2f128_ps(i0, i2, _MM_SHUFFLE(0, 2, 0, 0)); + // 20 21 22 23 60 61 62 63 + const __m256 r1 = _mm256_permute2f128_ps(i1, i3, _MM_SHUFFLE(0, 2, 0, 0)); + // 10 11 12 13 50 51 52 53 + const __m256 r2 = _mm256_permute2f128_ps(i0, i2, _MM_SHUFFLE(0, 3, 0, 1)); + // 30 31 32 33 70 71 72 73 + const __m256 r3 = _mm256_permute2f128_ps(i1, i3, _MM_SHUFFLE(0, 3, 0, 1)); + + // 00 02 10 12 40 42 50 52 + const __m256 t0 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(2, 0, 2, 0)); + // 01 03 11 13 41 43 51 53 + const __m256 t1 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 1, 3, 1)); + // 20 22 30 32 60 62 70 72 + const __m256 t2 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(2, 0, 2, 0)); + // 21 23 31 33 61 63 71 73 + const __m256 t3 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 1, 3, 1)); + + // 00 10 20 30 40 50 60 70 + o0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(2, 0, 2, 0)); + // 01 11 21 31 41 51 61 71 + o1 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(2, 0, 2, 0)); + // 02 12 22 32 42 52 62 72 + o2 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 1, 3, 1)); + // 03 13 23 33 43 53 63 73 + o3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 1, 3, 1)); +} + +inline void transpose_8x8( + const __m256 i0, + const __m256 i1, + const __m256 i2, + const __m256 i3, + const __m256 i4, + const __m256 i5, + const __m256 i6, + const __m256 i7, + __m256& o0, + __m256& o1, + __m256& o2, + __m256& o3, + __m256& o4, + __m256& o5, + __m256& o6, + __m256& o7) { + // say, we have the following as an input: + // i0: 00 01 02 03 04 05 06 07 + // i1: 10 11 12 13 14 15 16 17 + // i2: 20 21 22 23 24 25 26 27 + // i3: 30 31 32 33 34 35 36 37 + // i4: 40 41 42 43 44 45 46 47 + // i5: 50 51 52 53 54 55 56 57 + // i6: 60 61 62 63 64 65 66 67 + // i7: 70 71 72 73 74 75 76 77 + + // 00 10 01 11 04 14 05 15 + const __m256 r0 = _mm256_unpacklo_ps(i0, i1); + // 02 12 03 13 06 16 07 17 + const __m256 r1 = _mm256_unpackhi_ps(i0, i1); + // 20 30 21 31 24 34 25 35 + const __m256 r2 = _mm256_unpacklo_ps(i2, i3); + // 22 32 23 33 26 36 27 37 + const __m256 r3 = _mm256_unpackhi_ps(i2, i3); + // 40 50 41 51 44 54 45 55 + const __m256 r4 = _mm256_unpacklo_ps(i4, i5); + // 42 52 43 53 46 56 47 57 + const __m256 r5 = _mm256_unpackhi_ps(i4, i5); + // 60 70 61 71 64 74 65 75 + const __m256 r6 = _mm256_unpacklo_ps(i6, i7); + // 62 72 63 73 66 76 67 77 + const __m256 r7 = _mm256_unpackhi_ps(i6, i7); + + // 00 10 20 30 04 14 24 34 + const __m256 rr0 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); + // 01 11 
21 31 05 15 25 35 + const __m256 rr1 = _mm256_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); + // 02 12 22 32 06 16 26 36 + const __m256 rr2 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); + // 03 13 23 33 07 17 27 37 + const __m256 rr3 = _mm256_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); + // 40 50 60 70 44 54 64 74 + const __m256 rr4 = _mm256_shuffle_ps(r4, r6, _MM_SHUFFLE(1, 0, 1, 0)); + // 41 51 61 71 45 55 65 75 + const __m256 rr5 = _mm256_shuffle_ps(r4, r6, _MM_SHUFFLE(3, 2, 3, 2)); + // 42 52 62 72 46 56 66 76 + const __m256 rr6 = _mm256_shuffle_ps(r5, r7, _MM_SHUFFLE(1, 0, 1, 0)); + // 43 53 63 73 47 57 67 77 + const __m256 rr7 = _mm256_shuffle_ps(r5, r7, _MM_SHUFFLE(3, 2, 3, 2)); + + // 00 10 20 30 40 50 60 70 + o0 = _mm256_permute2f128_ps(rr0, rr4, 0x20); + // 01 11 21 31 41 51 61 71 + o1 = _mm256_permute2f128_ps(rr1, rr5, 0x20); + // 02 12 22 32 42 52 62 72 + o2 = _mm256_permute2f128_ps(rr2, rr6, 0x20); + // 03 13 23 33 43 53 63 73 + o3 = _mm256_permute2f128_ps(rr3, rr7, 0x20); + // 04 14 24 34 44 54 64 74 + o4 = _mm256_permute2f128_ps(rr0, rr4, 0x31); + // 05 15 25 35 45 55 65 75 + o5 = _mm256_permute2f128_ps(rr1, rr5, 0x31); + // 06 16 26 36 46 56 66 76 + o6 = _mm256_permute2f128_ps(rr2, rr6, 0x31); + // 07 17 27 37 47 57 67 77 + o7 = _mm256_permute2f128_ps(rr3, rr7, 0x31); +} + +} // namespace faiss + +#endif diff --git a/thirdparty/faiss/faiss/utils/utils.cpp b/thirdparty/faiss/faiss/utils/utils.cpp index 9dda61aae..3fdc7cc84 100644 --- a/thirdparty/faiss/faiss/utils/utils.cpp +++ b/thirdparty/faiss/faiss/utils/utils.cpp @@ -28,13 +28,16 @@ #include #include +#include +#include +#include #include #include #include #include #include -#include + #ifndef FINTEGER #define FINTEGER long #endif @@ -101,6 +104,9 @@ int sgemv_( namespace faiss { +// this will be set at load time from GPU Faiss +std::string gpu_compile_options; + std::string get_compile_options() { std::string options; @@ -110,13 +116,15 @@ std::string get_compile_options() { #endif #ifdef __AVX2__ - options += "AVX2"; + options += "AVX2 "; #elif defined(__aarch64__) - options += "NEON"; + options += "NEON "; #else - options += "GENERIC"; + options += "GENERIC "; #endif + options += gpu_compile_options; + return options; } @@ -423,185 +431,33 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) { } } -size_t ivec_checksum(size_t n, const int* a) { - size_t cs = 112909; - while (n--) +uint64_t ivec_checksum(size_t n, const int32_t* assigned) { + const uint32_t* a = reinterpret_cast(assigned); + uint64_t cs = 112909; + while (n--) { cs = cs * 65713 + a[n] * 1686049; + } return cs; } -namespace { -struct ArgsortComparator { - const float* vals; - bool operator()(const size_t a, const size_t b) const { - return vals[a] < vals[b]; - } -}; - -struct SegmentS { - size_t i0; // begin pointer in the permutation array - size_t i1; // end - size_t len() const { - return i1 - i0; - } -}; - -// see https://en.wikipedia.org/wiki/Merge_algorithm#Parallel_merge -// extended to > 1 merge thread - -// merges 2 ranges that should be consecutive on the source into -// the union of the two on the destination -template -void parallel_merge( - const T* src, - T* dst, - SegmentS& s1, - SegmentS& s2, - int nt, - const ArgsortComparator& comp) { - if (s2.len() > s1.len()) { // make sure that s1 larger than s2 - std::swap(s1, s2); - } - - // compute sub-ranges for each thread - std::vector s1s(nt), s2s(nt), sws(nt); - s2s[0].i0 = s2.i0; - s2s[nt - 1].i1 = s2.i1; - - // not sure parallel actually helps here 
-#pragma omp parallel for num_threads(nt) - for (int t = 0; t < nt; t++) { - s1s[t].i0 = s1.i0 + s1.len() * t / nt; - s1s[t].i1 = s1.i0 + s1.len() * (t + 1) / nt; - - if (t + 1 < nt) { - T pivot = src[s1s[t].i1]; - size_t i0 = s2.i0, i1 = s2.i1; - while (i0 + 1 < i1) { - size_t imed = (i1 + i0) / 2; - if (comp(pivot, src[imed])) { - i1 = imed; - } else { - i0 = imed; - } - } - s2s[t].i1 = s2s[t + 1].i0 = i1; - } - } - s1.i0 = std::min(s1.i0, s2.i0); - s1.i1 = std::max(s1.i1, s2.i1); - s2 = s1; - sws[0].i0 = s1.i0; - for (int t = 0; t < nt; t++) { - sws[t].i1 = sws[t].i0 + s1s[t].len() + s2s[t].len(); - if (t + 1 < nt) { - sws[t + 1].i0 = sws[t].i1; - } - } - assert(sws[nt - 1].i1 == s1.i1); - - // do the actual merging -#pragma omp parallel for num_threads(nt) - for (int t = 0; t < nt; t++) { - SegmentS sw = sws[t]; - SegmentS s1t = s1s[t]; - SegmentS s2t = s2s[t]; - if (s1t.i0 < s1t.i1 && s2t.i0 < s2t.i1) { - for (;;) { - // assert (sw.len() == s1t.len() + s2t.len()); - if (comp(src[s1t.i0], src[s2t.i0])) { - dst[sw.i0++] = src[s1t.i0++]; - if (s1t.i0 == s1t.i1) - break; - } else { - dst[sw.i0++] = src[s2t.i0++]; - if (s2t.i0 == s2t.i1) - break; - } - } - } - if (s1t.len() > 0) { - assert(s1t.len() == sw.len()); - memcpy(dst + sw.i0, src + s1t.i0, s1t.len() * sizeof(dst[0])); - } else if (s2t.len() > 0) { - assert(s2t.len() == sw.len()); - memcpy(dst + sw.i0, src + s2t.i0, s2t.len() * sizeof(dst[0])); - } +uint64_t bvec_checksum(size_t n, const uint8_t* a) { + uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a); + // the tail loop must index the remaining bytes via i, not n (a[n] would read past the end) + for (size_t i = n / 4 * 4; i < n; i++) { + cs = cs * 65713 + a[i] * 1686049; + } + return cs; } -}; // namespace - -void fvec_argsort(size_t n, const float* vals, size_t* perm) { - for (size_t i = 0; i < n; i++) - perm[i] = i; - ArgsortComparator comp = {vals}; - std::sort(perm, perm + n, comp); -} - -void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) { - size_t* perm2 = new size_t[n]; - // 2 result tables, during merging, flip between them - size_t *permB = perm2, *permA = perm; - - int nt = omp_get_max_threads(); - { // prepare correct permutation so that the result ends in perm - // at final iteration - int nseg = nt; - while (nseg > 1) { - nseg = (nseg + 1) / 2; - std::swap(permA, permB); - } +void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) { + // MSVC can't accept an unsigned index for #pragma omp parallel for, + // so the code below only accepts n <= std::numeric_limits::max() + using ssize_t = std::make_signed::type; + const ssize_t size = n; +#pragma omp parallel for if (size > 1000) + for (ssize_t i_ = 0; i_ < size; i_++) { + const auto i = static_cast(i_); + cs[i] = bvec_checksum(d, a + i * d); } - -#pragma omp parallel - for (size_t i = 0; i < n; i++) - permA[i] = i; - - ArgsortComparator comp = {vals}; - - std::vector segs(nt); - - // independent sorts -#pragma omp parallel for - for (int t = 0; t < nt; t++) { - size_t i0 = t * n / nt; - size_t i1 = (t + 1) * n / nt; - SegmentS seg = {i0, i1}; - std::sort(permA + seg.i0, permA + seg.i1, comp); - segs[t] = seg; - } - int prev_nested = omp_get_nested(); - omp_set_nested(1); - - int nseg = nt; - while (nseg > 1) { - int nseg1 = (nseg + 1) / 2; - int sub_nt = nseg % 2 == 0 ? 
nt : nt - 1; - int sub_nseg1 = nseg / 2; - -#pragma omp parallel for num_threads(nseg1) - for (int s = 0; s < nseg; s += 2) { - if (s + 1 == nseg) { // otherwise isolated segment - memcpy(permB + segs[s].i0, - permA + segs[s].i0, - segs[s].len() * sizeof(size_t)); - } else { - int t0 = s * sub_nt / sub_nseg1; - int t1 = (s + 1) * sub_nt / sub_nseg1; - printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0); - parallel_merge( - permA, permB, segs[s], segs[s + 1], t1 - t0, comp); - } - } - for (int s = 0; s < nseg; s += 2) - segs[s / 2] = segs[s]; - nseg = nseg1; - std::swap(permA, permB); - } - assert(permA == perm); - omp_set_nested(prev_nested); - delete[] perm2; } const float* fvecs_maybe_subsample( @@ -723,4 +579,81 @@ int64_t get_l3_size() { return l3_size; } +namespace { + +template +int64_t count_lt(int64_t n, const T* row, T threshold) { + for (int64_t i = 0; i < n; i++) { + if (!(row[i] < threshold)) { + return i; + } + } + return n; +} + +template +int64_t count_gt(int64_t n, const T* row, T threshold) { + for (int64_t i = 0; i < n; i++) { + if (!(row[i] > threshold)) { + return i; + } + } + return n; +} + +} // namespace + +template +void CombinerRangeKNN::compute_sizes(int64_t* L_res) { + this->L_res = L_res; + L_res[0] = 0; + int64_t j = 0; + for (int64_t i = 0; i < nq; i++) { + int64_t n_in; + if (!mask || !mask[i]) { + const T* row = D + i * k; + n_in = keep_max ? count_gt(k, row, r2) : count_lt(k, row, r2); + } else { + n_in = lim_remain[j + 1] - lim_remain[j]; + j++; + } + L_res[i + 1] = n_in; // L_res[i] + n_in; + } + // cumsum + for (int64_t i = 0; i < nq; i++) { + L_res[i + 1] += L_res[i]; + } +} + +template +void CombinerRangeKNN::write_result(T* D_res, int64_t* I_res) { + FAISS_THROW_IF_NOT(L_res); + int64_t j = 0; + for (int64_t i = 0; i < nq; i++) { + int64_t n_in = L_res[i + 1] - L_res[i]; + T* D_row = D_res + L_res[i]; + int64_t* I_row = I_res + L_res[i]; + if (!mask || !mask[i]) { + memcpy(D_row, D + i * k, n_in * sizeof(*D_row)); + memcpy(I_row, I + i * k, n_in * sizeof(*I_row)); + } else { + memcpy(D_row, D_remain + lim_remain[j], n_in * sizeof(*D_row)); + memcpy(I_row, I_remain + lim_remain[j], n_in * sizeof(*I_row)); + j++; + } + } +} + +// explicit template instantiations +template struct CombinerRangeKNN; +template struct CombinerRangeKNN; + +void CodeSet::insert(size_t n, const uint8_t* codes, bool* inserted) { + for (size_t i = 0; i < n; i++) { + auto res = s.insert( + std::vector(codes + i * d, codes + i * d + d)); + inserted[i] = res.second; + } +} + } // namespace faiss diff --git a/thirdparty/faiss/faiss/utils/utils.h b/thirdparty/faiss/faiss/utils/utils.h index 4df28f36e..4742102f6 100644 --- a/thirdparty/faiss/faiss/utils/utils.h +++ b/thirdparty/faiss/faiss/utils/utils.h @@ -17,12 +17,11 @@ #define FAISS_utils_h #include +#include #include +#include -#ifdef _MSC_VER -#define strtok_r strtok_s -#endif // _MSC_VER - +#include #include namespace faiss { @@ -57,17 +56,19 @@ uint64_t get_cycles(); * @param b size n * @param c restult table, size n */ -//void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c); +// // Provided by Knowhere. +// void fvec_madd(size_t n, const float* a, float bf, const float* b, float* c); /** same as fvec_madd, also return index of the min of the result table * @return index of the min of table c */ -//int fvec_madd_and_argmin( -// size_t n, -// const float* a, -// float bf, -// const float* b, -// float* c); +// // Provided by Knowhere. 
+// int fvec_madd_and_argmin( +// size_t n, +// const float* a, +// float bf, +// const float* b, +// float* c); /* perform a reflection (not an efficient implementation, just for test ) */ void reflection(const float* u, float* x, size_t n, size_t d, size_t nu); @@ -113,10 +114,6 @@ double imbalance_factor(int n, int k, const int64_t* assign); /// same, takes a histogram as input double imbalance_factor(int k, const int* hist); -void fvec_argsort(size_t n, const float* vals, size_t* perm); - -void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm); - /// compute histogram on v int ivec_hist(size_t n, const int* v, int vmax, int* hist); @@ -128,7 +125,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist); void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist); /// compute a checksum on a table. -size_t ivec_checksum(size_t n, const int* a); +uint64_t ivec_checksum(size_t n, const int32_t* a); + +/// compute a checksum on a table. +uint64_t bvec_checksum(size_t n, const uint8_t* a); + +/** compute checksums for the rows of a matrix + * + * @param n number of rows + * @param d size per row + * @param a matrix to handle, size n * d + * @param cs output checksums, size n + */ +void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs); /** random subsamples a set of vectors if there are too many of them * @@ -170,9 +179,51 @@ uint64_t hash_bytes(const uint8_t* bytes, int64_t n); /** Whether OpenMP annotations were respected. */ bool check_openmp(); -/** get the size of L3 cache */ +/** Get L3 cache size */ int64_t get_l3_size(); +/** This class is used to combine range and knn search results + * in contrib.exhaustive_search.range_search_gpu */ + +template +struct CombinerRangeKNN { + int64_t nq; /// nb of queries + size_t k; /// number of neighbors for the knn search part + T r2; /// range search radius + bool keep_max; /// whether to keep max values instead of min. 
+ + CombinerRangeKNN(int64_t nq, size_t k, T r2, bool keep_max) + : nq(nq), k(k), r2(r2), keep_max(keep_max) {} + + /// Knn search results + const int64_t* I = nullptr; /// size nq * k + const T* D = nullptr; /// size nq * k + + /// optional: range search results (ignored if mask is NULL) + const bool* mask = + nullptr; /// mask for where knn results are valid, size nq + // range search results for remaining entries nrange = sum(mask) + const int64_t* lim_remain = nullptr; /// size nrange + 1 + const T* D_remain = nullptr; /// size lim_remain[nrange] + const int64_t* I_remain = nullptr; /// size lim_remain[nrange] + + const int64_t* L_res = nullptr; /// size nq + 1 + // Phase 1: compute sizes into limits array (of size nq + 1) + void compute_sizes(int64_t* L_res); + + /// Phase 2: caller allocates D_res and I_res (size L_res[nq]) + /// Phase 3: fill in D_res and I_res + void write_result(T* D_res, int64_t* I_res); +}; + +struct CodeSet { + size_t d; + std::set> s; + + explicit CodeSet(size_t d) : d(d) {} + void insert(size_t n, const uint8_t* codes, bool* inserted); +}; + } // namespace faiss #endif /* FAISS_utils_h */ diff --git a/thirdparty/faiss/tests/CMakeLists.txt b/thirdparty/faiss/tests/CMakeLists.txt index d4c8f5ae7..5d316bfa9 100644 --- a/thirdparty/faiss/tests/CMakeLists.txt +++ b/thirdparty/faiss/tests/CMakeLists.txt @@ -61,11 +61,27 @@ set(FAISS_TEST_SRC test_threaded_index.cpp test_transfer_invlists.cpp # test_mem_leak.cpp + test_cppcontrib_sa_decode.cpp + test_cppcontrib_uintreader.cpp + test_simdlib.cpp + test_approx_topk.cpp + test_RCQ_cropping.cpp + test_distances_simd.cpp + test_heap.cpp + test_code_distance.cpp + test_hnsw.cpp + test_partitioning.cpp + test_distances_if.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) if(FAISS_OPT_LEVEL STREQUAL "avx2") + if(NOT WIN32) + target_compile_options(faiss_test PRIVATE $<$:-mavx2 -mfma>) + else() + target_compile_options(faiss_test PRIVATE $<$:/arch:AVX2>) + endif() target_link_libraries(faiss_test PRIVATE faiss_avx2) else() target_link_libraries(faiss_test PRIVATE faiss) @@ -73,7 +89,7 @@ endif() include(FetchContent) FetchContent_Declare(googletest - URL "https://github.com/google/googletest/archive/release-1.10.0.tar.gz") + URL "https://github.com/google/googletest/archive/release-1.12.1.tar.gz") set(BUILD_GMOCK CACHE BOOL OFF) set(INSTALL_GTEST CACHE BOOL OFF) FetchContent_MakeAvailable(googletest) @@ -89,5 +105,3 @@ target_link_libraries(faiss_test PRIVATE # Defines `gtest_discover_tests()`. include(GoogleTest) gtest_discover_tests(faiss_test) - -install(TARGETS faiss_test DESTINATION unittest) \ No newline at end of file diff --git a/thirdparty/faiss/tests/common_faiss_tests.py b/thirdparty/faiss/tests/common_faiss_tests.py index 8621dd822..8dc25edec 100644 --- a/thirdparty/faiss/tests/common_faiss_tests.py +++ b/thirdparty/faiss/tests/common_faiss_tests.py @@ -49,7 +49,7 @@ def evalres(self, DI): for rank in 1, 10, 100: e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() / float(self.nq)) - print("1-recalls: %s" % e) + # print("1-recalls: %s" % e) return e diff --git a/thirdparty/faiss/tests/test_RCQ_cropping.cpp b/thirdparty/faiss/tests/test_RCQ_cropping.cpp new file mode 100644 index 000000000..4dd347088 --- /dev/null +++ b/thirdparty/faiss/tests/test_RCQ_cropping.cpp @@ -0,0 +1,131 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include + +/* This test creates a 3-level RCQ and performs a search on it. + * Then it crops the RCQ to just the 2 first levels and verifies that + * the 3-level vectors are in a subtree that was visited in the 2-level RCQ. */ +TEST(RCQCropping, test_cropping) { + size_t nq = 10, nt = 2000, nb = 1000, d = 32; + + using idx_t = faiss::idx_t; + + std::vector buf((nq + nb + nt) * d); + faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234); + const float* xt = buf.data(); + const float* xb = xt + nt * d; + const float* xq = xb + nb * d; + + std::vector nbits = {5, 4, 4}; + faiss::ResidualCoarseQuantizer rcq(d, nbits); + + rcq.train(nt, xt); + // fprintf(stderr, "nb centroids: %zd\n", rcq.ntotal); + + // the test below works only for beam size == nprobe + rcq.set_beam_factor(1.0); + + // perform search + int nprobe = 15; + std::vector Iref(nq * nprobe); + std::vector Dref(nq * nprobe); + rcq.search(nq, xq, nprobe, Dref.data(), Iref.data()); + + // crop to the first 2 quantization levels + int last_nbits = nbits.back(); + nbits.pop_back(); + faiss::ResidualCoarseQuantizer rcq_cropped(d, nbits); + rcq_cropped.initialize_from(rcq); + // fprintf(stderr, "cropped nb centroids: %zd\n", rcq_cropped.ntotal); + + EXPECT_EQ(rcq_cropped.ntotal, rcq.ntotal >> last_nbits); + + // perform search + std::vector Inew(nq * nprobe); + std::vector Dnew(nq * nprobe); + rcq_cropped.search(nq, xq, nprobe, Dnew.data(), Inew.data()); + + // these bits are in common between the two RCQs + idx_t mask = ((idx_t)1 << rcq_cropped.rq.tot_bits) - 1; + for (int q = 0; q < nq; q++) { + for (int i = 0; i < nprobe; i++) { + idx_t fine = Iref[q * nprobe + i]; + EXPECT_GE(fine, 0); + bool found = false; + + // fine should be generated from a path that passes through coarse + for (int j = 0; j < nprobe; j++) { + idx_t coarse = Inew[q * nprobe + j]; + if ((fine & mask) == coarse) { + found = true; + break; + } + } + EXPECT_TRUE(found); + } + } +} + +TEST(RCQCropping, search_params) { + size_t nq = 10, nt = 2000, nb = 1000, d = 32; + + using idx_t = faiss::idx_t; + + std::vector buf((nq + nb + nt) * d); + faiss::rand_smooth_vectors(nq + nb + nt, d, buf.data(), 1234); + const float* xt = buf.data(); + const float* xb = xt + nt * d; + const float* xq = xb + nb * d; + + std::vector nbits = {3, 6, 3}; + faiss::ResidualCoarseQuantizer quantizer(d, nbits); + size_t ntotal = (size_t)1 << quantizer.rq.tot_bits; + faiss::IndexIVFScalarQuantizer index( + &quantizer, d, ntotal, faiss::ScalarQuantizer::QT_8bit); + index.quantizer_trains_alone = true; + + index.train(nt, xt); + index.add(nb, xb); + + index.nprobe = 10; + + int k = 4; + float beam_factor_1 = 8.0; + quantizer.set_beam_factor(beam_factor_1); + std::vector I1(nq * k); + std::vector D1(nq * k); + index.search(nq, xq, k, D1.data(), I1.data()); + + // change from 8 to 1 + quantizer.set_beam_factor(1.0f); + std::vector I2(nq * k); + std::vector D2(nq * k); + index.search(nq, xq, k, D2.data(), I2.data()); + + // make sure it changes the result + EXPECT_NE(I1, I2); + EXPECT_NE(D1, D2); + + // override the class level beam factor + faiss::SearchParametersResidualCoarseQuantizer params1; + params1.beam_factor = beam_factor_1; + faiss::SearchParametersIVF params; + params.nprobe = index.nprobe; + params.quantizer_params = ¶ms1; + + std::vector I3(nq * k); + std::vector D3(nq * k); + index.search(nq, xq, k, D3.data(), I3.data(), ¶ms); + + // make sure we find back the original results + EXPECT_EQ(I1, I3); + EXPECT_EQ(D1, D3); +} diff --git 
a/thirdparty/faiss/tests/test_approx_topk.cpp b/thirdparty/faiss/tests/test_approx_topk.cpp new file mode 100644 index 000000000..23087f629 --- /dev/null +++ b/thirdparty/faiss/tests/test_approx_topk.cpp @@ -0,0 +1,225 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +// +using namespace faiss; + +// +template +void test_approx_topk( + const uint32_t beamSize, + const uint32_t nPerBeam, + const uint32_t k, + const uint32_t nDatasetsToTest, + const bool verbose) { + if (verbose) { + printf("-----------\n"); + } + + // generate random data + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // matches + size_t nMatches = 0; + // the element was completely missed in approx version. + size_t nMissed = 0; + // the element is available + size_t nAvailable = 0; + // the distance is the same, but the index is different. + size_t nSoftMismatches = 0; + // the distances are different + size_t nHardMismatches = 0; + // error of distances + double sqrError = 0.0; + + // + double timeBaseline = 0.0; + double timeApprox = 0.0; + + for (size_t iDataset = 0; iDataset < nDatasetsToTest; iDataset++) { + const size_t n = (size_t)(nPerBeam)*beamSize; + std::vector distances(n, 0); + for (size_t i = 0; i < n; i++) { + distances[i] = u(rng); + } + + // + using C = CMax; + + // do a regular beam search + std::vector baselineDistances(k, C::neutral()); + std::vector baselineIndices(k, -1); + + auto startBaseline = std::chrono::high_resolution_clock::now(); + heap_addn( + k, + baselineDistances.data(), + baselineIndices.data(), + distances.data(), + nullptr, + nPerBeam * beamSize); + auto endBaseline = std::chrono::high_resolution_clock::now(); + std::chrono::duration diffBaseline = + endBaseline - startBaseline; + timeBaseline += diffBaseline.count(); + + heap_reorder(k, baselineDistances.data(), baselineIndices.data()); + + // do an approximate beam search + std::vector approxDistances(k, C::neutral()); + std::vector approxIndices(k, -1); + + auto startApprox = std::chrono::high_resolution_clock::now(); + try { + HeapWithBuckets::bs_addn( + beamSize, + nPerBeam, + distances.data(), + k, + approxDistances.data(), + approxIndices.data()); + } catch (const faiss::FaissException& ex) { + // + if (verbose) { + printf("Skipping the case.\n"); + } + return; + } + + auto endApprox = std::chrono::high_resolution_clock::now(); + std::chrono::duration diffApprox = endApprox - startApprox; + timeApprox += diffApprox.count(); + + heap_reorder(k, approxDistances.data(), approxIndices.data()); + + bool bGotMismatches = false; + + // the error + for (uint32_t i = 0; i < k; i++) { + if (baselineDistances[i] != approxDistances[i]) { + nHardMismatches += 1; + + double diff = baselineDistances[i] - approxDistances[i]; + sqrError += diff * diff; + + bGotMismatches = true; + + if (verbose) { + printf("i=%d, bs.d=%f, bs.i=%d, app.d=%f, app.i=%d\n", + i, + baselineDistances[i], + baselineIndices[i], + approxDistances[i], + approxIndices[i]); + } + } else { + if (baselineIndices[i] != approxIndices[i]) { + nSoftMismatches += 1; + } else { + nMatches += 1; + } + } + } + + if (bGotMismatches) { + if (verbose) { + printf("\n"); + } + } + + // + std::unordered_set bsIndicesHS( + baselineIndices.cbegin(), baselineIndices.cend()); + for 
(uint32_t i = 0; i < k; i++) { + auto itr = bsIndicesHS.find(approxIndices[i]); + if (itr != bsIndicesHS.cend()) { + nAvailable += 1; + } else { + nMissed += 1; + } + } + } + + if (verbose) { + printf("%d, %d, %d, %d, %d, %d: %ld, %ld, %ld, %f, %ld, %ld, %f, %f\n", + NBUCKETS, + N, + beamSize, + nPerBeam, + k, + nDatasetsToTest, + nMatches, + nSoftMismatches, + nHardMismatches, + sqrError, + nAvailable, + nMissed, + timeBaseline, + timeApprox); + } + + // just confirm that the error is not crazy + if (NBUCKETS * N * beamSize >= k) { + EXPECT_TRUE(nAvailable > nMissed); + } else { + // it is possible that the results are crazy here. Skip it. + } +} + +// +TEST(testApproxTopk, COMMON) { + constexpr bool verbose = false; + + // + const uint32_t nDifferentDatasets = 8; + + uint32_t kValues[] = {1, 2, 3, 5, 8, 13, 21, 34}; + + for (size_t codebookBitSize = 8; codebookBitSize <= 10; codebookBitSize++) { + const uint32_t codebookSize = 1 << codebookBitSize; + for (const auto k : kValues) { + test_approx_topk<1 * 8, 3>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<1 * 8, 3>( + k, codebookSize, k, nDifferentDatasets, verbose); + + test_approx_topk<1 * 8, 2>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<1 * 8, 2>( + k, codebookSize, k, nDifferentDatasets, verbose); + + test_approx_topk<2 * 8, 2>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<2 * 8, 2>( + k, codebookSize, k, nDifferentDatasets, verbose); + + test_approx_topk<4 * 8, 2>( + 1, codebookSize, k, nDifferentDatasets, verbose); + test_approx_topk<4 * 8, 2>( + k, codebookSize, k, nDifferentDatasets, verbose); + } + } +} + +// diff --git a/thirdparty/faiss/tests/test_binary_flat.cpp b/thirdparty/faiss/tests/test_binary_flat.cpp index 6076b3cb4..ea37d16e2 100644 --- a/thirdparty/faiss/tests/test_binary_flat.cpp +++ b/thirdparty/faiss/tests/test_binary_flat.cpp @@ -42,7 +42,7 @@ TEST(BinaryFlat, accuracy) { } int k = 5; - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff --git a/thirdparty/faiss/tests/test_build_blocks.py b/thirdparty/faiss/tests/test_build_blocks.py index 017ab1847..0a97e6318 100644 --- a/thirdparty/faiss/tests/test_build_blocks.py +++ b/thirdparty/faiss/tests/test_build_blocks.py @@ -256,6 +256,14 @@ def test_normalized(self): print(comments) assert 'vectors are normalized' in comments + def test_hash(self): + cc = [] + for _ in range(2): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + cc.append(faiss.MatrixStats(m).hash_value) + self.assertTrue(cc[0] == cc[1]) + class TestScalarQuantizer(unittest.TestCase): @@ -325,6 +333,32 @@ def test_6bit_equiv(self): # print(dis, D[i, j]) assert abs(D[i, j] - dis) / dis < 1e-5 + def test_reconstruct(self): + self.do_reconstruct(True) + + def test_reconstruct_no_residual(self): + self.do_reconstruct(False) + + def do_reconstruct(self, by_residual): + d = 32 + xt, xb, xq = get_dataset_2(d, 100, 5, 5) + + index = faiss.index_factory(d, "IVF10,SQ8") + index.by_residual = by_residual + index.train(xt) + index.add(xb) + index.nprobe = 10 + D, I = index.search(xq, 4) + xb2 = index.reconstruct_n(0, index.ntotal) + for i in range(5): + for j in range(4): + self.assertAlmostEqual( + ((xq[i] - xb2[I[i, j]]) ** 2).sum(), + D[i, j], + places=4 + ) + + class TestRandom(unittest.TestCase): def test_rand(self): @@ -340,6 +374,24 @@ def test_randint(self): print(c) assert c.max() - c.min() < 50 * 2 + def 
test_rand_vector(self): + """ test if the smooth_vectors function is reasonably compressible with + a small PQ """ + x = faiss.rand_smooth_vectors(1300, 32) + xt = x[:1000] + xb = x[1000:1200] + xq = x[1200:] + _, gt = faiss.knn(xq, xb, 10) + index = faiss.IndexPQ(32, 4, 4) + index.train(xt) + index.add(xb) + D, I = index.search(xq, 10) + ninter = faiss.eval_intersection(I, gt) + # 445 for SyntheticDataset + self.assertGreater(ninter, 420) + self.assertLess(ninter, 460) + + class TestPairwiseDis(unittest.TestCase): @@ -498,6 +550,23 @@ def subtest(self, d, K, metric): print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall)) assert recall > 0.99 + def test_small_nndescent(self): + """ building a too small graph used to crash, make sure it raises + an exception instead. + TODO: build the exact knn graph for small cases + """ + d = 32 + K = 10 + index = faiss.IndexNNDescentFlat(d, K, faiss.METRIC_L2) + index.nndescent.S = 10 + index.nndescent.R = 32 + index.nndescent.L = K + 20 + index.nndescent.iter = 5 + index.verbose = True + + xb = np.zeros((78, d), dtype='float32') + self.assertRaises(RuntimeError, index.add, xb) + class TestResultHeap(unittest.TestCase): @@ -529,3 +598,155 @@ def run_test(self, keep_max): np.testing.assert_equal(all_rh[1].D, all_rh[3].D) np.testing.assert_equal(all_rh[1].I, all_rh[3].I) + + +class TestReconstructBatch(unittest.TestCase): + + def test_indexflat(self): + index = faiss.IndexFlatL2(32) + x = faiss.randn((100, 32), 1234) + index.add(x) + + subset = [4, 7, 45] + np.testing.assert_equal(x[subset], index.reconstruct_batch(subset)) + + def test_exception(self): + index = faiss.index_factory(32, "IVF2,Flat") + x = faiss.randn((100, 32), 1234) + index.train(x) + index.add(x) + + # make sure it raises an exception even if it enters the openmp for + subset = np.zeros(1200, dtype=int) + self.assertRaises( + RuntimeError, + lambda : index.reconstruct_batch(subset), + ) + + +class TestBucketSort(unittest.TestCase): + + def do_test_bucket_sort(self, nt): + rs = np.random.RandomState(123) + tab = rs.randint(100, size=1000, dtype='int64') + lims, perm = faiss.bucket_sort(tab, nt=nt) + for i in range(max(tab) + 1): + assert np.all(tab[perm[lims[i]: lims[i + 1]]] == i) + + def test_bucket_sort(self): + self.do_test_bucket_sort(0) + + def test_bucket_sort_parallel(self): + self.do_test_bucket_sort(4) + + def do_test_bucket_sort_inplace( + self, nt, nrow=500, ncol=20, nbucket=300, repro=False, + dtype='int32'): + rs = np.random.RandomState(123) + tab = rs.randint(nbucket, size=(nrow, ncol), dtype=dtype) + + tab2 = tab.copy() + faiss.cvar.bucket_sort_verbose + faiss.cvar.bucket_sort_verbose = 1 + + lims = faiss.matrix_bucket_sort_inplace(tab2, nt=nt) + tab2 = tab2.ravel() + + for b in range(nbucket): + rows, _ = np.where(tab == b) + rows.sort() + tab2[lims[b]:lims[b + 1]].sort() + # print(rows, tab2[lims[b] : lims[b + 1]]) + rows = set(rows) + self.assertEqual(rows, set(tab2[lims[b]:lims[b + 1]])) + + def test_bucket_sort_inplace(self): + self.do_test_bucket_sort_inplace(0) + + def test_bucket_sort_inplace_parallel(self): + self.do_test_bucket_sort_inplace(4) + + def test_bucket_sort_inplace_parallel_fewcol(self): + self.do_test_bucket_sort_inplace(4, ncol=3) + + def test_bucket_sort_inplace_parallel_fewbucket(self): + self.do_test_bucket_sort_inplace(4, nbucket=5) + + def test_bucket_sort_inplace_int64(self): + self.do_test_bucket_sort_inplace(0, dtype='int64') + + def test_bucket_sort_inplace_parallel_int64(self): + self.do_test_bucket_sort_inplace(4, 
dtype='int64') + +class TestMergeKNNResults(unittest.TestCase): + + def do_test(self, ismax, dtype): + rs = np.random.RandomState() + n, k, nshard = 10, 5, 3 + all_ids = rs.randint(100000, size=(nshard, n, k)).astype('int64') + all_dis = rs.rand(nshard, n, k) + if dtype == 'int32': + all_dis = (all_dis * 1000000).astype("int32") + else: + all_dis = all_dis.astype(dtype) + for i in range(nshard): + for j in range(n): + all_dis[i, j].sort() + if ismax: + all_dis[i, j] = all_dis[i, j][::-1] + Dref = np.zeros((n, k), dtype=dtype) + Iref = np.zeros((n, k), dtype='int64') + + for i in range(n): + dis = all_dis[:, i, :].ravel() + ids = all_ids[:, i, :].ravel() + o = dis.argsort() + if ismax: + o = o[::-1] + Dref[i] = dis[o[:k]] + Iref[i] = ids[o[:k]] + + Dnew, Inew = faiss.merge_knn_results(all_dis, all_ids, keep_max=ismax) + np.testing.assert_array_equal(Dnew, Dref) + np.testing.assert_array_equal(Inew, Iref) + + def test_min_float(self): + self.do_test(ismax=False, dtype='float32') + + def test_max_int(self): + self.do_test(ismax=True, dtype='int32') + + def test_max_float(self): + self.do_test(ismax=True, dtype='float32') + + +class TestMapInt64ToInt64(unittest.TestCase): + + def do_test(self, capacity, n): + """ test that we are able to lookup """ + rs = np.random.RandomState(123) + # make sure we have unique values + keys = np.unique(rs.choice(2 ** 29, size=n).astype("int64")) + rs.shuffle(keys) + n = keys.size + vals = rs.choice(2 ** 30, size=n).astype('int64') + tab = faiss.MapInt64ToInt64(capacity) + tab.add(keys, vals) + + # lookup and check + vals2 = tab.lookup(keys) + np.testing.assert_array_equal(vals, vals2) + + # make a few keys that we know are not there + mask = rs.rand(n) < 0.3 + keys[mask] = rs.choice(2 ** 29, size=n)[mask] + 2 ** 29 + vals2 = tab.lookup(keys) + np.testing.assert_array_equal(-1, vals2[mask]) + np.testing.assert_array_equal(vals[~mask], vals2[~mask]) + + def test_small(self): + self.do_test(16384, 10000) + + def xx_test_large(self): + # don't run by default because it's slow + self.do_test(2 ** 21, 10 ** 6) diff --git a/thirdparty/faiss/tests/test_clone.py b/thirdparty/faiss/tests/test_clone.py new file mode 100644 index 000000000..1cc98668b --- /dev/null +++ b/thirdparty/faiss/tests/test_clone.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import faiss +import numpy as np + +from faiss.contrib import datasets + +faiss.omp_set_num_threads(4) + + +class TestClone(unittest.TestCase): + """ + Test clone_index for various index combinations. 
+ """ + + def do_test_clone(self, factory, with_ids=False): + """ + Verify that cloning works for a given index type + """ + d = 32 + ds = datasets.SyntheticDataset(d, 1000, 2000, 10) + index1 = faiss.index_factory(d, factory) + index1.train(ds.get_train()) + if with_ids: + index1.add_with_ids(ds.get_database(), + np.arange(ds.nb).astype("int64")) + else: + index1.add(ds.get_database()) + k = 5 + Dref1, Iref1 = index1.search(ds.get_queries(), k) + + index2 = faiss.clone_index(index1) + self.assertEqual(type(index1), type(index2)) + index1 = None + + Dref2, Iref2 = index2.search(ds.get_queries(), k) + np.testing.assert_array_equal(Dref1, Dref2) + np.testing.assert_array_equal(Iref1, Iref2) + + def test_RFlat(self): + self.do_test_clone("SQ4,RFlat") + + def test_Refine(self): + self.do_test_clone("SQ4,Refine(SQ8)") + + def test_IVF(self): + self.do_test_clone("IVF16,Flat") + + def test_PCA(self): + self.do_test_clone("PCA8,Flat") + + def test_IDMap(self): + self.do_test_clone("IVF16,Flat,IDMap", with_ids=True) + + def test_IDMap2(self): + self.do_test_clone("IVF16,Flat,IDMap2", with_ids=True) + + def test_NSGPQ(self): + self.do_test_clone("NSG32,Flat") + + def test_IVFAdditiveQuantizer(self): + self.do_test_clone("IVF16,LSQ5x6_Nqint8") + self.do_test_clone("IVF16,RQ5x6_Nqint8") + self.do_test_clone("IVF16,PLSQ4x3x5_Nqint8") + self.do_test_clone("IVF16,PRQ4x3x5_Nqint8") + + def test_IVFAdditiveQuantizerFastScan(self): + self.do_test_clone("IVF16,LSQ3x4fs_32_Nlsq2x4") + self.do_test_clone("IVF16,RQ3x4fs_32_Nlsq2x4") + self.do_test_clone("IVF16,PLSQ2x3x4fs_Nlsq2x4") + self.do_test_clone("IVF16,PRQ2x3x4fs_Nrq2x4") + + def test_AdditiveQuantizer(self): + self.do_test_clone("LSQ5x6_Nqint8") + self.do_test_clone("RQ5x6_Nqint8") + self.do_test_clone("PLSQ4x3x5_Nqint8") + self.do_test_clone("PRQ4x3x5_Nqint8") + + def test_AdditiveQuantizerFastScan(self): + self.do_test_clone("LSQ3x4fs_32_Nlsq2x4") + self.do_test_clone("RQ3x4fs_32_Nlsq2x4") + self.do_test_clone("PLSQ2x3x4fs_Nlsq2x4") + self.do_test_clone("PRQ2x3x4fs_Nrq2x4") diff --git a/thirdparty/faiss/tests/test_code_distance.cpp b/thirdparty/faiss/tests/test_code_distance.cpp new file mode 100644 index 000000000..4a2021a78 --- /dev/null +++ b/thirdparty/faiss/tests/test_code_distance.cpp @@ -0,0 +1,240 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +size_t nMismatches( + const std::vector& ref, + const std::vector& candidate) { + size_t count = 0; + for (size_t i = 0; i < count; i++) { + double abs = std::abs(ref[i] - candidate[i]); + if (abs >= 1e-5) { + count += 1; + } + } + + return count; +} + +void test( + // dimensionality of the data + const size_t dim, + // number of subquantizers + const size_t subq, + // bits per subquantizer + const size_t nbits, + // number of codes to process + const size_t n) { + FAISS_THROW_IF_NOT(nbits == 8); + + // remove if benchmarking is needed + omp_set_num_threads(1); + + // rng + std::minstd_rand rng(123); + std::uniform_int_distribution u(0, 255); + std::uniform_real_distribution uf(0, 1); + + // initialize lookup + std::vector lookup(256 * subq, 0); + for (size_t i = 0; i < lookup.size(); i++) { + lookup[i] = uf(rng); + } + + // initialize codes + std::vector codes(n * subq); +#pragma omp parallel + { + std::minstd_rand rng0(123); + std::uniform_int_distribution u1(0, 255); + +#pragma omp for schedule(guided) + for (size_t i = 0; i < codes.size(); i++) { + codes[i] = u1(rng0); + } + } + + // warmup. compute reference results + std::vector resultsRef(n, 0); + for (size_t k = 0; k < 10; k++) { +#pragma omp parallel for schedule(guided) + for (size_t i = 0; i < n; i++) { + resultsRef[i] = + faiss::distance_single_code_generic( + subq, 8, lookup.data(), codes.data() + subq * i); + } + } + + // generic, 1 code per step + std::vector resultsNewGeneric1x(n, 0); + double generic1xMsec = 0; + { + const auto startingTimepoint = std::chrono::steady_clock::now(); + for (size_t k = 0; k < 1000; k++) { +#pragma omp parallel for schedule(guided) + for (size_t i = 0; i < n; i++) { + resultsNewGeneric1x[i] = + faiss::distance_single_code_generic( + subq, + 8, + lookup.data(), + codes.data() + subq * i); + } + } + const auto endingTimepoint = std::chrono::steady_clock::now(); + + std::chrono::duration duration = + endingTimepoint - startingTimepoint; + generic1xMsec = (duration.count() * 1000.0); + } + + // generic, 4 codes per step + std::vector resultsNewGeneric4x(n, 0); + double generic4xMsec = 0; + { + const auto startingTimepoint = std::chrono::steady_clock::now(); + for (size_t k = 0; k < 1000; k++) { +#pragma omp parallel for schedule(guided) + for (size_t i = 0; i < n; i += 4) { + faiss::distance_four_codes_generic( + subq, + 8, + lookup.data(), + codes.data() + subq * (i + 0), + codes.data() + subq * (i + 1), + codes.data() + subq * (i + 2), + codes.data() + subq * (i + 3), + resultsNewGeneric4x[i + 0], + resultsNewGeneric4x[i + 1], + resultsNewGeneric4x[i + 2], + resultsNewGeneric4x[i + 3]); + } + } + + const auto endingTimepoint = std::chrono::steady_clock::now(); + + std::chrono::duration duration = + endingTimepoint - startingTimepoint; + generic4xMsec = (duration.count() * 1000.0); + } + + // generic, 1 code per step + std::vector resultsNewCustom1x(n, 0); + double custom1xMsec = 0; + { + const auto startingTimepoint = std::chrono::steady_clock::now(); + for (size_t k = 0; k < 1000; k++) { +#pragma omp parallel for schedule(guided) + for (size_t i = 0; i < n; i++) { + resultsNewCustom1x[i] = + faiss::distance_single_code( + subq, + 8, + lookup.data(), + codes.data() + subq * i); + } + } + const auto endingTimepoint = std::chrono::steady_clock::now(); + + std::chrono::duration duration = + endingTimepoint - startingTimepoint; + custom1xMsec = 
(duration.count() * 1000.0); + } + + // generic, 4 codes per step + std::vector resultsNewCustom4x(n, 0); + double custom4xMsec = 0; + { + const auto startingTimepoint = std::chrono::steady_clock::now(); + for (size_t k = 0; k < 1000; k++) { +#pragma omp parallel for schedule(guided) + for (size_t i = 0; i < n; i += 4) { + faiss::distance_four_codes( + subq, + 8, + lookup.data(), + codes.data() + subq * (i + 0), + codes.data() + subq * (i + 1), + codes.data() + subq * (i + 2), + codes.data() + subq * (i + 3), + resultsNewCustom4x[i + 0], + resultsNewCustom4x[i + 1], + resultsNewCustom4x[i + 2], + resultsNewCustom4x[i + 3]); + } + } + + const auto endingTimepoint = std::chrono::steady_clock::now(); + + std::chrono::duration duration = + endingTimepoint - startingTimepoint; + custom4xMsec = (duration.count() * 1000.0); + } + + const size_t nMismatchesG1 = nMismatches(resultsRef, resultsNewGeneric1x); + const size_t nMismatchesG4 = nMismatches(resultsRef, resultsNewGeneric4x); + const size_t nMismatchesCustom1 = + nMismatches(resultsRef, resultsNewCustom1x); + const size_t nMismatchesCustom4 = + nMismatches(resultsRef, resultsNewCustom4x); + + std::cout << "Dim = " << dim << ", subq = " << subq << ", nbits = " << nbits + << ", n = " << n << std::endl; + std::cout << "Generic 1x code: " << generic1xMsec << " msec, " + << nMismatchesG1 << " mismatches" << std::endl; + std::cout << "Generic 4x code: " << generic4xMsec << " msec, " + << nMismatchesG4 << " mismatches" << std::endl; + std::cout << "custom 1x code: " << custom1xMsec << " msec, " + << nMismatchesCustom1 << " mismatches" << std::endl; + std::cout << "custom 4x code: " << custom4xMsec << " msec, " + << nMismatchesCustom4 << " mismatches" << std::endl; + std::cout << std::endl; + + ASSERT_EQ(nMismatchesG1, 0); + ASSERT_EQ(nMismatchesG4, 0); + ASSERT_EQ(nMismatchesCustom1, 0); + ASSERT_EQ(nMismatchesCustom4, 0); +} + +// this test can be used as a benchmark. +// 1. Increase the value of NELEMENTS +// 2. Remove omp_set_num_threads() + +constexpr size_t NELEMENTS = 10000; + +TEST(TestCodeDistance, SUBQ4_NBITS8) { + test(256, 4, 8, NELEMENTS); +} + +TEST(TestCodeDistance, SUBQ8_NBITS8) { + test(256, 8, 8, NELEMENTS); +} + +TEST(TestCodeDistance, SUBQ16_NBITS8) { + test(256, 16, 8, NELEMENTS); +} + +TEST(TestCodeDistance, SUBQ32_NBITS8) { + test(256, 32, 8, NELEMENTS); +} diff --git a/thirdparty/faiss/tests/test_contrib.py b/thirdparty/faiss/tests/test_contrib.py index 4723597b8..36c17792c 100644 --- a/thirdparty/faiss/tests/test_contrib.py +++ b/thirdparty/faiss/tests/test_contrib.py @@ -7,32 +7,35 @@ import unittest import numpy as np import platform +import os +import random +import tempfile from faiss.contrib import datasets from faiss.contrib import inspect_tools from faiss.contrib import evaluation from faiss.contrib import ivf_tools +from faiss.contrib import clustering +from faiss.contrib import big_batch_search from common_faiss_tests import get_dataset_2 try: from faiss.contrib.exhaustive_search import \ knn_ground_truth, knn, range_ground_truth, \ range_search_max_results, exponential_query_iterator - except: pass # Submodule import broken in python 2. 
- -@unittest.skipIf(platform.python_version_tuple()[0] < '3', \ +@unittest.skipIf(platform.python_version_tuple()[0] < '3', 'Submodule import broken in python 2.') class TestComputeGT(unittest.TestCase): - def test_compute_GT(self): + def do_test_compute_GT(self, metric=faiss.METRIC_L2): d = 64 xt, xb, xq = get_dataset_2(d, 0, 10000, 100) - index = faiss.IndexFlatL2(d) + index = faiss.IndexFlat(d, metric) index.add(xb) Dref, Iref = index.search(xq, 10) @@ -42,12 +45,19 @@ def matrix_iterator(xb, bs): for i0 in range(0, xb.shape[0], bs): yield xb[i0:i0 + bs] - Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10) + Dnew, Inew = knn_ground_truth( + xq, matrix_iterator(xb, 1000), 10, metric) np.testing.assert_array_equal(Iref, Inew) # decimal = 4 required when run on GPU np.testing.assert_almost_equal(Dref, Dnew, decimal=4) + def test_compute_GT(self): + self.do_test_compute_GT() + + def test_compute_GT_ip(self): + self.do_test_compute_GT(faiss.METRIC_INNER_PRODUCT) + class TestDatasets(unittest.TestCase): """here we test only the synthetic dataset. Datasets that require @@ -72,7 +82,6 @@ def test_synthetic_ip(self): index.search(ds.get_queries(), 100)[1] ) - def test_synthetic_iterator(self): ds = datasets.SyntheticDataset(32, 1000, 2000, 10) xb = ds.get_database() @@ -98,7 +107,6 @@ def test_knn_cpu(self): assert np.all(Inew == Iref) assert np.allclose(Dref, Dnew) - index = faiss.IndexFlatIP(32) index.add(xb) Dref, Iref = index.search(xq, 10) @@ -123,7 +131,7 @@ def do_test_range(self, metric): xq, ds.database_iterator(bs=100), threshold, ngpu=0, metric_type=metric) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( ref_lims, ref_D, ref_I, new_lims, new_D, new_I ) @@ -154,7 +162,7 @@ def matrix_iterator(xb, bs): _, new_lims, new_D, new_I = range_search_max_results( index, matrix_iterator(xq, 100), threshold, max_results=1e10) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( ref_lims, ref_D, ref_I, new_lims, new_D, new_I ) @@ -168,7 +176,7 @@ def matrix_iterator(xb, bs): ref_lims, ref_D, ref_I = index.range_search(xq, new_threshold) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( ref_lims, ref_D, ref_I, new_lims, new_D, new_I ) @@ -201,6 +209,26 @@ def test_IndexFlat(self): xb, inspect_tools.get_flat_data(index) ) + def test_make_LT(self): + rs = np.random.RandomState(123) + X = rs.rand(13, 20).astype('float32') + A = rs.rand(5, 20).astype('float32') + b = rs.rand(5).astype('float32') + Yref = X @ A.T + b + lt = inspect_tools.make_LinearTransform_matrix(A, b) + Ynew = lt.apply(X) + np.testing.assert_allclose(Yref, Ynew, rtol=1e-06) + + def test_NSG_neighbors(self): + # FIXME number of elements to add should be >> 100 + ds = datasets.SyntheticDataset(32, 0, 200, 10) + index = faiss.index_factory(ds.d, "NSG") + index.add(ds.get_database()) + neighbors = inspect_tools.get_NSG_neighbors(index.nsg) + # neighbors should be either valid indexes or -1 + np.testing.assert_array_less(-2, neighbors) + np.testing.assert_array_less(neighbors, ds.nb) + class TestRangeEval(unittest.TestCase): @@ -305,14 +333,16 @@ def test_float(self): a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel() ivf_tools.add_preassigned(index, xb, a) - # search elements xq, increase nprobe, check 4 first results w/ groundtruth + # search elements xq, increase nprobe, check 4 first results w/ + # groundtruth prev_inter_perf = 0 for nprobe in 1, 10, 20: index.nprobe = nprobe a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1] D, I 
= ivf_tools.search_preassigned(index, xq, 4, a) - inter_perf = (I == ds.get_groundtruth()[:, :4]).sum() / I.size + inter_perf = faiss.eval_intersection( + I, ds.get_groundtruth()[:, :4]) self.assertTrue(inter_perf >= prev_inter_perf) prev_inter_perf = inter_perf @@ -328,7 +358,8 @@ def test_float(self): lims, DR, IR = ivf_tools.range_search_preassigned(index, xq, radius, a) - # with that radius the k-NN results are a subset of the range search results + # with that radius the k-NN results are a subset of the range search + # results for q in range(len(xq)): l0, l1 = lims[q], lims[q + 1] self.assertTrue(set(I[q]) <= set(IR[l0:l1])) @@ -341,7 +372,8 @@ def test_binary(self): xq = ds.get_queries() xb = ds.get_database() - # define alternative quantizer on the 20 first dims of vectors (will be in float) + # define alternative quantizer on the 20 first dims of vectors + # (will be in float) km = faiss.Kmeans(20, 50) km.train(xt[:, :20].copy()) alt_quantizer = km.index @@ -362,15 +394,22 @@ def test_binary(self): a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel() ivf_tools.add_preassigned(index, xb_bin, a) - # search elements xq, increase nprobe, check 4 first results w/ groundtruth + # recompute GT in binary + k = 15 + ib = faiss.IndexBinaryFlat(128) + ib.add(xb_bin) + Dgt, Igt = ib.search(xq_bin, k) + + # search elements xq, increase nprobe, check 4 first results w/ + # groundtruth prev_inter_perf = 0 for nprobe in 1, 10, 20: index.nprobe = nprobe a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1] - D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a) - inter_perf = (I == ds.get_groundtruth()[:, :4]).sum() / I.size - self.assertTrue(inter_perf >= prev_inter_perf) + D, I = ivf_tools.search_preassigned(index, xq_bin, k, a) + inter_perf = faiss.eval_intersection(I, Igt) + self.assertGreaterEqual(inter_perf, prev_inter_perf) prev_inter_perf = inter_perf # test range search @@ -383,9 +422,11 @@ def test_binary(self): D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a) radius = int(D.max() + 1) - lims, DR, IR = ivf_tools.range_search_preassigned(index, xq_bin, radius, a) + lims, DR, IR = ivf_tools.range_search_preassigned( + index, xq_bin, radius, a) - # with that radius the k-NN results are a subset of the range search results + # with that radius the k-NN results are a subset of the range + # search results for q in range(len(xq)): l0, l1 = lims[q], lims[q + 1] self.assertTrue(set(I[q]) <= set(IR[l0:l1])) @@ -405,16 +446,17 @@ def do_test(self, metric_type): # baseline = search with that radius lims_ref, Dref, Iref = index.range_search(ds.get_queries(), radius0) - # now see if using just the total number of results, we can get back the same - # result table + # now see if using just the total number of results, we can get back + # the same result table query_iterator = exponential_query_iterator(ds.get_queries()) init_radius = 1e10 if metric_type == faiss.METRIC_L2 else -1e10 radius1, lims_new, Dnew, Inew = range_search_max_results( - index, query_iterator, init_radius, min_results=Dref.size, clip_to_min=True + index, query_iterator, init_radius, + min_results=Dref.size, clip_to_min=True ) - evaluation.test_ref_range_results( + evaluation.check_ref_range_results( lims_ref, Dref, Iref, lims_new, Dnew, Inew ) @@ -424,3 +466,191 @@ def test_L2(self): def test_IP(self): self.do_test(faiss.METRIC_INNER_PRODUCT) + + def test_binary(self): + ds = datasets.SyntheticDataset(64, 1000, 1000, 200) + tobinary = faiss.index_factory(ds.d, "LSHrt") + tobinary.train(ds.get_train()) + 
index = faiss.IndexBinaryFlat(ds.d) + xb = tobinary.sa_encode(ds.get_database()) + xq = tobinary.sa_encode(ds.get_queries()) + index.add(xb) + + # find a reasonable radius + D, _ = index.search(xq, 10) + radius0 = int(np.median(D[:, -1])) + + # baseline = search with that radius + lims_ref, Dref, Iref = index.range_search(xq, radius0) + + # now see if using just the total number of results, we can get back + # the same result table + query_iterator = exponential_query_iterator(xq) + + radius1, lims_new, Dnew, Inew = range_search_max_results( + index, query_iterator, ds.d // 2, + min_results=Dref.size, clip_to_min=True + ) + + evaluation.check_ref_range_results( + lims_ref, Dref, Iref, + lims_new, Dnew, Inew + ) + + +class TestClustering(unittest.TestCase): + + def test_2level(self): + " verify that 2-level clustering is not too sub-optimal " + ds = datasets.SyntheticDataset(32, 10000, 0, 0) + xt = ds.get_train() + km_ref = faiss.Kmeans(ds.d, 100) + km_ref.train(xt) + err = faiss.knn(xt, km_ref.centroids, 1)[0].sum() + + centroids2, _ = clustering.two_level_clustering(xt, 10, 100) + err2 = faiss.knn(xt, centroids2, 1)[0].sum() + + self.assertLess(err2, err * 1.1) + + def test_ivf_train_2level(self): + " check 2-level clustering with IVF training " + ds = datasets.SyntheticDataset(32, 10000, 1000, 200) + index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8") + faiss.extract_index_ivf(index).nprobe = 10 + index.train(ds.get_train()) + index.add(ds.get_database()) + Dref, Iref = index.search(ds.get_queries(), 1) + + index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8") + faiss.extract_index_ivf(index).nprobe = 10 + clustering.train_ivf_index_with_2level( + index, ds.get_train(), verbose=True, rebalance=False) + index.add(ds.get_database()) + Dnew, Inew = index.search(ds.get_queries(), 1) + + # normally 47 / 200 differences + ndiff = (Iref != Inew).sum() + self.assertLess(ndiff, 51) + + +class TestBigBatchSearch(unittest.TestCase): + + def do_test(self, factory_string, metric=faiss.METRIC_L2): + # ds = datasets.SyntheticDataset(32, 2000, 4000, 1000) + ds = datasets.SyntheticDataset(32, 2000, 400, 500) + k = 10 + index = faiss.index_factory(ds.d, factory_string, metric) + assert index.metric_type == metric + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), k) + # faiss.omp_set_num_threads(1) + for method in ("pairwise_distances", "knn_function", "index"): + for threaded in 0, 1, 2: + Dnew, Inew = big_batch_search.big_batch_search( + index, ds.get_queries(), + k, method=method, + threaded=threaded + ) + self.assertLess((Inew != Iref).sum() / Iref.size, 1e-4) + np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + + def test_Flat(self): + self.do_test("IVF64,Flat") + + def test_Flat_IP(self): + self.do_test("IVF64,Flat", metric=faiss.METRIC_INNER_PRODUCT) + + def test_PQ(self): + self.do_test("IVF64,PQ4np") + + def test_SQ(self): + self.do_test("IVF64,SQ8") + + def test_checkpoint(self): + ds = datasets.SyntheticDataset(32, 2000, 400, 500) + k = 10 + index = faiss.index_factory(ds.d, "IVF64,SQ8") + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), k) + + checkpoint = tempfile.mktemp() + try: + # First big batch search + try: + Dnew, Inew = big_batch_search.big_batch_search( + index, ds.get_queries(), + k, method="knn_function", + threaded=2, + checkpoint=checkpoint, checkpoint_freq=0.1, + crash_at=20 + ) + except ZeroDivisionError: + pass + else: + 
self.assertFalse("should have crashed") + # Second big batch search + Dnew, Inew = big_batch_search.big_batch_search( + index, ds.get_queries(), + k, method="knn_function", + threaded=2, + checkpoint=checkpoint, checkpoint_freq=5 + ) + self.assertLess((Inew != Iref).sum() / Iref.size, 1e-4) + np.testing.assert_almost_equal(Dnew, Dref, decimal=4) + finally: + if os.path.exists(checkpoint): + os.unlink(checkpoint) + + +class TestInvlistSort(unittest.TestCase): + + def test_sort(self): + """ make sure that the search results do not change + after sorting the inverted lists """ + ds = datasets.SyntheticDataset(32, 2000, 200, 20) + index = faiss.index_factory(ds.d, "IVF50,SQ8") + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), 5) + + ivf_tools.sort_invlists_by_size(index) + list_sizes = ivf_tools.get_invlist_sizes(index.invlists) + assert np.all(list_sizes[1:] >= list_sizes[:-1]) + + Dnew, Inew = index.search(ds.get_queries(), 5) + np.testing.assert_equal(Dnew, Dref) + np.testing.assert_equal(Inew, Iref) + + def test_hnsw_permute(self): + """ make sure HNSW permutation works (useful when used as coarse quantizer) """ + ds = datasets.SyntheticDataset(32, 0, 1000, 50) + index = faiss.index_factory(ds.d, "HNSW32,Flat") + index.add(ds.get_database()) + Dref, Iref = index.search(ds.get_queries(), 5) + rs = np.random.RandomState(1234) + perm = rs.permutation(index.ntotal) + index.permute_entries(perm) + Dnew, Inew = index.search(ds.get_queries(), 5) + np.testing.assert_equal(Dnew, Dref) + Inew_remap = perm[Inew] + np.testing.assert_equal(Inew_remap, Iref) + + +class TestCodeSet(unittest.TestCase): + + def test_code_set(self): + """ CodeSet and np.unique should produce the same output """ + d = 8 + n = 1000 # > 256 and using only 0 or 1 so there must be duplicates + codes = np.random.randint(0, 2, (n, d), dtype=np.uint8) + s = faiss.CodeSet(d) + inserted = s.insert(codes) + np.testing.assert_equal( + np.sort(np.unique(codes, axis=0), axis=None), + np.sort(codes[inserted], axis=None)) diff --git a/thirdparty/faiss/tests/test_contrib_with_scipy.py b/thirdparty/faiss/tests/test_contrib_with_scipy.py new file mode 100644 index 000000000..cb81bb623 --- /dev/null +++ b/thirdparty/faiss/tests/test_contrib_with_scipy.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import faiss +import unittest +import numpy as np + +from faiss.contrib import datasets +from faiss.contrib import clustering + +import scipy.sparse + +# this test is not in test_contrib because it depends on scipy + + +class TestClustering(unittest.TestCase): + + def test_python_kmeans(self): + """ Test the python implementation of kmeans """ + ds = datasets.SyntheticDataset(32, 10000, 0, 0) + x = ds.get_train() + + # bad distribution to stress-test split code + xt = x[:10000].copy() + xt[:5000] = x[0] + + km_ref = faiss.Kmeans(ds.d, 100, niter=10) + km_ref.train(xt) + err = faiss.knn(xt, km_ref.centroids, 1)[0].sum() + + data = clustering.DatasetAssign(xt) + centroids = clustering.kmeans(100, data, 10) + err2 = faiss.knn(xt, centroids, 1)[0].sum() + + # 33517.645 and 33031.098 + self.assertLess(err2, err * 1.1) + + def test_sparse_routines(self): + """ the sparse assignment routine """ + ds = datasets.SyntheticDataset(1000, 2000, 0, 200) + xt = ds.get_train().copy() + faiss.normalize_L2(xt) + + mask = np.abs(xt) > 0.045 + # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros + xt[np.logical_not(mask)] = 0 + + centroids = ds.get_queries() + assert len(centroids) == 200 + + xsparse = scipy.sparse.csr_matrix(xt) + + Dref, Iref = faiss.knn(xsparse.todense(), centroids, 1) + D, I = clustering.sparse_assign_to_dense(xsparse, centroids) + + np.testing.assert_array_equal(Iref.ravel(), I) + np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=3) + + D, I = clustering.sparse_assign_to_dense_blocks( + xsparse, centroids, qbs=123, bbs=33, nt=4) + + np.testing.assert_array_equal(Iref.ravel(), I) + np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=3) + + def test_sparse_kmeans(self): + """ demo on how to cluster sparse data into dense clusters """ + + ds = datasets.SyntheticDataset(1000, 1500, 0, 0) + xt = ds.get_train().copy() + faiss.normalize_L2(xt) + + mask = np.abs(xt) > 0.045 + # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros + xt[np.logical_not(mask)] = 0 + + km = faiss.Kmeans(ds.d, 50) + km.train(xt) + ref_err = km.iteration_stats[-1]["obj"] + + xsparse = scipy.sparse.csr_matrix(xt) + + centroids, iteration_stats = clustering.kmeans( + 50, clustering.DatasetAssignSparse(xsparse), return_stats=True) + + new_err = iteration_stats[-1]["obj"] + + self.assertLess(new_err, ref_err * 1.1) diff --git a/thirdparty/faiss/tests/test_cppcontrib_sa_decode.cpp b/thirdparty/faiss/tests/test_cppcontrib_sa_decode.cpp new file mode 100644 index 000000000..cb13e8bf9 --- /dev/null +++ b/thirdparty/faiss/tests/test_cppcontrib_sa_decode.cpp @@ -0,0 +1,1306 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using namespace ::testing; +using ::testing::TestWithParam; +using ::testing::Values; + +std::tuple, std::vector> trainDataset( + const std::vector& input, + const uint64_t n, + const uint64_t d, + const std::string& description) { + // train an index + auto index = std::shared_ptr( + faiss::index_factory((int)d, description.c_str())); + index->train((int)n, input.data()); + + // encode + const size_t codeSize = index->sa_code_size(); + + std::vector encodedData(n * codeSize); + index->sa_encode(n, input.data(), encodedData.data()); + + return std::make_tuple(std::move(index), std::move(encodedData)); +} + +bool testIfIVFPQ( + const faiss::Index* const index, + const float** pqCoarseCentroidsQ, + const float** pqFineCentroidsQ) { + if (pqFineCentroidsQ == nullptr || pqCoarseCentroidsQ == nullptr) { + return false; + } + + const faiss::IndexIVFPQ* const indexQ = + dynamic_cast(index); + if (indexQ == nullptr) { + return false; + } + + const auto coarseIndexQ = + dynamic_cast(indexQ->quantizer); + if (coarseIndexQ == nullptr) { + return false; + } + + *pqFineCentroidsQ = indexQ->pq.centroids.data(); + *pqCoarseCentroidsQ = + reinterpret_cast(coarseIndexQ->codes.data()); + return true; +} + +bool testIfResidualPQ( + const faiss::Index* const index, + const float** pqCoarseCentroidsQ, + const float** pqFineCentroidsQ) { + if (pqFineCentroidsQ == nullptr || pqCoarseCentroidsQ == nullptr) { + return false; + } + + const faiss::Index2Layer* const indexQ = + dynamic_cast(index); + if (indexQ == nullptr) { + return false; + } + + const auto coarseIndexQ = dynamic_cast( + indexQ->q1.quantizer); + if (coarseIndexQ == nullptr) { + return false; + } + + *pqFineCentroidsQ = indexQ->pq.centroids.data(); + *pqCoarseCentroidsQ = coarseIndexQ->pq.centroids.data(); + return true; +} + +template +void verifyIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::shared_ptr& index, + const std::vector& encodedData) { + // + const float* pqFineCentroidsQ = nullptr; + const float* pqCoarseCentroidsQ = nullptr; + + // + testIfIVFPQ(index.get(), &pqCoarseCentroidsQ, &pqFineCentroidsQ); + testIfResidualPQ(index.get(), &pqCoarseCentroidsQ, &pqFineCentroidsQ); + + // + const size_t codeSize = index->sa_code_size(); + + // + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // test general purpose version vs contrib::store + std::vector outputFaiss(d, 0); + std::vector tmpFaiss(d, 0); + std::vector tmpContrib(d, 0); + for (size_t i = 0; i < n; i++) { + // compute using faiss + index->sa_decode(1, encodedData.data() + i * codeSize, tmpFaiss.data()); + + // compute using contrib + T::store( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + tmpContrib.data()); + + // compare + for (size_t j = 0; j < d; j++) + ASSERT_FLOAT_EQ(tmpFaiss[j], tmpContrib[j]); + + // save for the further comparison + const float weight = u(rng); + for (size_t j = 0; j < d; j++) + outputFaiss[j] += weight * tmpFaiss[j]; + } + + // test contrib::accum, 1 sample per iteration + rng.seed(123); + + std::vector outputContrib1s(d, 0); + for (size_t i = 0; i < n; i++) { + const float weight0 = u(rng); + + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data()); + } + + // verify + for (size_t j = 0; j < d; j++) { + 
ASSERT_FLOAT_EQ(outputFaiss[j], outputContrib1s[j]); + } + + // test contrib::accum, 2 samples per iteration. + rng.seed(123); + + std::vector outputContrib2s(d, 0); + std::vector outputContrib2sSame(d, 0); + for (size_t i = 0; i < n; i += 2) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib2s[j] = (j + 1) * (j + 1); + outputContrib2sSame[j] = (j + 1) * (j + 1); + } + + // do a single step, 2 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2s.data()); + + // do a single step, 2 samples per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2sSame.data()); + + // do two steps, 1 sample per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data()); + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data()); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]); + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2sSame[j]); + } + } + + // test contrib::accum, 3 samples per iteration. + rng.seed(123); + + std::vector outputContrib3s(d, 0); + std::vector outputContrib3sSame(d, 0); + const size_t n3 = (n / 3) * 3; + for (size_t i = 0; i < n3; i += 3) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib3s[j] = (j + 1) * (j + 1); + outputContrib3sSame[j] = (j + 1) * (j + 1); + } + + // do a single step, 3 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + const float weight2 = u(rng); + + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3s.data()); + + // do a single step, 3 samples per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3sSame.data()); + + // do three steps, 1 sample per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data()); + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data()); + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib1s.data()); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]); + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3sSame[j]); + } + } +} + +template +void verifyMinMaxIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::shared_ptr& index, + const std::vector& encodedData) { + // + const float* 
pqFineCentroidsQ = nullptr; + const float* pqCoarseCentroidsQ = nullptr; + + // extract an index that is wrapped with IndexRowwiseMinMaxBase + const std::shared_ptr indexMinMax = + std::dynamic_pointer_cast(index); + ASSERT_NE(indexMinMax.get(), nullptr); + + auto subIndex = indexMinMax->index; + + // + testIfIVFPQ(subIndex, &pqCoarseCentroidsQ, &pqFineCentroidsQ); + testIfResidualPQ(subIndex, &pqCoarseCentroidsQ, &pqFineCentroidsQ); + + // + const size_t codeSize = index->sa_code_size(); + + // + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // test general purpose version vs contrib::store + std::vector outputFaiss(d, 0); + std::vector tmpFaiss(d, 0); + std::vector tmpContrib(d, 0); + for (size_t i = 0; i < n; i++) { + // compute using faiss + index->sa_decode(1, encodedData.data() + i * codeSize, tmpFaiss.data()); + + // compute using contrib + T::store( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + tmpContrib.data()); + + // compare + for (size_t j = 0; j < d; j++) + ASSERT_FLOAT_EQ(tmpFaiss[j], tmpContrib[j]); + + // save for the further comparison + const float weight = u(rng); + for (size_t j = 0; j < d; j++) + outputFaiss[j] += weight * tmpFaiss[j]; + } + + // test contrib::accum, 1 sample per iteration. + // This needs a way of handling that is different from just IVFPQ and PQ + // because of the scaling, but rather similar to how 2 samples per iteration + // is processed. + rng.seed(123); + + std::vector outputContrib1s(d, 0); + float outputMinv1s = 0; + for (size_t i = 0; i < n; i++) { + // compute using faiss + index->sa_decode(1, encodedData.data() + i * codeSize, tmpFaiss.data()); + + // populate some initial data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + } + outputMinv1s = 0; + + // generate a weight + const float weight0 = u(rng); + + // + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data(), + outputMinv1s); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + tmpFaiss[j] * weight0 + (j + 1) * (j + 1)); + } + } + + // test contrib::accum, 2 samples per iteration. 
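+    // The fused two-sample overloads and two single-sample calls below must
+    // agree once the accumulated minimum offset is added back.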
+ rng.seed(123); + + std::vector outputContrib2s(d, 0); + std::vector outputContrib2sSame(d, 0); + float outputMinv2s = 0; + float outputMinv2sSame = 0; + for (size_t i = 0; i < n; i += 2) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib2s[j] = (j + 1) * (j + 1); + outputContrib2sSame[j] = (j + 1) * (j + 1); + } + outputMinv1s = 0; + outputMinv2s = 0; + outputMinv2sSame = 0; + + // do a single step, 2 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2s.data(), + outputMinv2s); + + // do a single step, 2 samples per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2sSame.data(), + outputMinv2sSame); + + // do two steps, 1 sample per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data(), + outputMinv1s); + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data(), + outputMinv1s); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib2s[j] + outputMinv2s); + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib2sSame[j] + outputMinv2sSame); + } + } + + // test contrib::accum, 3 samples per iteration. + rng.seed(123); + + std::vector outputContrib3s(d, 0); + float outputMinv3s = 0; + std::vector outputContrib3sSame(d, 0); + float outputMinv3sSame = 0; + const size_t n3 = (n / 3) * 3; + for (size_t i = 0; i < n3; i += 3) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib3s[j] = (j + 1) * (j + 1); + outputContrib3sSame[j] = (j + 1) * (j + 1); + } + outputMinv1s = 0; + outputMinv3s = 0; + outputMinv3sSame = 0; + + // do a single step, 3 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + const float weight2 = u(rng); + + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3s.data(), + outputMinv3s); + + // do a single step, 3 samples per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3sSame.data(), + outputMinv3sSame); + + // do three steps, 1 sample per step + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data(), + outputMinv1s); + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data(), + outputMinv1s); + T::accum( + pqCoarseCentroidsQ, + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib1s.data(), + outputMinv1s); + + // compare 
+ for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib3s[j] + outputMinv3s); + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib3sSame[j] + outputMinv3sSame); + } + } +} + +template +void verifyIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::shared_ptr& index, + const std::vector& encodedData) { + // + const faiss::IndexPQ* const indexQ = + dynamic_cast(index.get()); + const float* const pqFineCentroidsQ = indexQ->pq.centroids.data(); + + // + const size_t codeSize = index->sa_code_size(); + + // + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // test general purpose version vs contrib::store + std::vector outputFaiss(d, 0); + std::vector tmpFaiss(d, 0); + std::vector tmpContrib(d, 0); + for (size_t i = 0; i < n; i++) { + // compute using faiss + index->sa_decode(1, encodedData.data() + i * codeSize, tmpFaiss.data()); + + // compute using contrib + T::store( + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + tmpContrib.data()); + + // compare + for (size_t j = 0; j < d; j++) + ASSERT_FLOAT_EQ(tmpFaiss[j], tmpContrib[j]); + + // save for the further comparison + const float weight = u(rng); + for (size_t j = 0; j < d; j++) + outputFaiss[j] += weight * tmpFaiss[j]; + } + + // test contrib::accum, 1 sample per iteration + rng.seed(123); + + std::vector outputContrib1s(d, 0); + for (size_t i = 0; i < n; i++) { + const float weight0 = u(rng); + + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data()); + } + + // verify + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ(outputFaiss[j], outputContrib1s[j]); + } + + // test contrib::accum, 2 samples per iteration. + rng.seed(123); + + std::vector outputContrib2s(d, 0); + std::vector outputContrib2sSame(d, 0); + for (size_t i = 0; i < n; i += 2) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib2s[j] = (j + 1) * (j + 1); + outputContrib2sSame[j] = (j + 1) * (j + 1); + } + + // do a single step, 2 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2s.data()); + + // do a single step, 2 samples per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2sSame.data()); + + // do two steps, 1 sample per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data()); + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data()); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2s[j]); + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib2sSame[j]); + } + } + + // test contrib::accum, 3 samples per iteration. 
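+    // Both three-sample overloads must match three independent
+    // single-sample accumulations.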
+ rng.seed(123); + + std::vector outputContrib3s(d, 0); + std::vector outputContrib3sSame(d, 0); + const size_t n3 = (n / 3) * 3; + for (size_t i = 0; i < n3; i += 3) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib3s[j] = (j + 1) * (j + 1); + outputContrib3sSame[j] = (j + 1) * (j + 1); + } + + // do a single step, 3 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + const float weight2 = u(rng); + + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3s.data()); + + // do a single step, 3 samples per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3sSame.data()); + + // do three steps, 1 sample per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data()); + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data()); + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib1s.data()); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3s[j]); + ASSERT_FLOAT_EQ(outputContrib1s[j], outputContrib3sSame[j]); + } + } +} + +template +void verifyMinMaxIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::shared_ptr& index, + const std::vector& encodedData) { + // extract an index that is wrapped with IndexRowwiseMinMaxBase + const std::shared_ptr indexMinMax = + std::dynamic_pointer_cast(index); + ASSERT_NE(indexMinMax.get(), nullptr); + + auto subIndex = indexMinMax->index; + + // + const faiss::IndexPQ* const indexQ = + dynamic_cast(subIndex); + const float* const pqFineCentroidsQ = indexQ->pq.centroids.data(); + + // + const size_t codeSize = index->sa_code_size(); + + // + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + // test general purpose version vs contrib::store + std::vector outputFaiss(d, 0); + std::vector tmpFaiss(d, 0); + std::vector tmpContrib(d, 0); + for (size_t i = 0; i < n; i++) { + // compute using faiss + index->sa_decode(1, encodedData.data() + i * codeSize, tmpFaiss.data()); + + // compute using contrib + T::store( + pqFineCentroidsQ, + encodedData.data() + i * codeSize, + tmpContrib.data()); + + // compare + for (size_t j = 0; j < d; j++) + ASSERT_FLOAT_EQ(tmpFaiss[j], tmpContrib[j]); + + // save for the further comparison + const float weight = u(rng); + for (size_t j = 0; j < d; j++) + outputFaiss[j] += weight * tmpFaiss[j]; + } + + // test contrib::accum, 1 sample per iteration. + // This needs a way of handling that is different from just IVFPQ and PQ + // because of the scaling, but rather similar to how 2 samples per iteration + // is processed. 
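+    // The per-row minimum is accumulated separately and has to be added back
+    // before comparing against the reference sa_decode result.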
+ rng.seed(123); + + std::vector outputContrib1s(d, 0); + float outputMinv1s = 0; + for (size_t i = 0; i < n; i++) { + // compute using faiss + index->sa_decode(1, encodedData.data() + i * codeSize, tmpFaiss.data()); + + // populate some initial data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + } + outputMinv1s = 0; + + // generate a weight + const float weight0 = u(rng); + + // + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data(), + outputMinv1s); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + tmpFaiss[j] * weight0 + (j + 1) * (j + 1)); + } + } + + // test contrib::accum, 2 samples per iteration. + rng.seed(123); + + std::vector outputContrib2s(d, 0); + float outputMinv2s = 0; + std::vector outputContrib2sSame(d, 0); + float outputMinv2sSame = 0; + for (size_t i = 0; i < n; i += 2) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib2s[j] = (j + 1) * (j + 1); + outputContrib2sSame[j] = (j + 1) * (j + 1); + } + outputMinv1s = 0; + outputMinv2s = 0; + outputMinv2sSame = 0; + + // do a single step, 2 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2s.data(), + outputMinv2s); + + // do a single step, 2 samples per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib2sSame.data(), + outputMinv2sSame); + + // do two steps, 1 sample per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data(), + outputMinv1s); + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data(), + outputMinv1s); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib2s[j] + outputMinv2s); + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib2sSame[j] + outputMinv2sSame); + } + } + + // test contrib::accum, 3 samples per iteration. 
+ rng.seed(123); + + std::vector outputContrib3s(d, 0); + float outputMinv3s = 0; + std::vector outputContrib3sSame(d, 0); + float outputMinv3sSame = 0; + const size_t n3 = (n / 3) * 3; + for (size_t i = 0; i < n3; i += 3) { + // populate outputContribs with some existing data + for (size_t j = 0; j < d; j++) { + outputContrib1s[j] = (j + 1) * (j + 1); + outputContrib3s[j] = (j + 1) * (j + 1); + outputContrib3sSame[j] = (j + 1) * (j + 1); + } + outputMinv1s = 0; + outputMinv3s = 0; + outputMinv3sSame = 0; + + // do a single step, 3 samples per step + const float weight0 = u(rng); + const float weight1 = u(rng); + const float weight2 = u(rng); + + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3s.data(), + outputMinv3s); + + // do a single step, 3 samples per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + encodedData.data() + (i + 1) * codeSize, + weight1, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib3sSame.data(), + outputMinv3sSame); + + // do three steps, 1 sample per step + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 0) * codeSize, + weight0, + outputContrib1s.data(), + outputMinv1s); + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 1) * codeSize, + weight1, + outputContrib1s.data(), + outputMinv1s); + T::accum( + pqFineCentroidsQ, + encodedData.data() + (i + 2) * codeSize, + weight2, + outputContrib1s.data(), + outputMinv1s); + + // compare + for (size_t j = 0; j < d; j++) { + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib3s[j] + outputMinv3s); + ASSERT_FLOAT_EQ( + outputContrib1s[j] + outputMinv1s, + outputContrib3sSame[j] + outputMinv3sSame); + } + } +} + +std::vector generate(const size_t n, const size_t d) { + std::vector data(n * d); + + std::minstd_rand rng(345); + std::uniform_real_distribution ux(0, 1); + + // + for (size_t k = 0; k < n; k++) { + for (size_t j = 0; j < d; j++) { + data[k * d + j] = ux(rng); + } + } + + return data; +} + +template +void testIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyIndex2LevelDecoder(n, d, index, encodedData); +} + +template +void testMinMaxIndex2LevelDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyMinMaxIndex2LevelDecoder(n, d, index, encodedData); +} + +template +void testIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyIndexPQDecoder(n, d, index, encodedData); +} + +template +void testMinMaxIndexPQDecoder( + const uint64_t n, + const uint64_t d, + const std::string& description) { + auto data = generate(n, d); + std::shared_ptr index; + std::vector encodedData; + std::tie(index, encodedData) = trainDataset(data, n, d, description); + + verifyMinMaxIndexPQDecoder(n, d, index, encodedData); +} + +constexpr size_t 
NSAMPLES = 256; + +// +TEST(testCppcontribSaDecode, D256_IVF256_PQ16) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16>; + testIndex2LevelDecoder(NSAMPLES, 256, "IVF256,PQ16np"); +} + +TEST(testCppcontribSaDecode, D256_IVF256_PQ8) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 32>; + testIndex2LevelDecoder(NSAMPLES, 256, "IVF256,PQ8np"); +} + +// +TEST(testCppcontribSaDecode, D192_IVF256_PQ24) { + using T = faiss::cppcontrib::Index2LevelDecoder<192, 192, 8>; + testIndex2LevelDecoder(NSAMPLES, 192, "IVF256,PQ24np"); +} + +// +TEST(testCppcontribSaDecode, D192_IVF256_PQ16) { + using T = faiss::cppcontrib::Index2LevelDecoder<192, 192, 12>; + testIndex2LevelDecoder(NSAMPLES, 192, "IVF256,PQ16np"); +} + +// +TEST(testCppcontribSaDecode, D192_IVF256_PQ12) { + using T = faiss::cppcontrib::Index2LevelDecoder<192, 192, 16>; + testIndex2LevelDecoder(NSAMPLES, 192, "IVF256,PQ12np"); +} + +// +TEST(testCppcontribSaDecode, D160_IVF256_PQ40) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 160, 4>; + testIndex2LevelDecoder(NSAMPLES, 160, "IVF256,PQ40np"); +} + +// +TEST(testCppcontribSaDecode, D160_IVF256_PQ20) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 160, 8>; + testIndex2LevelDecoder(NSAMPLES, 160, "IVF256,PQ20np"); +} + +// +TEST(testCppcontribSaDecode, D160_IVF256_PQ10) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 160, 16>; + testIndex2LevelDecoder(NSAMPLES, 160, "IVF256,PQ10np"); +} + +// +TEST(testCppcontribSaDecode, D160_IVF256_PQ8) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 160, 20>; + testIndex2LevelDecoder(NSAMPLES, 160, "IVF256,PQ8np"); +} + +// +TEST(testCppcontribSaDecode, D128_IVF256_PQ8) { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 16>; + testIndex2LevelDecoder(NSAMPLES, 128, "IVF256,PQ8np"); +} + +TEST(testCppcontribSaDecode, D128_IVF256_PQ4) { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 128, 32>; + testIndex2LevelDecoder(NSAMPLES, 128, "IVF256,PQ4np"); +} + +// +TEST(testCppcontribSaDecode, D64_IVF256_PQ16) { + using T = faiss::cppcontrib::Index2LevelDecoder<64, 64, 8>; + testIndex2LevelDecoder(NSAMPLES, 64, "IVF256,PQ8np"); +} + +TEST(testCppcontribSaDecode, D64_IVF256_PQ8) { + using T = faiss::cppcontrib::Index2LevelDecoder<64, 64, 16>; + testIndex2LevelDecoder(NSAMPLES, 64, "IVF256,PQ4np"); +} + +#if defined(__AVX2__) +TEST(testCppcontribSaDecode, D40_IVF256_PQ20) { + using T = faiss::cppcontrib::Index2LevelDecoder<40, 40, 2>; + testIndex2LevelDecoder(NSAMPLES, 40, "IVF256,PQ20np"); +} +#endif + +// +TEST(testCppcontribSaDecode, D256_Residual4x8_PQ16) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 64, 16>; + testIndex2LevelDecoder(NSAMPLES, 256, "Residual4x8,PQ16"); +} + +TEST(testCppcontribSaDecode, D256_Residual4x8_PQ8) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 64, 32>; + testIndex2LevelDecoder(NSAMPLES, 256, "Residual4x8,PQ8"); +} + +// +TEST(testCppcontribSaDecode, D160_Residual4x8_PQ10) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 40, 16>; + testIndex2LevelDecoder(NSAMPLES, 160, "Residual4x8,PQ10"); +} + +// +TEST(testCppcontribSaDecode, D160_Residual2x8_PQ10) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 80, 16>; + testIndex2LevelDecoder(NSAMPLES, 160, "Residual2x8,PQ10"); +} + +// +TEST(testCppcontribSaDecode, D160_Residual1x8_PQ10) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 160, 16>; + testIndex2LevelDecoder(NSAMPLES, 160, "Residual1x8,PQ10"); +} + +// +TEST(testCppcontribSaDecode, D128_Residual4x8_PQ8) { + 
using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 16>; + testIndex2LevelDecoder(NSAMPLES, 128, "Residual4x8,PQ8"); +} + +TEST(testCppcontribSaDecode, D128_Residual4x8_PQ4) { + using T = faiss::cppcontrib::Index2LevelDecoder<128, 32, 32>; + testIndex2LevelDecoder(NSAMPLES, 128, "Residual4x8,PQ4"); +} + +// +TEST(testCppcontribSaDecode, D64_Residual4x8_PQ8) { + using T = faiss::cppcontrib::Index2LevelDecoder<64, 16, 8>; + testIndex2LevelDecoder(NSAMPLES, 64, "Residual4x8,PQ8"); +} + +TEST(testCppcontribSaDecode, D64_Residual4x8_PQ4) { + using T = faiss::cppcontrib::Index2LevelDecoder<64, 16, 16>; + testIndex2LevelDecoder(NSAMPLES, 64, "Residual4x8,PQ4"); +} + +// +TEST(testCppcontribSaDecode, D256_IVF1024_PQ16) { + // It is acceptable to use COARSE_BITS=16 in this case, + // because there's only one coarse quantizer element. + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 16>; + testIndex2LevelDecoder(NSAMPLES * 4, 256, "IVF1024,PQ16np"); +} + +TEST(testCppcontribSaDecode, D64_Residual1x9_PQ8) { + // It is acceptable to use COARSE_BITS=16 in this case, + // because there's only one coarse quantizer element. + // It won't work for "Residual2x9,PQ8". + using T = faiss::cppcontrib::Index2LevelDecoder<64, 64, 8, 16>; + testIndex2LevelDecoder(NSAMPLES * 2, 64, "Residual1x9,PQ8"); +} + +// +TEST(testCppcontribSaDecode, D256_PQ16) { + using T = faiss::cppcontrib::IndexPQDecoder<256, 16>; + testIndexPQDecoder(NSAMPLES, 256, "PQ16np"); +} + +// +TEST(testCppcontribSaDecode, D160_PQ20) { + using T = faiss::cppcontrib::IndexPQDecoder<160, 8>; + testIndexPQDecoder(NSAMPLES, 160, "PQ20np"); +} + +#if defined(__AVX2__) +TEST(testCppcontribSaDecode, D40_PQ20) { + using T = faiss::cppcontrib::IndexPQDecoder<40, 2>; + testIndexPQDecoder(NSAMPLES, 40, "PQ20np"); +} +#endif + +// test IndexRowwiseMinMaxFP16 +TEST(testCppcontribSaDecode, D256_MINMAXFP16_IVF256_PQ16) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder(NSAMPLES, 256, "MinMaxFP16,IVF256,PQ16np"); +} + +TEST(testCppcontribSaDecode, D256_MINMAXFP16_PQ16) { + using SubT = faiss::cppcontrib::IndexPQDecoder<256, 16>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndexPQDecoder(NSAMPLES, 256, "MinMaxFP16,PQ16np"); +} + +// test IndexRowwiseMinMax +TEST(testCppcontribSaDecode, D256_MINMAX_IVF256_PQ16) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16>; + using T = faiss::cppcontrib::IndexMinMaxDecoder; + testMinMaxIndex2LevelDecoder(NSAMPLES, 256, "MinMax,IVF256,PQ16np"); +} + +TEST(testCppcontribSaDecode, D256_MINMAX_PQ16) { + using SubT = faiss::cppcontrib::IndexPQDecoder<256, 16>; + using T = faiss::cppcontrib::IndexMinMaxDecoder; + testMinMaxIndexPQDecoder(NSAMPLES, 256, "MinMax,PQ16np"); +} + +// implemented for AVX2 and ARM so far +#if defined(__AVX2__) || defined(__ARM_NEON) +TEST(testCppcontribSaDecode, D256_PQ16x10) { + using T = faiss::cppcontrib::IndexPQDecoder<256, 16, 10>; + testIndexPQDecoder(NSAMPLES * 4, 256, "PQ16x10np"); +} + +TEST(testCppcontribSaDecode, D256_PQ16x12) { + using T = faiss::cppcontrib::IndexPQDecoder<256, 16, 12>; + testIndexPQDecoder(NSAMPLES * 16, 256, "PQ16x12np"); +} + +TEST(testCppcontribSaDecode, D160_PQ20x10) { + using T = faiss::cppcontrib::IndexPQDecoder<160, 8, 10>; + testIndexPQDecoder(NSAMPLES * 4, 160, "PQ20x10np"); +} + +TEST(testCppcontribSaDecode, D160_PQ20x12) { + using T = faiss::cppcontrib::IndexPQDecoder<160, 8, 12>; + 
testIndexPQDecoder(NSAMPLES * 16, 160, "PQ20x12np"); +} + +TEST(testCppcontribSaDecode, D256_IVF256_PQ16x10) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 10>; + testIndex2LevelDecoder(NSAMPLES * 4, 256, "IVF256,PQ16x10np"); +} + +TEST(testCppcontribSaDecode, D256_IVF256_PQ16x12) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 12>; + testIndex2LevelDecoder(NSAMPLES * 16, 256, "IVF256,PQ16x12np"); +} + +TEST(testCppcontribSaDecode, D256_MINMAXFP16_IVF256_PQ16x10) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 8, 10>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + NSAMPLES * 4, 256, "MinMaxFP16,IVF256,PQ16x10np"); +} + +TEST(testCppcontribSaDecode, D256_MINMAXFP16_IVF1024_PQ16x10) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 10, 10>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + NSAMPLES * 4, 256, "MinMaxFP16,IVF1024,PQ16x10np"); +} + +TEST(testCppcontribSaDecode, D256_MINMAXFP16_IVF1024_PQ16x10_ALTERNATIVE) { + using SubT = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 16, 10>; + using T = faiss::cppcontrib::IndexMinMaxFP16Decoder; + testMinMaxIndex2LevelDecoder( + NSAMPLES * 4, 256, "MinMaxFP16,IVF1024,PQ16x10np"); +} + +TEST(testCppcontribSaDecode, D160_Residual4x8_PQ8x10) { + using T = faiss::cppcontrib::Index2LevelDecoder<160, 40, 20, 8, 10>; + testIndex2LevelDecoder(NSAMPLES * 4, 160, "Residual4x8,PQ8x10"); +} + +TEST(testCppcontribSaDecode, D256_Residual1x9_PQ16x10) { + // It is acceptable to use COARSE_BITS=16 in this case, + // because there's only one coarse quantizer element. + // It won't work for "Residual2x9,PQ16x10". + using T = faiss::cppcontrib::Index2LevelDecoder<256, 256, 16, 16, 10>; + testIndex2LevelDecoder(NSAMPLES * 4, 256, "Residual1x9,PQ16x10"); +} + +TEST(testCppcontribSaDecode, D256_Residual4x10_PQ16x10) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 64, 16, 10, 10>; + testIndex2LevelDecoder(NSAMPLES * 4, 256, "Residual4x10,PQ16x10"); +} + +TEST(testCppcontribSaDecode, D256_Residual4x12_PQ16x12) { + using T = faiss::cppcontrib::Index2LevelDecoder<256, 64, 16, 12, 12>; + testIndex2LevelDecoder(NSAMPLES * 16, 256, "Residual4x12,PQ16x12"); +} + +#endif diff --git a/thirdparty/faiss/tests/test_cppcontrib_uintreader.cpp b/thirdparty/faiss/tests/test_cppcontrib_uintreader.cpp new file mode 100644 index 000000000..b6ecb261b --- /dev/null +++ b/thirdparty/faiss/tests/test_cppcontrib_uintreader.cpp @@ -0,0 +1,114 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// This test was designed to be run using valgrind or ASAN to test the +// correctness of memory accesses. 
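+// It cross-checks cppcontrib's UintReader against BitstringReader for a range
+// of element counts and bit widths, so any out-of-bounds read surfaces there.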
+ +#include + +#include +#include +#include +#include + +#include + +#include + +template +struct TestLoop { + static void test( + const uint8_t* const container, + faiss::BitstringReader& br) { + // validate + const intptr_t uintreader_data = faiss::cppcontrib::detail:: + UintReaderRaw::get(container); + const intptr_t bitstringreader_data = br.read(CODE_BITS); + + ASSERT_EQ(uintreader_data, bitstringreader_data) + << "Mismatch between BitstringReader (" << bitstringreader_data + << ") and UintReader (" << uintreader_data + << ") for N_ELEMENTS=" << N_ELEMENTS + << ", CODE_BITS=" << CODE_BITS << ", CPOS=" << CPOS; + + // + TestLoop::test(container, br); + } +}; + +template +struct TestLoop { + static void test( + const uint8_t* const container, + faiss::BitstringReader& br) {} +}; + +template +void TestUintReader() { + constexpr intptr_t CODE_BYTES = (CODE_BITS * N_ELEMENTS + 7) / 8; + + std::default_random_engine rng; + std::uniform_int_distribution u(0, 1 << CODE_BITS); + + // do several attempts + for (size_t attempt = 0; attempt < 10; attempt++) { + // allocate a buffer. This way, not std::vector + std::unique_ptr container(new uint8_t[CODE_BYTES]); + // make it empty + for (size_t i = 0; i < CODE_BYTES; i++) { + container.get()[i] = 0; + } + + // populate it + faiss::BitstringWriter bw(container.get(), CODE_BYTES); + for (size_t i = 0; i < N_ELEMENTS; i++) { + bw.write(u(rng), CODE_BITS); + } + + // read it back and verify against bitreader + faiss::BitstringReader br(container.get(), CODE_BYTES); + + TestLoop::test(container.get(), br); + } +} + +template +void TestUintReaderBits() { + TestUintReader<1, CODE_BITS>(); + TestUintReader<2, CODE_BITS>(); + TestUintReader<3, CODE_BITS>(); + TestUintReader<4, CODE_BITS>(); + TestUintReader<5, CODE_BITS>(); + TestUintReader<6, CODE_BITS>(); + TestUintReader<7, CODE_BITS>(); + TestUintReader<8, CODE_BITS>(); + TestUintReader<9, CODE_BITS>(); + TestUintReader<10, CODE_BITS>(); + TestUintReader<11, CODE_BITS>(); + TestUintReader<12, CODE_BITS>(); + TestUintReader<13, CODE_BITS>(); + TestUintReader<14, CODE_BITS>(); + TestUintReader<15, CODE_BITS>(); + TestUintReader<16, CODE_BITS>(); + TestUintReader<17, CODE_BITS>(); +} + +TEST(testCppcontribUintreader, Test8bit) { + TestUintReaderBits<8>(); +} + +TEST(testCppcontribUintreader, Test10bit) { + TestUintReaderBits<10>(); +} + +TEST(testCppcontribUintreader, Test12bit) { + TestUintReaderBits<12>(); +} + +TEST(testCppcontribUintreader, Test16bit) { + TestUintReaderBits<16>(); +} diff --git a/thirdparty/faiss/tests/test_dealloc_invlists.cpp b/thirdparty/faiss/tests/test_dealloc_invlists.cpp index dfbfa1ba9..fb132087e 100644 --- a/thirdparty/faiss/tests/test_dealloc_invlists.cpp +++ b/thirdparty/faiss/tests/test_dealloc_invlists.cpp @@ -24,8 +24,6 @@ using namespace faiss; namespace { -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; @@ -110,7 +108,7 @@ struct EncapsulateInvertedLists : InvertedLists { il->get_single_code(list_no, offset), code_size); } - size_t add_entries(size_t, size_t, const idx_t*, const uint8_t*, const float* ) override { + size_t add_entries(size_t, size_t, const idx_t*, const uint8_t*, const float*) override { assert(!"not implemented"); return 0; } diff --git a/thirdparty/faiss/tests/test_distances_if.cpp b/thirdparty/faiss/tests/test_distances_if.cpp new file mode 100644 index 000000000..f5bc55060 --- /dev/null +++ b/thirdparty/faiss/tests/test_distances_if.cpp @@ -0,0 +1,141 @@ +#include + +#include +#include +#include +#include +#include + 
+#include + +#include + +/* compute the inner product between x and a subset y of ny vectors, + whose indices are given by idy. */ +void fvec_inner_products_by_idx_ref( + float* __restrict ip, + const float* x, + const float* y, + const int64_t* __restrict ids, /* for y vecs */ + size_t d, + size_t nx, + size_t ny) { +#pragma omp parallel for + for (int64_t j = 0; j < nx; j++) { + const int64_t* __restrict idsj = ids + j * ny; + const float* xj = x + j * d; + float* __restrict ipj = ip + j * ny; + + // baseline version + for (size_t i = 0; i < ny; i++) { + if (idsj[i] < 0) + continue; + ipj[i] = faiss::fvec_inner_product(xj, y + d * idsj[i], d); + } + } +} + +/* compute the inner product between x and a subset y of ny vectors, + whose indices are given by idy. */ +void fvec_L2sqr_by_idx_ref( + float* __restrict dis, + const float* x, + const float* y, + const int64_t* __restrict ids, /* ids of y vecs */ + size_t d, + size_t nx, + size_t ny) { +#pragma omp parallel for + for (int64_t j = 0; j < nx; j++) { + const int64_t* __restrict idsj = ids + j * ny; + const float* xj = x + j * d; + float* __restrict disj = dis + j * ny; + + // baseline version + for (size_t i = 0; i < ny; i++) { + if (idsj[i] < 0) + continue; + disj[i] = faiss::fvec_L2sqr(xj, y + d * idsj[i], d); + } + } +} + +TEST(TestDistancesIf, TestNyByIdx) { + const size_t dim = 16; + const size_t nx = 32; + + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 1); + + std::vector x(nx * dim); + for (size_t i = 0; i < x.size(); i++) { + x[i] = u(rng); + } + + std::vector y(64 * dim); + for (size_t i = 0; i < y.size(); i++) { + y[i] = u(rng); + } + + for (size_t attempt = 0; attempt < 5; attempt++) { + for (const size_t ny : {1, 2, 3, 4, 5, 6, 7, 8, 16, 63, 64}) { + std::vector dis_IP(nx * ny, 1e20); + std::vector dis_IP_ref(nx * ny, 1e20); + std::vector dis_L2(nx * ny, 1e20); + std::vector dis_L2_ref(nx * ny, 1e20); + + std::uniform_int_distribution ids_u(0, ny - 1); + std::vector ids(nx * ny); + for (size_t i = 0; i < nx * ny; i++) { + if (u(rng) < 0.5) { + ids[i] = -1; + } + else { + ids[i] = ids_u(rng); + } + } + + // test IP + fvec_inner_products_by_idx_ref( + dis_IP_ref.data(), + x.data(), + y.data(), + ids.data(), + dim, + nx, + ny); + + faiss::fvec_inner_products_by_idx( + dis_IP.data(), + x.data(), + y.data(), + ids.data(), + dim, + nx, + ny); + + ASSERT_EQ(dis_IP, dis_IP_ref) << "ny = " << ny; + + // test L2 + fvec_L2sqr_by_idx_ref( + dis_L2_ref.data(), + x.data(), + y.data(), + ids.data(), + dim, + nx, + ny); + + faiss::fvec_L2sqr_by_idx( + dis_L2.data(), + x.data(), + y.data(), + ids.data(), + dim, + nx, + ny); + + ASSERT_EQ(dis_L2, dis_L2_ref) << "ny = " << ny; + } + } +} \ No newline at end of file diff --git a/thirdparty/faiss/tests/test_distances_simd.cpp b/thirdparty/faiss/tests/test_distances_simd.cpp new file mode 100644 index 000000000..762d242bc --- /dev/null +++ b/thirdparty/faiss/tests/test_distances_simd.cpp @@ -0,0 +1,110 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +#include +#include + +// reference implementations +void fvec_inner_products_ny_ref( + float* ip, + const float* x, + const float* y, + size_t d, + size_t ny) { + for (size_t i = 0; i < ny; i++) { + ip[i] = faiss::fvec_inner_product(x, y, d); + y += d; + } +} + +void fvec_L2sqr_ny_ref( + float* dis, + const float* x, + const float* y, + size_t d, + size_t ny) { + for (size_t i = 0; i < ny; i++) { + dis[i] = faiss::fvec_L2sqr(x, y, d); + y += d; + } +} + +// test templated versions of fvec_L2sqr_ny +TEST(TestFvecL2sqrNy, D2) { + // we're using int values in order to get 100% accurate + // results with floats. + std::default_random_engine rng(123); + std::uniform_int_distribution u(0, 32); + + for (const auto dim : {2, 4, 8, 12}) { + std::vector x(dim, 0); + for (size_t i = 0; i < x.size(); i++) { + x[i] = u(rng); + } + + for (const auto nrows : {1, 2, 5, 10, 15, 20, 25}) { + std::vector y(nrows * dim); + for (size_t i = 0; i < y.size(); i++) { + y[i] = u(rng); + } + + std::vector distances(nrows, 0); + faiss::fvec_L2sqr_ny( + distances.data(), x.data(), y.data(), dim, nrows); + + std::vector distances_ref(nrows, 0); + fvec_L2sqr_ny_ref( + distances_ref.data(), x.data(), y.data(), dim, nrows); + + ASSERT_EQ(distances, distances_ref) + << "Mismatching results for dim = " << dim + << ", nrows = " << nrows; + } + } +} + +// fvec_inner_products_ny +TEST(TestFvecInnerProductsNy, D2) { + // we're using int values in order to get 100% accurate + // results with floats. + std::default_random_engine rng(123); + std::uniform_int_distribution u(0, 32); + + for (const auto dim : {2, 4, 8, 12}) { + std::vector x(dim, 0); + for (size_t i = 0; i < x.size(); i++) { + x[i] = u(rng); + } + + for (const auto nrows : {1, 2, 5, 10, 15, 20, 25}) { + std::vector y(nrows * dim); + for (size_t i = 0; i < y.size(); i++) { + y[i] = u(rng); + } + + std::vector distances(nrows, 0); + faiss::fvec_inner_products_ny( + distances.data(), x.data(), y.data(), dim, nrows); + + std::vector distances_ref(nrows, 0); + fvec_inner_products_ny_ref( + distances_ref.data(), x.data(), y.data(), dim, nrows); + + ASSERT_EQ(distances, distances_ref) + << "Mismatching results for dim = " << dim + << ", nrows = " << nrows; + } + } +} diff --git a/thirdparty/faiss/tests/test_extra_distances.py b/thirdparty/faiss/tests/test_extra_distances.py index eec955e14..a474dd6ba 100644 --- a/thirdparty/faiss/tests/test_extra_distances.py +++ b/thirdparty/faiss/tests/test_extra_distances.py @@ -82,6 +82,18 @@ def xx_test_jensenshannon(self): self.run_simple_dis_test(scipy.spatial.distance.jensenshannon, faiss.METRIC_JensenShannon) + def test_jaccard(self): + xq, yb = self.make_example() + ref_dis = np.array([ + [ + (np.min([x, y], axis=0).sum() / np.max([x, y], axis=0).sum()) + for y in yb + ] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard) + self.assertTrue(np.allclose(ref_dis, new_dis)) + class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ diff --git a/thirdparty/faiss/tests/test_factory.py b/thirdparty/faiss/tests/test_factory.py index cd51b6e44..79662af75 100644 --- a/thirdparty/faiss/tests/test_factory.py +++ b/thirdparty/faiss/tests/test_factory.py @@ -6,10 +6,11 @@ import numpy as np import unittest +import gc import faiss from faiss.contrib import factory_tools - +from faiss.contrib import datasets class TestFactory(unittest.TestCase): @@ -81,6 +82,9 @@ def 
test_factory_HNSW_newstyle(self): index = faiss.index_factory(12, "HNSW32,PQ4np") indexpq = faiss.downcast_index(index.storage) assert not indexpq.do_polysemous_training + index = faiss.index_factory(12, "HNSW32,PQ4x12np") + indexpq = faiss.downcast_index(index.storage) + self.assertEqual(indexpq.pq.nbits, 12) def test_factory_NSG(self): index = faiss.index_factory(12, "NSG64") @@ -96,6 +100,12 @@ def test_factory_NSG(self): assert isinstance(index, faiss.IndexNSGFlat) assert index.nsg.R == 64 + index = faiss.index_factory(12, "NSG64,PQ3x10") + assert isinstance(index, faiss.IndexNSGPQ) + assert index.nsg.R == 64 + indexpq = faiss.downcast_index(index.storage) + self.assertEqual(indexpq.pq.nbits, 10) + index = faiss.index_factory(12, "IVF65536_NSG64,Flat") index_nsg = faiss.downcast_index(index.quantizer) assert isinstance(index, faiss.IndexIVFFlat) @@ -160,6 +170,7 @@ def test_residual(self): index = faiss.index_factory(50, "IVF1000,PQ25x4fsr") self.assertTrue(index.by_residual) + class TestCodeSize(unittest.TestCase): def test_1(self): @@ -225,6 +236,16 @@ def test_idmap(self): index = faiss.index_factory(123, "Flat,IDMap") self.assertEqual(index.__class__, faiss.IndexIDMap) + def test_idmap2_suffix(self): + index = faiss.index_factory(123, "Flat,IDMap2") + index = faiss.downcast_index(index) + self.assertEqual(index.__class__, faiss.IndexIDMap2) + + def test_idmap2_prefix(self): + index = faiss.index_factory(123, "IDMap2,Flat") + index = faiss.downcast_index(index) + self.assertEqual(index.__class__, faiss.IndexIDMap2) + def test_ivf_hnsw(self): index = faiss.index_factory(123, "IVF100_HNSW,Flat") quantizer = faiss.downcast_index(index.quantizer) @@ -265,3 +286,37 @@ class TestSpectralHash(unittest.TestCase): def test_sh(self): index = faiss.index_factory(123, "IVF256,ITQ64,SH1.2") self.assertEqual(index.__class__, faiss.IndexIVFSpectralHash) + + +class TestQuantizerClone(unittest.TestCase): + + def test_clone(self): + ds = datasets.SyntheticDataset(32, 200, 10, 0) + + quant = faiss.ScalarQuantizer(32, faiss.ScalarQuantizer.QT_4bit) + quant.train(ds.get_train()) + + codes = quant.compute_codes(ds.get_database()) + + quant2 = faiss.clone_Quantizer(quant) + self.assertTrue(quant2.this.own()) + + # make sure typemap works + self.assertEqual(quant2.__class__, faiss.ScalarQuantizer) + + codes2 = quant2.compute_codes(ds.get_database()) + np.testing.assert_array_equal(codes, codes2) + + +class TestIVFSpectralHashOwnership(unittest.TestCase): + + def test_constructor(self): + index = faiss.IndexIVFSpectralHash(faiss.IndexFlat(10), 10, 20, 10, 1) + gc.collect() + index.quantizer.ntotal # this should not crash + + def test_replace_vt(self): + index = faiss.IndexIVFSpectralHash(faiss.IndexFlat(10), 10, 20, 10, 1) + index.replace_vt(faiss.ITQTransform(10, 10)) + gc.collect() + index.vt.d_out # this should not crash diff --git a/thirdparty/faiss/tests/test_fast_scan.py b/thirdparty/faiss/tests/test_fast_scan.py index 8b18b62b8..b061ee3af 100644 --- a/thirdparty/faiss/tests/test_fast_scan.py +++ b/thirdparty/faiss/tests/test_fast_scan.py @@ -6,22 +6,16 @@ import unittest import time +import os +import tempfile import numpy as np import faiss from faiss.contrib import datasets -import platform - - -class TestCompileOptions(unittest.TestCase): - - def test_compile_options(self): - options = faiss.get_compile_options() - options = options.split(' ') - for option in options: - assert option in ['AVX2', 'NEON', 'GENERIC', 'OPTIMIZE'] +# the tests tend to timeout in stress modes + dev otherwise 
+faiss.omp_set_num_threads(4) class TestSearch(unittest.TestCase): @@ -79,7 +73,7 @@ def test_PQ4_speed(self): t1 = time.time() pqfs_t = t1 - t0 print('PQ16x4fs search time:', pqfs_t) - self.assertLess(pqfs_t * 5, pq_t) + self.assertLess(pqfs_t * 4, pq_t) class TestRounding(unittest.TestCase): @@ -110,7 +104,7 @@ def do_test_rounding(self, implem=4, metric=faiss.METRIC_L2): recalls[rank] = (Iref[:, :1] == I4[:, :rank]).sum() / nq min_r1 = 0.98 if metric == faiss.METRIC_INNER_PRODUCT else 0.99 - self.assertGreater(recalls[1], min_r1) + self.assertGreaterEqual(recalls[1], min_r1) self.assertGreater(recalls[10], 0.995) # check accuracy of distances # err3 = ((D3 - D2) ** 2).sum() @@ -136,12 +130,41 @@ def test_implem_14(self): def test_implem_14_ip(self): self.do_test_rounding(12, faiss.METRIC_INNER_PRODUCT) + +class TestReconstruct(unittest.TestCase): + + def test_pqfastscan(self): + ds = datasets.SyntheticDataset(20, 1000, 1000, 0) + + index = faiss.index_factory(20, 'PQ5x4') + index.train(ds.get_train()) + index.add(ds.get_database()) + recons = index.reconstruct_n(0, index.ntotal) + + index2 = faiss.IndexPQFastScan(index) + recons2 = index2.reconstruct_n(0, index.ntotal) + + np.testing.assert_array_equal(recons, recons2) + + def test_aqfastscan(self): + ds = datasets.SyntheticDataset(20, 1000, 1000, 0) + + index = faiss.index_factory(20, 'RQ5x4_Nrq2x4') + index.train(ds.get_train()) + index.add(ds.get_database()) + recons = index.reconstruct_n(0, index.ntotal) + + index2 = faiss.IndexAdditiveQuantizerFastScan(index) + recons2 = index2.reconstruct_n(0, index.ntotal) + + np.testing.assert_array_equal(recons, recons2) + + ######################################################### # Kernel unit test ######################################################### - def reference_accu(codes, LUT): nq, nsp, is_16 = LUT.shape nb, nsp_2 = codes.shape @@ -211,9 +234,6 @@ def test_22(self): self.do_loop5_kernel(2, 2) - - - ########################################################## # Tests for various IndexPQFastScan implementations ########################################################## @@ -255,6 +275,17 @@ def get_index(self, d, metric): index2.implem = 4 Dref, Iref = index2.search(ds.get_queries(), 10) + # check CodePacker + codes_ref = faiss.vector_to_array(index.codes) + codes_ref = codes_ref.reshape(-1, index.code_size) + index2codes = faiss.vector_to_array(index2.codes) + code_packer = index2.get_CodePacker() + index2codes = index2codes.reshape(-1, code_packer.block_size) + + for i in range(0, len(codes_ref), 13): + code_new = code_packer.unpack_1(index2codes, i) + np.testing.assert_array_equal(codes_ref[i], code_new) + self.cache[(d, metric)] = (ds, index, Dref, Iref) return self.cache[(d, metric)] @@ -271,14 +302,12 @@ def do_with_params(self, d, params, metric=faiss.METRIC_L2): verify_with_draws(self, Dref, Iref, Dnew, Inew) - def build_fast_scan_index(self, index, params): index2 = faiss.IndexPQFastScan(index) index2.implem = 5 return index2 - class TestImplem12(TestImplems): def build_fast_scan_index(self, index, qbs): @@ -307,8 +336,6 @@ def test_qbs6_odd_dim(self): self.do_with_params(30, 0x33) - - class TestImplem13(TestImplems): def build_fast_scan_index(self, index, qbs): @@ -376,6 +403,7 @@ def test_1_32(self): def test_2_64(self): self.do_with_params(32, (2, 64)) + class TestAdd(unittest.TestCase): def do_test_add(self, d, bbs): @@ -425,7 +453,7 @@ def test_constructor(self): recall_at_1 = (Iref[:, 0] == Inew[:, 0]).sum() / nq - self.assertGreater(recall_at_1, 0.99) + 
self.assertGreaterEqual(recall_at_1, 0.99) data = faiss.serialize_index(index2) index3 = faiss.deserialize_index(data) @@ -435,3 +463,254 @@ def test_constructor(self): D3, I3 = index3.search(ds.get_queries(), 10) np.testing.assert_array_equal(D3, Dnew) np.testing.assert_array_equal(I3, Inew) + + +class TestAQFastScan(unittest.TestCase): + + def subtest_accuracy(self, aq, st, implem, metric_type='L2'): + """ + Compare IndexAdditiveQuantizerFastScan with IndexAQ (qint8) + """ + d = 16 + ds = datasets.SyntheticDataset(d, 1000, 1000, 500, metric_type) + gt = ds.get_groundtruth(k=1) + + if metric_type == 'L2': + metric = faiss.METRIC_L2 + postfix1 = '_Nqint8' + postfix2 = f'_N{st}2x4' + else: + metric = faiss.METRIC_INNER_PRODUCT + postfix1 = postfix2 = '' + + index = faiss.index_factory(d, f'{aq}3x4{postfix1}', metric) + index.train(ds.get_train()) + index.add(ds.get_database()) + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.index_factory(d, f'{aq}3x4fs_32{postfix2}', metric) + indexfs.train(ds.get_train()) + indexfs.add(ds.get_database()) + indexfs.implem = implem + Da, Ia = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall = (Ia == gt).sum() / nq + + print(aq, st, implem, metric_type, recall_ref, recall) + assert abs(recall_ref - recall) < 0.05 + + def xx_test_accuracy(self): + for metric in 'L2', 'IP': + for implem in 0, 12, 13, 14, 15: + self.subtest_accuracy('RQ', 'rq', implem, metric) + self.subtest_accuracy('LSQ', 'lsq', implem, metric) + + def subtest_from_idxaq(self, implem, metric): + if metric == 'L2': + metric_type = faiss.METRIC_L2 + st = '_Nrq2x4' + else: + metric_type = faiss.METRIC_INNER_PRODUCT + st = '' + + d = 16 + ds = datasets.SyntheticDataset(d, 1000, 2000, 1000, metric=metric) + gt = ds.get_groundtruth(k=1) + index = faiss.index_factory(d, 'RQ8x4' + st, metric_type) + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 16 + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.IndexAdditiveQuantizerFastScan(index) + indexfs.implem = implem + D1, I1 = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall1 = (I1 == gt).sum() / nq + print(recall_ref, recall1) + assert abs(recall_ref - recall1) < 0.05 + + def xx_test_from_idxaq(self): + for implem in 2, 3, 4: + self.subtest_from_idxaq(implem, 'L2') + self.subtest_from_idxaq(implem, 'IP') + + def subtest_factory(self, aq, M, bbs, st): + """ + Format: {AQ}{M}x4fs_{bbs}_N{st} + + AQ (str): `LSQ` or `RQ` + M (int): number of subquantizers + bbs (int): build block size + st (str): search type, `lsq2x4` or `rq2x4` + """ + AQ = faiss.AdditiveQuantizer + d = 16 + + if bbs > 0: + index = faiss.index_factory(d, f'{aq}{M}x4fs_{bbs}_N{st}2x4') + else: + index = faiss.index_factory(d, f'{aq}{M}x4fs_N{st}2x4') + bbs = 32 + + assert index.bbs == bbs + aq = faiss.downcast_AdditiveQuantizer(index.aq) + assert aq.M == M + + if aq == 'LSQ': + assert isinstance(aq, faiss.LocalSearchQuantizer) + if aq == 'RQ': + assert isinstance(aq, faiss.ResidualQuantizer) + + if st == 'lsq': + assert aq.search_type == AQ.ST_norm_lsq2x4 + if st == 'rq': + assert aq.search_type == AQ.ST_norm_rq2x4 + + def test_factory(self): + self.subtest_factory('LSQ', 16, 64, 'lsq') + self.subtest_factory('LSQ', 16, 64, 'rq') + self.subtest_factory('RQ', 16, 64, 'rq') + self.subtest_factory('RQ', 16, 64, 'lsq') + self.subtest_factory('LSQ', 64, 0, 'lsq') + + def subtest_io(self, factory_str): + d = 8 + ds = 
datasets.SyntheticDataset(d, 1000, 500, 100) + + index = faiss.index_factory(d, factory_str) + index.train(ds.get_train()) + index.add(ds.get_database()) + D1, I1 = index.search(ds.get_queries(), 1) + + fd, fname = tempfile.mkstemp() + os.close(fd) + try: + faiss.write_index(index, fname) + index2 = faiss.read_index(fname) + D2, I2 = index2.search(ds.get_queries(), 1) + np.testing.assert_array_equal(I1, I2) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_io(self): + self.subtest_io('LSQ4x4fs_Nlsq2x4') + self.subtest_io('LSQ4x4fs_Nrq2x4') + self.subtest_io('RQ4x4fs_Nrq2x4') + self.subtest_io('RQ4x4fs_Nlsq2x4') + + +# programatically generate tests to get finer test granularity. + +def add_TestAQFastScan_subset_accuracy(aq, st, implem, metric): + setattr( + TestAQFastScan, + f"test_accuracy_{metric}_{aq}_implem{implem}", + lambda self: self.subtest_accuracy(aq, st, implem, metric) + ) + + +for metric in 'L2', 'IP': + for implem in 0, 12, 13, 14, 15: + add_TestAQFastScan_subset_accuracy('LSQ', 'lsq', implem, metric) + add_TestAQFastScan_subset_accuracy('RQ', 'rq', implem, metric) + + +def add_TestAQFastScan_subtest_from_idxaq(implem, metric): + setattr( + TestAQFastScan, + f"test_from_idxaq_{metric}_implem{implem}", + lambda self: self.subtest_from_idxaq(implem, metric) + ) + + +for implem in 2, 3, 4: + add_TestAQFastScan_subtest_from_idxaq(implem, 'L2') + add_TestAQFastScan_subtest_from_idxaq(implem, 'IP') + + +class TestPAQFastScan(unittest.TestCase): + + def subtest_accuracy(self, paq): + """ + Compare IndexPAQFastScan with IndexPAQ (qint8) + """ + d = 16 + ds = datasets.SyntheticDataset(d, 1000, 1000, 500) + gt = ds.get_groundtruth(k=1) + + index = faiss.index_factory(d, f'{paq}2x3x4_Nqint8') + index.train(ds.get_train()) + index.add(ds.get_database()) + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.index_factory(d, f'{paq}2x3x4fs_Nlsq2x4') + indexfs.train(ds.get_train()) + indexfs.add(ds.get_database()) + Da, Ia = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall = (Ia == gt).sum() / nq + + assert abs(recall_ref - recall) < 0.05 + + def test_accuracy_PLSQ(self): + self.subtest_accuracy("PLSQ") + + def test_accuracy_PRQ(self): + self.subtest_accuracy("PRQ") + + def subtest_factory(self, paq): + index = faiss.index_factory(16, f'{paq}2x3x4fs_Nlsq2x4') + q = faiss.downcast_Quantizer(index.aq) + self.assertEqual(q.nsplits, 2) + self.assertEqual(q.subquantizer(0).M, 3) + + def test_factory(self): + self.subtest_factory('PRQ') + self.subtest_factory('PLSQ') + + def subtest_io(self, factory_str): + d = 8 + ds = datasets.SyntheticDataset(d, 1000, 500, 100) + + index = faiss.index_factory(d, factory_str) + index.train(ds.get_train()) + index.add(ds.get_database()) + D1, I1 = index.search(ds.get_queries(), 1) + + fd, fname = tempfile.mkstemp() + os.close(fd) + try: + faiss.write_index(index, fname) + index2 = faiss.read_index(fname) + D2, I2 = index2.search(ds.get_queries(), 1) + np.testing.assert_array_equal(I1, I2) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_io(self): + self.subtest_io('PLSQ2x3x4fs_Nlsq2x4') + self.subtest_io('PRQ2x3x4fs_Nrq2x4') + + +class TestBlockDecode(unittest.TestCase): + + def test_issue_2739(self): + ds = datasets.SyntheticDataset(960, 200, 1, 0) + M = 32 + index = faiss.index_factory(ds.d, f"PQ{M}x4fs") + index.train(ds.get_train()) + index.add(ds.get_database()) + + np.testing.assert_array_equal( + 
index.pq.decode(index.pq.compute_codes(ds.get_database()))[0, ::100], + index.reconstruct(0)[::100] + ) diff --git a/thirdparty/faiss/tests/test_fast_scan_ivf.py b/thirdparty/faiss/tests/test_fast_scan_ivf.py index f903f9b6e..5a57a39ca 100644 --- a/thirdparty/faiss/tests/test_fast_scan_ivf.py +++ b/thirdparty/faiss/tests/test_fast_scan_ivf.py @@ -4,8 +4,9 @@ # LICENSE file in the root directory of this source tree. +import os import unittest -import platform +import tempfile import numpy as np import faiss @@ -13,6 +14,8 @@ from faiss.contrib import datasets from faiss.contrib.inspect_tools import get_invlist +# the tests tend to timeout in stress modes + dev otherwise +faiss.omp_set_num_threads(4) class TestLUTQuantization(unittest.TestCase): @@ -295,8 +298,8 @@ class TestIVFImplem12(unittest.TestCase): IMPLEM = 12 - def do_test(self, by_residual, metric=faiss.METRIC_L2, d=32): - ds = datasets.SyntheticDataset(d, 2000, 5000, 200) + def do_test(self, by_residual, metric=faiss.METRIC_L2, d=32, nq=200): + ds = datasets.SyntheticDataset(d, 2000, 5000, nq) index = faiss.index_factory(d, f"IVF32,PQ{d//2}x4np", metric) # force coarse quantizer @@ -347,6 +350,26 @@ def test_no_residual_odd_dim(self): def test_by_residual_odd_dim(self): self.do_test(True, d=30) + # testin single query + def test_no_residual_single_query(self): + self.do_test(False, nq=1) + + def test_by_residual_single_query(self): + self.do_test(True, nq=1) + + def test_no_residual_ip_single_query(self): + self.do_test(False, metric=faiss.METRIC_INNER_PRODUCT, nq=1) + + def test_by_residual_ip_single_query(self): + self.do_test(True, metric=faiss.METRIC_INNER_PRODUCT, nq=1) + + def test_no_residual_odd_dim_single_query(self): + self.do_test(False, d=30, nq=1) + + def test_by_residual_odd_dim_single_query(self): + self.do_test(True, d=30, nq=1) + + class TestIVFImplem10(TestIVFImplem12): IMPLEM = 10 @@ -355,10 +378,19 @@ class TestIVFImplem10(TestIVFImplem12): class TestIVFImplem11(TestIVFImplem12): IMPLEM = 11 + class TestIVFImplem13(TestIVFImplem12): IMPLEM = 13 +class TestIVFImplem14(TestIVFImplem12): + IMPLEM = 14 + + +class TestIVFImplem15(TestIVFImplem12): + IMPLEM = 15 + + class TestAdd(unittest.TestCase): def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32): @@ -436,15 +468,15 @@ def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32): m3 = three_metrics(Dref, Iref, Dnew, Inew) # print((by_residual, metric, d), ":", m3) ref_m3_tab = { - (True, 1, 32) : (0.995, 1.0, 9.91), - (True, 0, 32) : (0.99, 1.0, 9.91), - (True, 1, 30) : (0.99, 1.0, 9.885), - (False, 1, 32) : (0.99, 1.0, 9.875), - (False, 0, 32) : (0.99, 1.0, 9.92), - (False, 1, 30) : (1.0, 1.0, 9.895) + (True, 1, 32): (0.995, 1.0, 9.91), + (True, 0, 32): (0.99, 1.0, 9.91), + (True, 1, 30): (0.989, 1.0, 9.885), + (False, 1, 32): (0.99, 1.0, 9.875), + (False, 0, 32): (0.99, 1.0, 9.92), + (False, 1, 30): (1.0, 1.0, 9.895) } ref_m3 = ref_m3_tab[(by_residual, metric, d)] - self.assertGreater(m3[0], ref_m3[0] * 0.99) + self.assertGreaterEqual(m3[0], ref_m3[0] * 0.99) self.assertGreater(m3[1], ref_m3[1] * 0.99) self.assertGreater(m3[2], ref_m3[2] * 0.99) @@ -484,3 +516,299 @@ def test_issue_2019(self): ) des = faiss.rand((1000, 32)) index.train(des) + + +class TestIVFAQFastScan(unittest.TestCase): + + def subtest_accuracy(self, aq, st, by_residual, implem, metric_type='L2'): + """ + Compare IndexIVFAdditiveQuantizerFastScan with + IndexIVFAdditiveQuantizer + """ + nlist, d = 16, 8 + ds = datasets.SyntheticDataset(d, 1000, 1000, 500, 
metric_type) + gt = ds.get_groundtruth(k=1) + + if metric_type == 'L2': + metric = faiss.METRIC_L2 + postfix1 = '_Nqint8' + postfix2 = f'_N{st}2x4' + else: + metric = faiss.METRIC_INNER_PRODUCT + postfix1 = postfix2 = '' + + index = faiss.index_factory(d, f'IVF{nlist},{aq}3x4{postfix1}', metric) + index.by_residual = by_residual + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 16 + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.index_factory( + d, f'IVF{nlist},{aq}3x4fs_32{postfix2}', metric) + indexfs.by_residual = by_residual + indexfs.train(ds.get_train()) + indexfs.add(ds.get_database()) + indexfs.nprobe = 16 + indexfs.implem = implem + D1, I1 = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall1 = (I1 == gt).sum() / nq + + print(aq, st, by_residual, implem, metric_type, recall_ref, recall1) + assert abs(recall_ref - recall1) < 0.051 + + def xx_test_accuracy(self): + # generated programatically below + for metric in 'L2', 'IP': + for byr in True, False: + for implem in 0, 10, 11, 12, 13, 14, 15: + self.subtest_accuracy('RQ', 'rq', byr, implem, metric) + self.subtest_accuracy('LSQ', 'lsq', byr, implem, metric) + + def subtest_rescale_accuracy(self, aq, st, by_residual, implem): + """ + we set norm_scale to 2 and compare it with IndexIVFAQ + """ + nlist, d = 16, 8 + ds = datasets.SyntheticDataset(d, 1000, 1000, 500) + gt = ds.get_groundtruth(k=1) + + metric = faiss.METRIC_L2 + postfix1 = '_Nqint8' + postfix2 = f'_N{st}2x4' + + index = faiss.index_factory( + d, f'IVF{nlist},{aq}3x4{postfix1}', metric) + index.by_residual = by_residual + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 16 + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.index_factory( + d, f'IVF{nlist},{aq}3x4fs_32{postfix2}', metric) + indexfs.by_residual = by_residual + indexfs.norm_scale = 2 + indexfs.train(ds.get_train()) + indexfs.add(ds.get_database()) + indexfs.nprobe = 16 + indexfs.implem = implem + D1, I1 = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall1 = (I1 == gt).sum() / nq + + print(aq, st, by_residual, implem, recall_ref, recall1) + assert abs(recall_ref - recall1) < 0.05 + + def xx_test_rescale_accuracy(self): + for byr in True, False: + for implem in 0, 10, 11, 12, 13, 14, 15: + self.subtest_accuracy('RQ', 'rq', byr, implem, 'L2') + self.subtest_accuracy('LSQ', 'lsq', byr, implem, 'L2') + + def subtest_from_ivfaq(self, implem): + d = 8 + ds = datasets.SyntheticDataset(d, 1000, 2000, 1000, metric='IP') + gt = ds.get_groundtruth(k=1) + index = faiss.index_factory(d, 'IVF16,RQ8x4', faiss.METRIC_INNER_PRODUCT) + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 16 + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.IndexIVFAdditiveQuantizerFastScan(index) + D1, I1 = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall1 = (I1 == gt).sum() / nq + print(recall_ref, recall1) + assert abs(recall_ref - recall1) < 0.02 + + def test_from_ivfaq(self): + for implem in 0, 1, 2: + self.subtest_from_ivfaq(implem) + + def subtest_factory(self, aq, M, bbs, st, r='r'): + """ + Format: IVF{nlist},{AQ}{M}x4fs{r}_{bbs}_N{st} + + nlist (int): number of inverted lists + AQ (str): `LSQ` or `RQ` + M (int): number of sub-quantizers + bbs (int): build block size + st (str): search type, `lsq2x4` or `rq2x4` + r (str): `r` or ``, 
by_residual or not + """ + AQ = faiss.AdditiveQuantizer + nlist, d = 128, 16 + + if bbs > 0: + index = faiss.index_factory( + d, f'IVF{nlist},{aq}{M}x4fs{r}_{bbs}_N{st}2x4') + else: + index = faiss.index_factory( + d, f'IVF{nlist},{aq}{M}x4fs{r}_N{st}2x4') + bbs = 32 + + assert index.nlist == nlist + assert index.bbs == bbs + q = faiss.downcast_Quantizer(index.aq) + assert q.M == M + + if aq == 'LSQ': + assert isinstance(q, faiss.LocalSearchQuantizer) + if aq == 'RQ': + assert isinstance(q, faiss.ResidualQuantizer) + + if st == 'lsq': + assert q.search_type == AQ.ST_norm_lsq2x4 + if st == 'rq': + assert q.search_type == AQ.ST_norm_rq2x4 + + assert index.by_residual == (r == 'r') + + def test_factory(self): + self.subtest_factory('LSQ', 16, 64, 'lsq') + self.subtest_factory('LSQ', 16, 64, 'rq') + self.subtest_factory('RQ', 16, 64, 'rq') + self.subtest_factory('RQ', 16, 64, 'lsq') + self.subtest_factory('LSQ', 64, 0, 'lsq') + + self.subtest_factory('LSQ', 64, 0, 'lsq', r='') + + def subtest_io(self, factory_str): + d = 8 + ds = datasets.SyntheticDataset(d, 1000, 2000, 1000) + + index = faiss.index_factory(d, factory_str) + index.train(ds.get_train()) + index.add(ds.get_database()) + D1, I1 = index.search(ds.get_queries(), 1) + + fd, fname = tempfile.mkstemp() + os.close(fd) + try: + faiss.write_index(index, fname) + index2 = faiss.read_index(fname) + D2, I2 = index2.search(ds.get_queries(), 1) + np.testing.assert_array_equal(I1, I2) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_io(self): + self.subtest_io('IVF16,LSQ4x4fs_Nlsq2x4') + self.subtest_io('IVF16,LSQ4x4fs_Nrq2x4') + self.subtest_io('IVF16,RQ4x4fs_Nrq2x4') + self.subtest_io('IVF16,RQ4x4fs_Nlsq2x4') + + +# add more tests programatically + +def add_TestIVFAQFastScan_subtest_accuracy( + aq, st, by_residual, implem, metric='L2'): + setattr( + TestIVFAQFastScan, + f"test_accuracy_{metric}_{aq}_implem{implem}_residual{by_residual}", + lambda self: + self.subtest_accuracy(aq, st, by_residual, implem, metric) + ) + + +def add_TestIVFAQFastScan_subtest_rescale_accuracy(aq, st, by_residual, implem): + setattr( + TestIVFAQFastScan, + f"test_rescale_accuracy_{aq}_implem{implem}_residual{by_residual}", + lambda self: + self.subtest_rescale_accuracy(aq, st, by_residual, implem) + ) + +for byr in True, False: + for implem in 0, 10, 11, 12, 13, 14, 15: + for mt in 'L2', 'IP': + add_TestIVFAQFastScan_subtest_accuracy('RQ', 'rq', byr, implem, mt) + add_TestIVFAQFastScan_subtest_accuracy('LSQ', 'lsq', byr, implem, mt) + + add_TestIVFAQFastScan_subtest_rescale_accuracy('LSQ', 'lsq', byr, implem) + add_TestIVFAQFastScan_subtest_rescale_accuracy('RQ', 'rq', byr, implem) + + +class TestIVFPAQFastScan(unittest.TestCase): + + def subtest_accuracy(self, paq): + """ + Compare IndexIVFAdditiveQuantizerFastScan with + IndexIVFAdditiveQuantizer + """ + nlist, d = 16, 8 + ds = datasets.SyntheticDataset(d, 1000, 1000, 500) + gt = ds.get_groundtruth(k=1) + + index = faiss.index_factory(d, f'IVF{nlist},{paq}2x3x4_Nqint8') + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 4 + Dref, Iref = index.search(ds.get_queries(), 1) + + indexfs = faiss.index_factory(d, f'IVF{nlist},{paq}2x3x4fsr_Nlsq2x4') + indexfs.train(ds.get_train()) + indexfs.add(ds.get_database()) + indexfs.nprobe = 4 + D1, I1 = indexfs.search(ds.get_queries(), 1) + + nq = Iref.shape[0] + recall_ref = (Iref == gt).sum() / nq + recall1 = (I1 == gt).sum() / nq + + print(paq, recall_ref, recall1) + assert abs(recall_ref - recall1) < 0.05 + + def 
test_accuracy_PLSQ(self): + self.subtest_accuracy("PLSQ") + + def test_accuracy_PRQ(self): + self.subtest_accuracy("PRQ") + + def subtest_factory(self, paq): + nlist, d = 128, 16 + index = faiss.index_factory(d, f'IVF{nlist},{paq}2x3x4fsr_Nlsq2x4') + q = faiss.downcast_Quantizer(index.aq) + + self.assertEqual(index.nlist, nlist) + self.assertEqual(q.nsplits, 2) + self.assertEqual(q.subquantizer(0).M, 3) + self.assertTrue(index.by_residual) + + def test_factory(self): + self.subtest_factory('PLSQ') + self.subtest_factory('PRQ') + + def subtest_io(self, factory_str): + d = 8 + ds = datasets.SyntheticDataset(d, 1000, 2000, 1000) + + index = faiss.index_factory(d, factory_str) + index.train(ds.get_train()) + index.add(ds.get_database()) + D1, I1 = index.search(ds.get_queries(), 1) + + fd, fname = tempfile.mkstemp() + os.close(fd) + try: + faiss.write_index(index, fname) + index2 = faiss.read_index(fname) + D2, I2 = index2.search(ds.get_queries(), 1) + np.testing.assert_array_equal(I1, I2) + finally: + if os.path.exists(fname): + os.unlink(fname) + + def test_io(self): + self.subtest_io('IVF16,PLSQ2x3x4fsr_Nlsq2x4') + self.subtest_io('IVF16,PRQ2x3x4fs_Nrq2x4') diff --git a/thirdparty/faiss/tests/test_heap.cpp b/thirdparty/faiss/tests/test_heap.cpp new file mode 100644 index 000000000..9481003db --- /dev/null +++ b/thirdparty/faiss/tests/test_heap.cpp @@ -0,0 +1,53 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +#include +#include +#include +#include + +using namespace faiss; + +TEST(Heap, addn_with_ids) { + size_t n = 1000; + size_t k = 1; + std::vector heap_labels(n, -1); + std::vector heap_distances(n, 0); + float_minheap_array_t heaps = { + n, k, heap_labels.data(), heap_distances.data()}; + heaps.heapify(); + std::vector labels(n, 1); + std::vector distances(n, 0.0f); + std::vector subset(n); + std::iota(subset.begin(), subset.end(), 0); + heaps.addn_with_ids(1, distances.data(), labels.data(), 1); + heaps.reorder(); + EXPECT_TRUE( + std::all_of(heap_labels.begin(), heap_labels.end(), [](int64_t i) { + return i == 1; + })); +} + +TEST(Heap, addn_query_subset_with_ids) { + size_t n = 20000000; // more than 2^24 + size_t k = 1; + std::vector heap_labels(n, -1); + std::vector heap_distances(n, 0); + float_minheap_array_t heaps = { + n, k, heap_labels.data(), heap_distances.data()}; + heaps.heapify(); + std::vector labels(n, 1); + std::vector distances(n, 0.0f); + std::vector subset(n); + std::iota(subset.begin(), subset.end(), 0); + heaps.addn_query_subset_with_ids( + n, subset.data(), 1, distances.data(), labels.data(), 1); + heaps.reorder(); + EXPECT_TRUE( + std::all_of(heap_labels.begin(), heap_labels.end(), [](int64_t i) { + return i == 1; + })); +} diff --git a/thirdparty/faiss/tests/test_hnsw.cpp b/thirdparty/faiss/tests/test_hnsw.cpp new file mode 100644 index 000000000..9d90cf25e --- /dev/null +++ b/thirdparty/faiss/tests/test_hnsw.cpp @@ -0,0 +1,192 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +int reference_pop_min(faiss::HNSW::MinimaxHeap& heap, float* vmin_out) { + assert(heap.k > 0); + // returns min. 
This is an O(n) operation + int i = heap.k - 1; + while (i >= 0) { + if (heap.ids[i] != -1) + break; + i--; + } + if (i == -1) + return -1; + int imin = i; + float vmin = heap.dis[i]; + i--; + while (i >= 0) { + if (heap.ids[i] != -1 && heap.dis[i] < vmin) { + vmin = heap.dis[i]; + imin = i; + } + i--; + } + if (vmin_out) + *vmin_out = vmin; + int ret = heap.ids[imin]; + heap.ids[imin] = -1; + --heap.nvalid; + + return ret; +} + +void test_popmin(int heap_size, int amount_to_put) { + // create a heap + faiss::HNSW::MinimaxHeap mm_heap(heap_size); + + using storage_idx_t = faiss::HNSW::storage_idx_t; + + std::default_random_engine rng(123 + heap_size * amount_to_put); + std::uniform_int_distribution u(0, 65536); + std::uniform_real_distribution uf(0, 1); + + // generate random unique indices + std::unordered_set indices; + while (indices.size() < amount_to_put) { + const storage_idx_t index = u(rng); + indices.insert(index); + } + + // put ones into the heap + for (const auto index : indices) { + float distance = uf(rng); + if (distance >= 0.7f) { + // add infinity values from time to time + distance = std::numeric_limits::infinity(); + } + mm_heap.push(index, distance); + } + + // clone the heap + faiss::HNSW::MinimaxHeap cloned_mm_heap = mm_heap; + + // takes ones out one by one + while (mm_heap.size() > 0) { + // compare heaps + ASSERT_EQ(mm_heap.n, cloned_mm_heap.n); + ASSERT_EQ(mm_heap.k, cloned_mm_heap.k); + ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid); + ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids); + ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis); + + // use the reference pop_min for the cloned heap + float cloned_vmin_dis = std::numeric_limits::quiet_NaN(); + storage_idx_t cloned_vmin_idx = + reference_pop_min(cloned_mm_heap, &cloned_vmin_dis); + + float vmin_dis = std::numeric_limits::quiet_NaN(); + storage_idx_t vmin_idx = mm_heap.pop_min(&vmin_dis); + + // compare returns + ASSERT_EQ(vmin_dis, cloned_vmin_dis); + ASSERT_EQ(vmin_idx, cloned_vmin_idx); + } + + // compare heaps again + ASSERT_EQ(mm_heap.n, cloned_mm_heap.n); + ASSERT_EQ(mm_heap.k, cloned_mm_heap.k); + ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid); + ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids); + ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis); +} + +void test_popmin_identical_distances( + int heap_size, + int amount_to_put, + const float distance) { + // create a heap + faiss::HNSW::MinimaxHeap mm_heap(heap_size); + + using storage_idx_t = faiss::HNSW::storage_idx_t; + + std::default_random_engine rng(123 + heap_size * amount_to_put); + std::uniform_int_distribution u(0, 65536); + + // generate random unique indices + std::unordered_set indices; + while (indices.size() < amount_to_put) { + const storage_idx_t index = u(rng); + indices.insert(index); + } + + // put ones into the heap + for (const auto index : indices) { + mm_heap.push(index, distance); + } + + // clone the heap + faiss::HNSW::MinimaxHeap cloned_mm_heap = mm_heap; + + // takes ones out one by one + while (mm_heap.size() > 0) { + // compare heaps + ASSERT_EQ(mm_heap.n, cloned_mm_heap.n); + ASSERT_EQ(mm_heap.k, cloned_mm_heap.k); + ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid); + ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids); + ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis); + + // use the reference pop_min for the cloned heap + float cloned_vmin_dis = std::numeric_limits::quiet_NaN(); + storage_idx_t cloned_vmin_idx = + reference_pop_min(cloned_mm_heap, &cloned_vmin_dis); + + float vmin_dis = std::numeric_limits::quiet_NaN(); + storage_idx_t vmin_idx = 
mm_heap.pop_min(&vmin_dis); + + // compare returns + ASSERT_EQ(vmin_dis, cloned_vmin_dis); + ASSERT_EQ(vmin_idx, cloned_vmin_idx); + } + + // compare heaps again + ASSERT_EQ(mm_heap.n, cloned_mm_heap.n); + ASSERT_EQ(mm_heap.k, cloned_mm_heap.k); + ASSERT_EQ(mm_heap.nvalid, cloned_mm_heap.nvalid); + ASSERT_EQ(mm_heap.ids, cloned_mm_heap.ids); + ASSERT_EQ(mm_heap.dis, cloned_mm_heap.dis); +} + +TEST(HNSW, Test_popmin) { + std::vector sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32, 64, 128}; + for (const size_t size : sizes) { + for (size_t amount = size; amount > 0; amount /= 2) { + test_popmin(size, amount); + } + } +} + +TEST(HNSW, Test_popmin_identical_distances) { + std::vector sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32}; + for (const size_t size : sizes) { + for (size_t amount = size; amount > 0; amount /= 2) { + test_popmin_identical_distances(size, amount, 1.0f); + } + } +} + +TEST(HNSW, Test_popmin_infinite_distances) { + std::vector sizes = {1, 2, 3, 4, 5, 7, 9, 11, 16, 27, 32}; + for (const size_t size : sizes) { + for (size_t amount = size; amount > 0; amount /= 2) { + test_popmin_identical_distances( + size, amount, std::numeric_limits::infinity()); + } + } +} diff --git a/thirdparty/faiss/tests/test_index.py b/thirdparty/faiss/tests/test_index.py index 5398a965f..0e828e08c 100644 --- a/thirdparty/faiss/tests/test_index.py +++ b/thirdparty/faiss/tests/test_index.py @@ -46,7 +46,9 @@ def do_test(self, nq, metric_type=faiss.METRIC_L2, k=10): Iref = all_dis.argsort(axis=1)[:, ::-1][:, :k] Dref = all_dis[np.arange(nq)[:, None], Iref] - self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0001) + + # not too many elements are off. + self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0002) # np.testing.assert_equal(Iref, I1) np.testing.assert_almost_equal(Dref, D1, decimal=5) @@ -108,7 +110,39 @@ def test_with_blas_reservoir_ip(self): self.do_test(200, faiss.METRIC_INNER_PRODUCT, k=150) +class TestIndexFlatL2(unittest.TestCase): + def test_indexflat_l2_sync_norms_1(self): + d = 32 + nb = 10000 + nt = 0 + nq = 16 + k = 10 + + (xt, xb, xq) = get_dataset_2(d, nt, nb, nq) + + # instantiate IndexHNSWFlat + index = faiss.IndexHNSWFlat(d, 32) + index.hnsw.efConstruction = 40 + + index.add(xb) + D1, I1 = index.search(xq, k) + index_l2 = faiss.downcast_index(index.storage) + index_l2.sync_l2norms() + D2, I2 = index.search(xq, k) + + index_l2.clear_l2norms() + D3, I3 = index.search(xq, k) + + # not too many elements are off. + self.assertLessEqual((I2 != I1).sum(), 1) + # np.testing.assert_equal(Iref, I1) + np.testing.assert_almost_equal(D2, D1, decimal=5) + + # not too many elements are off. 
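+        # (clear_l2norms() drops the cached norms and falls back to the plain
+        # IndexFlat code path, so the third search is expected to reproduce
+        # the first one exactly)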
+ self.assertLessEqual((I3 != I1).sum(), 0) + # np.testing.assert_equal(Iref, I1) + np.testing.assert_equal(D3, D1) class EvalIVFPQAccuracy(unittest.TestCase): @@ -635,16 +669,14 @@ def subtest_io_and_clone(self, index, Dnsg, Insg): os.unlink(tmpfile) Dnsg2, Insg2 = index2.search(self.xq, 1) - - self.assertTrue(np.all(Dnsg2 == Dnsg)) - self.assertTrue(np.all(Insg2 == Insg)) + np.testing.assert_array_equal(Dnsg2, Dnsg) + np.testing.assert_array_equal(Insg2, Insg) # also test clone index3 = faiss.clone_index(index) Dnsg3, Insg3 = index3.search(self.xq, 1) - - self.assertTrue(np.all(Dnsg3 == Dnsg)) - self.assertTrue(np.all(Insg3 == Insg)) + np.testing.assert_array_equal(Dnsg3, Dnsg) + np.testing.assert_array_equal(Insg3, Insg) def subtest_connectivity(self, index, nb): vt = faiss.VisitedTable(nb) @@ -772,7 +804,127 @@ def test_order(self): indices = np.argsort(D, axis=1) gt = np.arange(0, k)[np.newaxis, :] # [1, k] gt = np.repeat(gt, nq, axis=0) # [nq, k] - assert np.array_equal(indices, gt) + np.testing.assert_array_equal(indices, gt) + + def test_nsg_pq(self): + """Test IndexNSGPQ""" + d = self.xq.shape[1] + R, pq_M = 32, 4 + index = faiss.index_factory(d, f"NSG{R}_PQ{pq_M}np") + assert isinstance(index, faiss.IndexNSGPQ) + idxpq = faiss.downcast_index(index.storage) + assert index.nsg.R == R and idxpq.pq.M == pq_M + + flat_index = faiss.IndexFlat(d) + flat_index.add(self.xb) + Dref, Iref = flat_index.search(self.xq, k=1) + + index.GK = 32 + index.train(self.xb) + index.add(self.xb) + D, I = index.search(self.xq, k=1) + + # test accuracy + recalls = (Iref == I).sum() + print("IndexNSGPQ", recalls) + self.assertGreaterEqual(recalls, 190) # 193 + + # test I/O + self.subtest_io_and_clone(index, D, I) + + def test_nsg_sq(self): + """Test IndexNSGSQ""" + d = self.xq.shape[1] + R = 32 + index = faiss.index_factory(d, f"NSG{R}_SQ8") + assert isinstance(index, faiss.IndexNSGSQ) + idxsq = faiss.downcast_index(index.storage) + assert index.nsg.R == R + assert idxsq.sq.qtype == faiss.ScalarQuantizer.QT_8bit + + flat_index = faiss.IndexFlat(d) + flat_index.add(self.xb) + Dref, Iref = flat_index.search(self.xq, k=1) + + index.train(self.xb) + index.add(self.xb) + D, I = index.search(self.xq, k=1) + + # test accuracy + recalls = (Iref == I).sum() + print("IndexNSGSQ", recalls) + self.assertGreaterEqual(recalls, 405) # 411 + + # test I/O + self.subtest_io_and_clone(index, D, I) + + +class TestNNDescent(unittest.TestCase): + + def __init__(self, *args, **kwargs): + unittest.TestCase.__init__(self, *args, **kwargs) + d = 32 + nt = 0 + nb = 1500 + nq = 500 + self.GK = 32 + + _, self.xb, self.xq = get_dataset_2(d, nt, nb, nq) + + def test_nndescentflat(self): + d = self.xq.shape[1] + index = faiss.IndexNNDescentFlat(d, 32) + index.nndescent.search_L = 8 + + flat_index = faiss.IndexFlat(d) + flat_index.add(self.xb) + Dref, Iref = flat_index.search(self.xq, k=1) + + index.train(self.xb) + index.add(self.xb) + D, I = index.search(self.xq, k=1) + + # test accuracy + recalls = (Iref == I).sum() + print("IndexNNDescentFlat", recalls) + self.assertGreaterEqual(recalls, 450) # 462 + + # do some IO tests + fd, tmpfile = tempfile.mkstemp() + os.close(fd) + try: + faiss.write_index(index, tmpfile) + index2 = faiss.read_index(tmpfile) + finally: + if os.path.exists(tmpfile): + os.unlink(tmpfile) + + D2, I2 = index2.search(self.xq, 1) + np.testing.assert_array_equal(D2, D) + np.testing.assert_array_equal(I2, I) + + # also test clone + index3 = faiss.clone_index(index) + D3, I3 = index3.search(self.xq, 1) + 
np.testing.assert_array_equal(D3, D) + np.testing.assert_array_equal(I3, I) + + def test_order(self): + """make sure that output results are sorted""" + d = self.xq.shape[1] + index = faiss.IndexNNDescentFlat(d, 32) + + index.train(self.xb) + index.add(self.xb) + + k = 10 + nq = self.xq.shape[0] + D, _ = index.search(self.xq, k) + + indices = np.argsort(D, axis=1) + gt = np.arange(0, k)[np.newaxis, :] # [1, k] + gt = np.repeat(gt, nq, axis=0) # [nq, k] + np.testing.assert_array_equal(indices, gt) class TestDistancesPositive(unittest.TestCase): @@ -960,9 +1112,6 @@ def test_IVFSQ(self): def test_IVFPQ(self): self.do_test("IVF5,PQ4x4np") -if __name__ == '__main__': - unittest.main() - class TestValidIndexParams(unittest.TestCase): @@ -1040,3 +1189,19 @@ def test_range_search(self): lims, D, I = index.range_search(xq, 1.0) assert len(D) == len(xb) * len(xq) + + +class TestRandomIndex(unittest.TestCase): + + def test_random(self): + """ just check if several runs of search retrieve the + same results """ + index = faiss.IndexRandom(32, 1000000000) + (xt, xb, xq) = get_dataset_2(32, 0, 0, 10) + + Dref, Iref = index.search(xq, 10) + self.assertTrue(np.all(Dref[:, 1:] >= Dref[:, :-1])) + + Dnew, Inew = index.search(xq, 10) + np.testing.assert_array_equal(Dref, Dnew) + np.testing.assert_array_equal(Iref, Inew) diff --git a/thirdparty/faiss/tests/test_index_accuracy.py b/thirdparty/faiss/tests/test_index_accuracy.py index bf3d51623..44b4ca365 100644 --- a/thirdparty/faiss/tests/test_index_accuracy.py +++ b/thirdparty/faiss/tests/test_index_accuracy.py @@ -4,13 +4,15 @@ # LICENSE file in the root directory of this source tree. from __future__ import absolute_import, division, print_function -# noqa E741 -# translation of test_knn.lua -import numpy as np import unittest + import faiss +# noqa E741 +# translation of test_knn.lua + +import numpy as np from common_faiss_tests import Randu10k, get_dataset_2, Randu10kUnbalanced ev = Randu10k() @@ -25,28 +27,27 @@ nbits = d # Parameters for indexes involving PQ -M = int(d / 8) # for PQ: #subquantizers -nbits_per_index = 8 # for PQ +M = int(d / 8) # for PQ: #subquantizers +nbits_per_index = 8 # for PQ class IndexAccuracy(unittest.TestCase): - def test_IndexFlatIP(self): q = faiss.IndexFlatIP(d) # Ask inner product - res = ev.launch('FLAT / IP', q) + res = ev.launch("FLAT / IP", q) e = ev.evalres(res) assert e[1] == 1.0 def test_IndexFlatL2(self): q = faiss.IndexFlatL2(d) - res = ev.launch('FLAT / L2', q) + res = ev.launch("FLAT / L2", q) e = ev.evalres(res) assert e[1] == 1.0 def test_ivf_kmeans(self): ivfk = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, ncentroids) ivfk.nprobe = kprobe - res = ev.launch('IndexIVFFlat', ivfk) + res = ev.launch("IndexIVFFlat", ivfk) e = ev.evalres(res) # should give 0.260 0.260 0.260 assert e[1] > 0.2 @@ -61,7 +62,7 @@ def test_ivf_kmeans(self): def test_indexLSH(self): q = faiss.IndexLSH(d, nbits) - res = ev.launch('FLAT / LSH Cosine', q) + res = ev.launch("FLAT / LSH Cosine", q) e = ev.evalres(res) # should give 0.070 0.250 0.580 assert e[10] > 0.2 @@ -70,14 +71,14 @@ def test_IndexLSH_32_48(self): # CHECK: the difference between 32 and 48 does not make much sense for nbits2 in 32, 48: q = faiss.IndexLSH(d, nbits2) - res = ev.launch('LSH half size', q) + res = ev.launch("LSH half size", q) e = ev.evalres(res) # should give 0.003 0.019 0.108 assert e[10] > 0.018 def test_IndexPQ(self): q = faiss.IndexPQ(d, M, nbits_per_index) - res = ev.launch('FLAT / PQ L2', q) + res = ev.launch("FLAT / PQ L2", q) e = ev.evalres(res) # 
should give 0.070 0.230 0.260 assert e[10] > 0.2 @@ -85,16 +86,16 @@ def test_IndexPQ(self): # Approximate search module: PQ with inner product distance def test_IndexPQ_ip(self): q = faiss.IndexPQ(d, M, nbits_per_index, faiss.METRIC_INNER_PRODUCT) - res = ev.launch('FLAT / PQ IP', q) + res = ev.launch("FLAT / PQ IP", q) e = ev.evalres(res) # should give 0.070 0.230 0.260 - #(same result as regular PQ on normalized distances) + # (same result as regular PQ on normalized distances) assert e[10] > 0.2 def test_IndexIVFPQ(self): ivfpq = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, ncentroids, M, 8) ivfpq.nprobe = kprobe - res = ev.launch('IVF PQ', ivfpq) + res = ev.launch("IVF PQ", ivfpq) e = ev.evalres(res) # should give 0.070 0.230 0.260 assert e[10] > 0.2 @@ -104,17 +105,17 @@ def test_IndexIVFPQ(self): # Approximate search: PQ with full vector refinement def test_IndexPQ_refined(self): q = faiss.IndexPQ(d, M, nbits_per_index) - res = ev.launch('PQ non-refined', q) + res = ev.launch("PQ non-refined", q) e = ev.evalres(res) q.reset() rq = faiss.IndexRefineFlat(q) - res = ev.launch('PQ refined', rq) + res = ev.launch("PQ refined", rq) e2 = ev.evalres(res) assert e2[10] >= e[10] rq.k_factor = 4 - res = ev.launch('PQ refined*4', rq) + res = ev.launch("PQ refined*4", rq) e3 = ev.evalres(res) assert e3[10] >= e2[10] @@ -124,17 +125,16 @@ def test_polysemous(self): # reduce nb iterations to speed up training for the test index.polysemous_training.n_iter = 50000 index.polysemous_training.n_redo = 1 - res = ev.launch('normal PQ', index) + res = ev.launch("normal PQ", index) e_baseline = ev.evalres(res) index.search_type = faiss.IndexPQ.ST_polysemous - index.polysemous_ht = int(M / 16. * 58) + index.polysemous_ht = int(M / 16.0 * 58) stats = faiss.cvar.indexPQ_stats stats.reset() - res = ev.launch('Polysemous ht=%d' % index.polysemous_ht, - index) + res = ev.launch("Polysemous ht=%d" % index.polysemous_ht, index) e_polysemous = ev.evalres(res) print(e_baseline, e_polysemous, index.polysemous_ht) print(stats.n_hamming_pass, stats.ncode) @@ -149,16 +149,16 @@ def test_polysemous(self): def test_ScalarQuantizer(self): quantizer = faiss.IndexFlatL2(d) ivfpq = faiss.IndexIVFScalarQuantizer( - quantizer, d, ncentroids, - faiss.ScalarQuantizer.QT_8bit) + quantizer, d, ncentroids, faiss.ScalarQuantizer.QT_8bit + ) ivfpq.nprobe = kprobe - res = ev.launch('IVF SQ', ivfpq) + res = ev.launch("IVF SQ", ivfpq) e = ev.evalres(res) # should give 0.234 0.236 0.236 assert e[10] > 0.235 def test_polysemous_OOM(self): - """ this used to cause OOM when training polysemous with large + """this used to cause OOM when training polysemous with large nb bits""" d = 32 xt, xb, xq = get_dataset_2(d, 10000, 0, 0) @@ -170,13 +170,10 @@ def test_polysemous_OOM(self): class TestSQFlavors(unittest.TestCase): - """ tests IP in addition to L2, non multiple of 8 dimensions - """ + """tests IP in addition to L2, non multiple of 8 dimensions""" def add2columns(self, x): - return np.hstack(( - x, np.zeros((x.shape[0], 2), dtype='float32') - )) + return np.hstack((x, np.zeros((x.shape[0], 2), dtype="float32"))) def subtest_add2col(self, xb, xq, index, qname): """Test with 2 additional dimensions to take also the non-SIMD @@ -197,19 +194,17 @@ def subtest_add2col(self, xb, xq, index, qname): centroids2 = self.add2columns(centroids) quantizer2.add(centroids2) index2 = faiss.IndexIVFScalarQuantizer( - quantizer2, d2, index.nlist, index.sq.qtype, - index.metric_type) + quantizer2, d2, index.nlist, index.sq.qtype, index.metric_type + ) 
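+        # The trained blob of an 8bit/4bit ScalarQuantizer holds two rows of
+        # per-dimension values (vmins and vdiffs); it is re-packed below for
+        # the two padded columns, with vdiff set to 1 so the extra dimensions
+        # decode to a constant instead of dividing by zero.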
index2.nprobe = 4 - if qname in ('8bit', '4bit'): + if qname in ("8bit", "4bit"): trained = faiss.vector_to_array(index.sq.trained).reshape(2, -1) nt = trained.shape[1] # 2 lines: vmins and vdiffs new_nt = int(nt * d2 / d) - trained2 = np.hstack(( - trained, - np.zeros((2, new_nt - nt), dtype='float32') - )) - trained2[1, nt:] = 1.0 # set vdiff to 1 to avoid div by 0 + trained2 = np.hstack((trained, np.zeros((2, new_nt - nt), + dtype="float32"))) + trained2[1, nt:] = 1.0 # set vdiff to 1 to avoid div by 0 faiss.copy_array_to_vector(trained2.ravel(), index2.sq.trained) else: index2.sq.trained = index.sq.trained @@ -218,22 +213,21 @@ def subtest_add2col(self, xb, xq, index, qname): index2.add(xb2) return index2.search(xq2, 10) - # run on Sept 18, 2018 with nprobe=4 + 4 bit bugfix ref_results = { - (0, '8bit'): 984, - (0, '4bit'): 978, - (0, '8bit_uniform'): 985, - (0, '4bit_uniform'): 979, - (0, 'fp16'): 985, - (1, '8bit'): 979, - (1, '4bit'): 973, - (1, '8bit_uniform'): 979, - (1, '4bit_uniform'): 972, - (1, 'fp16'): 979, + (0, "8bit"): 984, + (0, "4bit"): 978, + (0, "8bit_uniform"): 985, + (0, "4bit_uniform"): 979, + (0, "fp16"): 985, + (1, "8bit"): 979, + (1, "4bit"): 973, + (1, "8bit_uniform"): 979, + (1, "4bit_uniform"): 972, + (1, "fp16"): 979, # added 2019-06-26 - (0, '6bit'): 985, - (1, '6bit'): 987, + (0, "6bit"): 985, + (1, "6bit"): 987, } def subtest(self, mt): @@ -245,19 +239,19 @@ def subtest(self, mt): gt_index.add(xb) gt_D, gt_I = gt_index.search(xq, 10) quantizer = faiss.IndexFlat(d, mt) - for qname in '8bit 4bit 8bit_uniform 4bit_uniform fp16 6bit'.split(): - qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname) - index = faiss.IndexIVFScalarQuantizer( - quantizer, d, nlist, qtype, mt) + for qname in "8bit 4bit 8bit_uniform 4bit_uniform fp16 6bit".split(): + qtype = getattr(faiss.ScalarQuantizer, "QT_" + qname) + index = faiss.IndexIVFScalarQuantizer(quantizer, d, nlist, qtype, + mt) index.train(xt) index.add(xb) - index.nprobe = 4 # hopefully more robust than 1 + index.nprobe = 4 # hopefully more robust than 1 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print('(%d, %s): %d, ' % (mt, repr(qname), ninter)) + print("(%d, %s): %d, " % (mt, repr(qname), ninter)) assert abs(ninter - self.ref_results[(mt, qname)]) <= 10 - if qname == '6bit': + if qname == "6bit": # the test below fails triggers ASAN. 
TODO check what's wrong continue @@ -270,7 +264,7 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - print('radius', radius) + # print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -284,19 +278,22 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - print('ndiff %d / %d' % (ndiff, ntot)) + # print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.01 for pm in 1, 2: - print('parallel_mode=%d' % pm) + # print("parallel_mode=%d" % pm) index.parallel_mode = pm lims4, D4, I4 = index.range_search(xq, radius) - print('sizes', lims4[1:] - lims4[:-1]) + # print("sizes", lims4[1:] - lims4[:-1]) for qno in range(len(lims) - 1): - Iref = I3[lims[qno]: lims[qno+1]] - Inew = I4[lims4[qno]: lims4[qno+1]] + Iref = I3[lims[qno]: lims[qno + 1]] + Inew = I4[lims4[qno]: lims4[qno + 1]] assert set(Iref) == set(Inew), "q %d ref %s new %s" % ( - qno, Iref, Inew) + qno, + Iref, + Inew, + ) def test_SQ_IP(self): self.subtest(faiss.METRIC_INNER_PRODUCT) @@ -311,7 +308,7 @@ def test_parallel_mode(self): index = faiss.index_factory(d, "IVF64,SQ8") index.train(xt) index.add(xb) - index.nprobe = 4 # hopefully more robust than 1 + index.nprobe = 4 # hopefully more robust than 1 Dref, Iref = index.search(xq, 10) for pm in 1, 2, 3: @@ -323,7 +320,6 @@ def test_parallel_mode(self): class TestSQByte(unittest.TestCase): - def subtest_8bit_direct(self, metric_type, d): xt, xb, xq = get_dataset_2(d, 500, 1000, 30) @@ -345,7 +341,8 @@ def rescale(x): Dref, Iref = gt_index.search(xq, 10) index = faiss.IndexScalarQuantizer( - d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type) + d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type + ) index.add(xb) D, I = index.search(xq, 10) @@ -364,8 +361,9 @@ def rescale(x): Dref, Iref = gt_index.search(xq, 10) index = faiss.IndexIVFScalarQuantizer( - quantizer, d, nlist, - faiss.ScalarQuantizer.QT_8bit_direct, metric_type) + quantizer, d, nlist, faiss.ScalarQuantizer.QT_8bit_direct, + metric_type + ) index.nprobe = 4 index.by_residual = False index.train(xt) @@ -382,7 +380,6 @@ def test_8bit_direct(self): class TestNNDescent(unittest.TestCase): - def test_L1(self): search_Ls = [10, 20, 30] thresholds = [0.83, 0.92, 0.95] @@ -402,9 +399,11 @@ def test_IP(self): self.subtest(32, faiss.METRIC_INNER_PRODUCT, 10, search_L, threshold) def subtest(self, d, metric, topk, search_L, threshold): - metric_names = {faiss.METRIC_L1: 'L1', - faiss.METRIC_L2: 'L2', - faiss.METRIC_INNER_PRODUCT: 'IP'} + metric_names = { + faiss.METRIC_L1: "L1", + faiss.METRIC_L2: "L2", + faiss.METRIC_INNER_PRODUCT: "IP", + } topk = 10 nt, nb, nq = 2000, 1000, 200 xt, xb, xq = get_dataset_2(d, nt, nb, nq) @@ -432,9 +431,12 @@ def subtest(self, d, metric, topk, search_L, threshold): recalls += 1 break recall = 1.0 * recalls / (nq * topk) - print('Metric: {}, L: {}, Recall@{}: {}'.format( - metric_names[metric], search_L, topk, recall)) - assert recall > threshold, '{} <= {}'.format(recall, threshold) + print( + "Metric: {}, L: {}, Recall@{}: {}".format( + metric_names[metric], search_L, topk, recall + ) + ) + assert recall > threshold, "{} <= {}".format(recall, threshold) class TestPQFlavors(unittest.TestCase): @@ -466,8 +468,7 @@ def subtest(self, mt): quantizer = faiss.IndexFlat(d, mt) for by_residual in True, False: - index = faiss.IndexIVFPQ( - quantizer, d, nlist, 4, 8) + index = faiss.IndexIVFPQ(quantizer, d, nlist, 4, 8) index.metric_type = mt index.by_residual = by_residual if 
by_residual: @@ -484,7 +485,7 @@ def subtest(self, mt): D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print('(%d, %s): %d, ' % (mt, by_residual, ninter)) + print("(%d, %s): %d, " % (mt, by_residual, ninter)) assert abs(ninter - self.ref_results[mt, by_residual]) <= 3 @@ -498,12 +499,16 @@ def subtest(self, mt): index.polysemous_ht = 20 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print('(%d, %s, %d): %d, ' % ( - mt, by_residual, index.polysemous_ht, ninter)) + print( + "(%d, %s, %d): %d, " + % (mt, by_residual, index.polysemous_ht, ninter) + ) # polysemous behaves bizarrely on ARM - assert (ninter >= self.ref_results[ - mt, by_residual, index.polysemous_ht] - 4) + assert ( + ninter >= self.ref_results[mt, by_residual, + index.polysemous_ht] - 4 + ) # also test range search @@ -511,7 +516,7 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - print('radius', radius) + print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -525,7 +530,7 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - print('ndiff %d / %d' % (ndiff, ntot)) + print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.02 def test_IVFPQ_non8bit(self): @@ -539,36 +544,33 @@ def test_IVFPQ_non8bit(self): quantizer = faiss.IndexFlat(d) ninter = {} - for v in '2x8', '8x2': - if v == '8x2': - index = faiss.IndexIVFPQ( - quantizer, d, nlist, 2, 8) + for v in "2x8", "8x2": + if v == "8x2": + index = faiss.IndexIVFPQ(quantizer, d, nlist, 2, 8) else: - index = faiss.IndexIVFPQ( - quantizer, d, nlist, 8, 2) + index = faiss.IndexIVFPQ(quantizer, d, nlist, 8, 2) index.train(xt) index.add(xb) index.npobe = 16 D, I = index.search(xq, 10) ninter[v] = faiss.eval_intersection(I, gt_I) - print('ninter=', ninter) + print("ninter=", ninter) # this should be the case but we don't observe # that... 
Probavly too few test points # assert ninter['2x8'] > ninter['8x2'] # ref numbers on 2019-11-02 - assert abs(ninter['2x8'] - 458) < 4 - assert abs(ninter['8x2'] - 465) < 4 + assert abs(ninter["2x8"] - 458) < 4 + assert abs(ninter["8x2"] - 465) < 4 class TestFlat1D(unittest.TestCase): - def test_flat_1d(self): rs = np.random.RandomState(123545) k = 10 - xb = rs.uniform(size=(100, 1)).astype('float32') + xb = rs.uniform(size=(100, 1)).astype("float32") # make sure to test below and above - xq = rs.uniform(size=(1000, 1)).astype('float32') * 1.1 - 0.05 + xq = rs.uniform(size=(1000, 1)).astype("float32") * 1.1 - 0.05 ref = faiss.IndexFlatL2(1) ref.add(xb) @@ -581,10 +583,20 @@ def test_flat_1d(self): ndiff = (np.abs(ref_I - new_I) != 0).sum() - assert(ndiff < 100) + assert ndiff < 100 new_D = new_D ** 2 max_diff_D = np.abs(ref_D - new_D).max() - assert(max_diff_D < 1e-5) + assert max_diff_D < 1e-5 + + def test_size_0(self): + # just make sure it does not crash on small nb + index = faiss.IndexFlat1D() + rs = np.random.RandomState(123) + for i in range(3): + x = np.array([[rs.rand()]]) + D, I = index.search(x, 10) + self.assertEqual((I == -1).sum(), 10 - i) + index.add(x) class OPQRelativeAccuracy(unittest.TestCase): @@ -598,7 +610,7 @@ def test_OPQ(self): d = ev.d index = faiss.IndexPQ(d, M, 8) - res = ev.launch('PQ', index) + res = ev.launch("PQ", index) e_pq = ev.evalres(res) index_pq = faiss.IndexPQ(d, M, 8) @@ -608,15 +620,15 @@ def test_OPQ(self): opq_matrix.niter_pq = 4 index = faiss.IndexPreTransform(opq_matrix, index_pq) - res = ev.launch('OPQ', index) + res = ev.launch("OPQ", index) e_opq = ev.evalres(res) - print('e_pq=%s' % e_pq) - print('e_opq=%s' % e_opq) + print("e_pq=%s" % e_pq) + print("e_opq=%s" % e_opq) # verify that OPQ better than PQ for r in 1, 10, 100: - assert(e_opq[r] > e_pq[r]) + assert e_opq[r] > e_pq[r] def test_OIVFPQ(self): # Parameters inverted indexes @@ -629,7 +641,7 @@ def test_OIVFPQ(self): index = faiss.IndexIVFPQ(quantizer, d, ncentroids, M, 8) index.nprobe = 5 - res = ev.launch('IVFPQ', index) + res = ev.launch("IVFPQ", index) e_ivfpq = ev.evalres(res) quantizer = faiss.IndexFlatL2(d) @@ -639,23 +651,22 @@ def test_OIVFPQ(self): opq_matrix.niter = 10 index = faiss.IndexPreTransform(opq_matrix, index_ivfpq) - res = ev.launch('O+IVFPQ', index) + res = ev.launch("O+IVFPQ", index) e_oivfpq = ev.evalres(res) # verify same on OIVFPQ for r in 1, 10, 100: print(e_oivfpq[r], e_ivfpq[r]) - assert(e_oivfpq[r] >= e_ivfpq[r]) + assert e_oivfpq[r] >= e_ivfpq[r] class TestRoundoff(unittest.TestCase): - def test_roundoff(self): # params that force use of BLAS implementation nb = 100 nq = 25 d = 4 - xb = np.zeros((nb, d), dtype='float32') + xb = np.zeros((nb, d), dtype="float32") xb[:, 0] = np.arange(nb) + 12345 xq = xb[:nq] + 0.3 @@ -668,9 +679,8 @@ def test_roundoff(self): # this does not work assert not np.all(I.ravel() == np.arange(nq)) - index = faiss.IndexPreTransform( - faiss.CenteringTransform(d), - faiss.IndexFlat(d)) + index = faiss.IndexPreTransform(faiss.CenteringTransform(d), + faiss.IndexFlat(d)) index.train(xb) index.add(xb) @@ -685,30 +695,30 @@ class TestSpectralHash(unittest.TestCase): # run on 2019-04-02 ref_results = { - (32, 'global', 10): 505, - (32, 'centroid', 10): 524, - (32, 'centroid_half', 10): 21, - (32, 'median', 10): 510, - (32, 'global', 1): 8, - (32, 'centroid', 1): 20, - (32, 'centroid_half', 1): 26, - (32, 'median', 1): 14, - (64, 'global', 10): 768, - (64, 'centroid', 10): 767, - (64, 'centroid_half', 10): 21, - (64, 'median', 10): 765, - 
(64, 'global', 1): 28, - (64, 'centroid', 1): 21, - (64, 'centroid_half', 1): 20, - (64, 'median', 1): 29, - (128, 'global', 10): 968, - (128, 'centroid', 10): 945, - (128, 'centroid_half', 10): 21, - (128, 'median', 10): 958, - (128, 'global', 1): 271, - (128, 'centroid', 1): 279, - (128, 'centroid_half', 1): 171, - (128, 'median', 1): 253, + (32, "global", 10): 505, + (32, "centroid", 10): 524, + (32, "centroid_half", 10): 21, + (32, "median", 10): 510, + (32, "global", 1): 8, + (32, "centroid", 1): 20, + (32, "centroid_half", 1): 26, + (32, "median", 1): 14, + (64, "global", 10): 768, + (64, "centroid", 10): 767, + (64, "centroid_half", 10): 21, + (64, "median", 10): 765, + (64, "global", 1): 28, + (64, "centroid", 1): 21, + (64, "centroid_half", 1): 20, + (64, "median", 1): 29, + (128, "global", 10): 968, + (128, "centroid", 10): 945, + (128, "centroid_half", 10): 21, + (128, "median", 10): 958, + (128, "global", 1): 271, + (128, "centroid", 1): 279, + (128, "centroid_half", 1): 171, + (128, "median", 1): 253, } def test_sh(self): @@ -728,17 +738,17 @@ def test_sh(self): D, I = index_lsh.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print('LSH baseline: %d' % ninter) + print("LSH baseline: %d" % ninter) for period in 10.0, 1.0: - for tt in 'global centroid centroid_half median'.split(): - index = faiss.IndexIVFSpectralHash(quantizer, d, nlist, - nbit, period) + for tt in "global centroid centroid_half median".split(): + index = faiss.IndexIVFSpectralHash( + quantizer, d, nlist, nbit, period + ) index.nprobe = nprobe index.threshold_type = getattr( - faiss.IndexIVFSpectralHash, - 'Thresh_' + tt + faiss.IndexIVFSpectralHash, "Thresh_" + tt ) index.train(xt) @@ -748,12 +758,13 @@ def test_sh(self): ninter = faiss.eval_intersection(I, gt_I) key = (nbit, tt, period) - print('(%d, %s, %g): %d, ' % (nbit, repr(tt), period, ninter)) - assert abs(ninter - self.ref_results[key]) <= 12 + print("(%d, %s, %g): %d, " % (nbit, repr(tt), period, + ninter)) + print(abs(ninter - self.ref_results[key])) + assert abs(ninter - self.ref_results[key]) <= 14 class TestRefine(unittest.TestCase): - def do_test(self, metric): d = 32 xt, xb, xq = get_dataset_2(d, 2000, 1000, 200) diff --git a/thirdparty/faiss/tests/test_index_binary.py b/thirdparty/faiss/tests/test_index_binary.py index 91957baf4..312530ad4 100644 --- a/thirdparty/faiss/tests/test_index_binary.py +++ b/thirdparty/faiss/tests/test_index_binary.py @@ -6,7 +6,6 @@ """this is a basic test script for simple indices work""" import os -import sys import numpy as np import unittest import faiss @@ -289,6 +288,21 @@ def test_ivf_nprobe(self): assert np.all(D == ref_D) # assert np.all(I == ref_I) # id may be different + def test_search_per_invlist(self): + d = self.xq.shape[1] * 8 + + quantizer = faiss.IndexBinaryFlat(d) + index = faiss.IndexBinaryIVF(quantizer, d, 10) + index.cp.min_points_per_centroid = 5 # quiet warning + index.train(self.xt) + index.add(self.xb) + index.nprobe = 3 + + Dref, Iref = index.search(self.xq, 10) + index.per_invlist_search = True + D2, I2 = index.search(self.xq, 10) + compare_binary_result_lists(Dref, Iref, D2, I2) + class TestHNSW(unittest.TestCase): @@ -337,7 +351,6 @@ def test_hnsw(self): self.assertTrue((Dref == Dbin).all()) - class TestReplicasAndShards(unittest.TestCase): @unittest.skipIf(os.name == "posix" and os.uname().sysname == "Darwin", @@ -360,6 +373,7 @@ def test_replicas(self): sub_idx = faiss.IndexBinaryFlat(d) sub_idx.add(xb) index.addIndex(sub_idx) + self.assertEqual(index_ref.code_size, 
index.code_size) D, I = index.search(xq, 10) diff --git a/thirdparty/faiss/tests/test_index_composite.py b/thirdparty/faiss/tests/test_index_composite.py index 30b6dc7ab..81a00cb93 100644 --- a/thirdparty/faiss/tests/test_index_composite.py +++ b/thirdparty/faiss/tests/test_index_composite.py @@ -15,6 +15,46 @@ import platform from common_faiss_tests import get_dataset_2 +from faiss.contrib.datasets import SyntheticDataset +from faiss.contrib.inspect_tools import make_LinearTransform_matrix +from faiss.contrib.evaluation import check_ref_knn_with_draws + +class TestRemoveFastScan(unittest.TestCase): + def do_test(self, ntotal, removed): + d = 20 + xt, xb, _ = get_dataset_2(d, ntotal, ntotal, 0) + index = faiss.index_factory(20, 'IDMap2,PQ5x4fs') + index.train(xt) + index.add_with_ids(xb, np.arange(ntotal).astype("int64")) + before = index.reconstruct_n(0, ntotal) + index.remove_ids(np.array(removed)) + for i in range(ntotal): + if i in removed: + # should throw RuntimeError as this vector should be removed + try: + after = index.reconstruct(i) + assert False + except RuntimeError: + pass + else: + after = index.reconstruct(i) + np.testing.assert_array_equal(before[i], after) + assert index.ntotal == ntotal - len(removed) + + def test_remove_last_vector(self): + self.do_test(993, [992]) + + # test remove element from every address 0 -> 31 + # [0, 32 + 1, 2 * 32 + 2, ....] + # [0, 33 , 66 , 99, 132, .....] + def test_remove_every_address(self): + removed = (33 * np.arange(32)).tolist() + self.do_test(1100, removed) + + # test remove range of vectors and leave ntotal divisible by 32 + def test_leave_complete_block(self): + self.do_test(1000, np.arange(8).tolist()) + class TestRemove(unittest.TestCase): @@ -97,6 +137,26 @@ def test_remove_id_map(self): else: assert False, 'should have raised an exception' + def test_factory_idmap2_suffix(self): + xb = np.zeros((10, 5), dtype='float32') + xb[:, 0] = np.arange(10) + 1000 + index = faiss.index_factory(5, "Flat,IDMap2") + ids = np.arange(10, dtype='int64') + 100 + index.add_with_ids(xb, ids) + assert index.reconstruct(104)[0] == 1004 + index.remove_ids(np.array([103], dtype='int64')) + assert index.reconstruct(104)[0] == 1004 + + def test_factory_idmap2_prefix(self): + xb = np.zeros((10, 5), dtype='float32') + xb[:, 0] = np.arange(10) + 1000 + index = faiss.index_factory(5, "IDMap2,Flat") + ids = np.arange(10, dtype='int64') + 100 + index.add_with_ids(xb, ids) + assert index.reconstruct(109)[0] == 1009 + index.remove_ids(np.array([100], dtype='int64')) + assert index.reconstruct(109)[0] == 1009 + def test_remove_id_map_2(self): # from https://github.com/facebookresearch/faiss/issues/255 rs = np.random.RandomState(1234) @@ -151,7 +211,6 @@ def test_remove_id_map_binary(self): assert False, 'should have raised an exception' - class TestRangeSearch(unittest.TestCase): def test_range_search_id_map(self): @@ -311,6 +370,7 @@ def manual_trans(x): assert np.all(I == I2) + @unittest.skipIf(platform.system() == 'Windows', \ 'Mmap not supported on Windows.') class TestRareIO(unittest.TestCase): @@ -370,12 +430,6 @@ def test_mmappedIO_pretrans(self): class TestIVFFlatDedup(unittest.TestCase): - def normalize_res(self, D, I): - dmax = D[-1] - res = [(d, i) for d, i in zip(D, I) if d < dmax] - res.sort() - return res - def test_dedup(self): d = 10 nb = 1000 @@ -411,10 +465,7 @@ def test_dedup(self): Dref, Iref = index_ref.search(xq, 20) Dnew, Inew = index_new.search(xq, 20) - for i in range(nq): - ref = self.normalize_res(Dref[i], Iref[i]) - new = 
self.normalize_res(Dnew[i], Inew[i]) - assert ref == new + check_ref_knn_with_draws(Dref, Iref, Dnew, Inew) # test I/O fd, tmpfile = tempfile.mkstemp() @@ -427,10 +478,7 @@ def test_dedup(self): os.unlink(tmpfile) Dst, Ist = index_st.search(xq, 20) - for i in range(nq): - new = self.normalize_res(Dnew[i], Inew[i]) - st = self.normalize_res(Dst[i], Ist[i]) - assert st == new + check_ref_knn_with_draws(Dnew, Inew, Dst, Ist) # test remove toremove = np.hstack((np.arange(3, 1000, 5), np.arange(850, 950))) @@ -441,10 +489,7 @@ def test_dedup(self): Dref, Iref = index_ref.search(xq, 20) Dnew, Inew = index_new.search(xq, 20) - for i in range(nq): - ref = self.normalize_res(Dref[i], Iref[i]) - new = self.normalize_res(Dnew[i], Inew[i]) - assert ref == new + check_ref_knn_with_draws(Dref, Iref, Dnew, Inew) class TestSerialize(unittest.TestCase): @@ -484,6 +529,7 @@ def test_serialize_to_vector(self): Dnew, Inew = index3.search(xq, 5) assert np.all(Dnew == Dref) and np.all(Inew == Iref) + @unittest.skipIf(platform.system() == 'Windows', 'OnDiskInvertedLists is unsupported on Windows.') class TestRenameOndisk(unittest.TestCase): @@ -618,7 +664,161 @@ def test_stop_words(self): index.replace_invlists(il, True) +class TestSplitMerge(unittest.TestCase): - -if __name__ == '__main__': - unittest.main() + def do_test(self, index_key, subset_type): + xt, xb, xq = get_dataset_2(32, 1000, 100, 10) + index = faiss.index_factory(32, index_key) + index.train(xt) + nsplit = 3 + sub_indexes = [faiss.clone_index(index) for i in range(nsplit)] + index.add(xb) + Dref, Iref = index.search(xq, 10) + nlist = index.nlist + for i in range(nsplit): + if subset_type in (1, 3): + index.copy_subset_to(sub_indexes[i], subset_type, nsplit, i) + elif subset_type in (0, 2): + j0 = index.ntotal * i // nsplit + j1 = index.ntotal * (i + 1) // nsplit + index.copy_subset_to(sub_indexes[i], subset_type, j0, j1) + elif subset_type == 4: + index.copy_subset_to( + sub_indexes[i], subset_type, + i * nlist // nsplit, (i + 1) * nlist // nsplit) + + index_shards = faiss.IndexShards(False, False) + for i in range(nsplit): + index_shards.add_shard(sub_indexes[i]) + Dnew, Inew = index_shards.search(xq, 10) + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_array_equal(Dref, Dnew) + + def test_Flat_subset_type_0(self): + self.do_test("IVF30,Flat", subset_type=0) + + def test_Flat_subset_type_1(self): + self.do_test("IVF30,Flat", subset_type=1) + + def test_Flat_subset_type_2(self): + self.do_test("IVF30,PQ4np", subset_type=2) + + def test_Flat_subset_type_3(self): + self.do_test("IVF30,Flat", subset_type=3) + + def test_Flat_subset_type_4(self): + self.do_test("IVF30,Flat", subset_type=4) + + +class TestIndependentQuantizer(unittest.TestCase): + + def test_sidebyside(self): + """ provide double-sized vectors to the index, where each vector + is the concatenation of twice the same vector """ + ds = SyntheticDataset(32, 1000, 500, 50) + + index = faiss.index_factory(ds.d, "IVF32,SQ8") + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 4 + Dref, Iref = index.search(ds.get_queries(), 10) + + select32first = make_LinearTransform_matrix( + np.eye(64, dtype='float32')[:32]) + + select32last = make_LinearTransform_matrix( + np.eye(64, dtype='float32')[32:]) + + quantizer = faiss.IndexPreTransform( + select32first, + index.quantizer + ) + + index2 = faiss.IndexIVFIndependentQuantizer( + quantizer, + index, select32last + ) + + xq2 = np.hstack([ds.get_queries()] * 2) + quantizer.search(xq2, 30) + Dnew, Inew = 
index2.search(xq2, 10) + + np.testing.assert_array_equal(Dref, Dnew) + np.testing.assert_array_equal(Iref, Inew) + + # test add + index2.reset() + xb2 = np.hstack([ds.get_database()] * 2) + index2.add(xb2) + Dnew, Inew = index2.search(xq2, 10) + + np.testing.assert_array_equal(Dref, Dnew) + np.testing.assert_array_equal(Iref, Inew) + + def test_half_store(self): + """ the index stores only the first half of each vector + but the coarse quantizer sees them entirely """ + ds = SyntheticDataset(32, 1000, 500, 50) + gt = ds.get_groundtruth(10) + + select32first = make_LinearTransform_matrix( + np.eye(32, dtype='float32')[:16]) + + index_ivf = faiss.index_factory(ds.d // 2, "IVF32,Flat") + index_ivf.nprobe = 4 + index = faiss.IndexPreTransform(select32first, index_ivf) + index.train(ds.get_train()) + index.add(ds.get_database()) + + Dref, Iref = index.search(ds.get_queries(), 10) + perf_ref = faiss.eval_intersection(Iref, gt) + + index_ivf = faiss.index_factory(ds.d // 2, "IVF32,Flat") + index_ivf.nprobe = 4 + index = faiss.IndexIVFIndependentQuantizer( + faiss.IndexFlatL2(ds.d), + index_ivf, select32first + ) + index.train(ds.get_train()) + index.add(ds.get_database()) + + Dnew, Inew = index.search(ds.get_queries(), 10) + perf_new = faiss.eval_intersection(Inew, gt) + + self.assertLess(perf_ref, perf_new) + + def test_precomputed_tables(self): + """ see how precomputed tables behave with centroid distance estimates from a mismatching + coarse quantizer """ + ds = SyntheticDataset(48, 2000, 500, 250) + gt = ds.get_groundtruth(10) + + index = faiss.IndexIVFIndependentQuantizer( + faiss.IndexFlatL2(48), + faiss.index_factory(16, "IVF64,PQ4np"), + faiss.PCAMatrix(48, 16) + ) + index.train(ds.get_train()) + index.add(ds.get_database()) + + index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index)) + index_ivf.nprobe = 4 + + Dref, Iref = index.search(ds.get_queries(), 10) + perf_ref = faiss.eval_intersection(Iref, gt) + + index_ivf.use_precomputed_table = 1 + index_ivf.precompute_table() + + Dnew, Inew = index.search(ds.get_queries(), 10) + perf_new = faiss.eval_intersection(Inew, gt) + + # to be honest, it is not clear which one is better... 
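+        # (the precomputed tables are built from the IVF quantizer's own
+        # centroids, which do not match the independent coarse quantizer that
+        # performs the assignment, so the two settings yield different
+        # distance estimates)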
+ self.assertNotEqual(perf_ref, perf_new) + + # check IO while we are at it + index2 = faiss.deserialize_index(faiss.serialize_index(index)) + D2, I2 = index2.search(ds.get_queries(), 10) + + np.testing.assert_array_equal(Dnew, D2) + np.testing.assert_array_equal(Inew, I2) diff --git a/thirdparty/faiss/tests/test_io.py b/thirdparty/faiss/tests/test_io.py index 4e5d67c48..dc8ac3dcf 100644 --- a/thirdparty/faiss/tests/test_io.py +++ b/thirdparty/faiss/tests/test_io.py @@ -11,7 +11,7 @@ import io import sys import pickle -from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing.pool import ThreadPool class TestIOVariants(unittest.TestCase): @@ -80,7 +80,7 @@ def do_write_callback(self, bsz): faiss.vector_to_array(index2.codes) ) - # This is not a callable function: shoudl raise an exception + # This is not a callable function: should raise an exception writer = faiss.PyCallbackIOWriter("blabla") self.assertRaises( Exception, @@ -278,3 +278,94 @@ def test_hnsw(self): def test_ivf(self): self.dump_load_factory("IVF5,Flat") + + +class Test_IO_VectorTransform(unittest.TestCase): + """ + test write_VectorTransform using IOWriter Pointer + and read_VectorTransform using file name + """ + def test_write_vector_transform(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFSpectralHash(quantizer, d, n, 8, 1.0) + index.train(x) + index.add(x) + fd, fname = tempfile.mkstemp() + os.close(fd) + try: + + writer = faiss.FileIOWriter(fname) + faiss.write_VectorTransform(index.vt, writer) + del writer + + vt = faiss.read_VectorTransform(fname) + + assert vt.d_in == index.vt.d_in + assert vt.d_out == index.vt.d_out + assert vt.is_trained + + finally: + if os.path.exists(fname): + os.unlink(fname) + + """ + test write_VectorTransform using file name + and read_VectorTransform using IOWriter Pointer + """ + def test_read_vector_transform(self): + d, n = 32, 1000 + x = np.random.uniform(size=(n, d)).astype('float32') + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFSpectralHash(quantizer, d, n, 8, 1.0) + index.train(x) + index.add(x) + fd, fname = tempfile.mkstemp() + os.close(fd) + try: + + faiss.write_VectorTransform(index.vt, fname) + + reader = faiss.FileIOReader(fname) + vt = faiss.read_VectorTransform(reader) + del reader + + assert vt.d_in == index.vt.d_in + assert vt.d_out == index.vt.d_out + assert vt.is_trained + finally: + if os.path.exists(fname): + os.unlink(fname) + + +class TestIVFPQRead(unittest.TestCase): + def test_reader(self): + d, n = 32, 1000 + xq = np.random.uniform(size=(n, d)).astype('float32') + xb = np.random.uniform(size=(n, d)).astype('float32') + + index = faiss.index_factory(32, "IVF32,PQ16np", faiss.METRIC_L2) + index.train(xb) + index.add(xb) + fd, fname = tempfile.mkstemp() + os.close(fd) + + try: + faiss.write_index(index, fname) + + index_a = faiss.read_index(fname) + index_b = faiss.read_index(fname, faiss.IO_FLAG_SKIP_PRECOMPUTE_TABLE) + + Da, Ia = index_a.search(xq, 10) + Db, Ib = index_b.search(xq, 10) + np.testing.assert_array_equal(Ia, Ib) + np.testing.assert_almost_equal(Da, Db, decimal=5) + + codes_a = index_a.sa_encode(xq) + codes_b = index_b.sa_encode(xq) + np.testing.assert_array_equal(codes_a, codes_b) + + finally: + if os.path.exists(fname): + os.unlink(fname) diff --git a/thirdparty/faiss/tests/test_ivflib.py b/thirdparty/faiss/tests/test_ivflib.py index c5f2cd325..f19c3da45 100644 --- a/thirdparty/faiss/tests/test_ivflib.py +++ 
b/thirdparty/faiss/tests/test_ivflib.py @@ -47,10 +47,8 @@ def search_single_scan(index, xq, k, bs=128): sub_assign[skip_rows, skip_cols] = -1 index.search_preassigned( - nq, faiss.swig_ptr(xq), k, - faiss.swig_ptr(sub_assign), faiss.swig_ptr(coarse_dis), - faiss.swig_ptr(rh.D), faiss.swig_ptr(rh.I), - False, None + xq, k, sub_assign, coarse_dis, + D=rh.D, I=rh.I ) rh.finalize() diff --git a/thirdparty/faiss/tests/test_ivfpq_codec.cpp b/thirdparty/faiss/tests/test_ivfpq_codec.cpp index 5431be5cd..47757ba59 100644 --- a/thirdparty/faiss/tests/test_ivfpq_codec.cpp +++ b/thirdparty/faiss/tests/test_ivfpq_codec.cpp @@ -13,12 +13,13 @@ #include -#include #include #include #include #include +#include "simd/hook.h" + namespace { // dimension of the vectors to index @@ -35,7 +36,7 @@ double eval_codec_error(long ncentroids, long m, const std::vector& v) { // encode and decode to compute reconstruction error - std::vector keys(nb); + std::vector keys(nb); std::vector codes(nb * m); index.encode_multiple(nb, keys.data(), v.data(), codes.data(), true); diff --git a/thirdparty/faiss/tests/test_ivfpq_indexing.cpp b/thirdparty/faiss/tests/test_ivfpq_indexing.cpp index d46b2ed21..334256096 100644 --- a/thirdparty/faiss/tests/test_ivfpq_indexing.cpp +++ b/thirdparty/faiss/tests/test_ivfpq_indexing.cpp @@ -71,14 +71,14 @@ TEST(IVFPQ, accuracy) { queries[i] = distrib(rng); } - std::vector gt_nns(nq); + std::vector gt_nns(nq); std::vector gt_dis(nq); index_gt.search(nq, queries.data(), 1, gt_dis.data(), gt_nns.data()); index.nprobe = 5; int k = 5; - std::vector nns(k * nq); + std::vector nns(k * nq); std::vector dis(k * nq); index.search(nq, queries.data(), k, dis.data(), nns.data()); diff --git a/thirdparty/faiss/tests/test_lsq.py b/thirdparty/faiss/tests/test_local_search_quantizer.py similarity index 65% rename from thirdparty/faiss/tests/test_lsq.py rename to thirdparty/faiss/tests/test_local_search_quantizer.py index 135ddc04f..01fec70cc 100644 --- a/thirdparty/faiss/tests/test_lsq.py +++ b/thirdparty/faiss/tests/test_local_search_quantizer.py @@ -14,6 +14,7 @@ from faiss.contrib import datasets +faiss.omp_set_num_threads(4) sp = faiss.swig_ptr @@ -251,11 +252,21 @@ def test_icm_encode_step(self): rs = np.random.RandomState(123) - # randomly generate codes, binary terms and unary terms + # randomly generate codes and unary terms codes = rs.randint(0, K, (n, M)).astype(np.int32) new_codes = codes.copy() unaries = rs.rand(M, n, K).astype(np.float32) - binaries = rs.rand(M, M, K, K).astype(np.float32) + + # binary terms should be symmetric, because binary terms + # represent cached dot products between the code C1 in codebook M1 + # and the code C2 in codebook M2. 
+ # so, binaries[M1, M2, C1, C2] == binaries[M2, M1, C2, C1] + # + # generate binary terms in a standard way that provides + # the needed symmetry + codebooks = rs.rand(M, K, d).astype(np.float32) + binaries = compute_binary_terms_ref(codebooks) + binaries = np.ascontiguousarray(binaries) # do icm encoding given binary and unary terms lsq = faiss.LocalSearchQuantizer(d, M, nbits) @@ -474,3 +485,198 @@ def test_index_accuracy_reconstruct_LUT(self): def test_index_accuracy_cqint(self): self.eval_index_accuracy("IVF100,LSQ4x5_Ncqint8") + + def test_deterministic(self): + ds = datasets.SyntheticDataset(d=16, nt=1000, nb=10005, nq=1000) + index = faiss.index_factory(ds.d, "IVF100,LSQ4x4_Nqint8") + + k = 1 + xq = ds.get_queries() + xb = ds.get_database() + xt = ds.get_train() + + index.train(xt) + index.add(xb) + D, I = index.search(xq, k=k) + + index.reset() + index.train(xt) + index.add(xb) + D2, I2 = index.search(xq, k=k) + + np.testing.assert_array_equal(I, I2) + + +class TestProductLocalSearchQuantizer(unittest.TestCase): + + def test_codec(self): + """check that the error is in the same ballpark as PQ.""" + ds = datasets.SyntheticDataset(64, 3000, 3000, 0) + + xt = ds.get_train() + xb = ds.get_database() + + nsplits = 2 + Msub = 2 + nbits = 4 + + plsq = faiss.ProductLocalSearchQuantizer(ds.d, nsplits, Msub, nbits) + plsq.train(xt) + err_plsq = eval_codec(plsq, xb) + + pq = faiss.ProductQuantizer(ds.d, nsplits * Msub, nbits) + pq.train(xt) + err_pq = eval_codec(pq, xb) + + print(err_plsq, err_pq) + self.assertLess(err_plsq, err_pq) + + def test_with_lsq(self): + """compare with LSQ when nsplits = 1""" + ds = datasets.SyntheticDataset(32, 3000, 3000, 0) + + xt = ds.get_train() + xb = ds.get_database() + + M = 4 + nbits = 4 + + plsq = faiss.ProductLocalSearchQuantizer(ds.d, 1, M, nbits) + plsq.train(xt) + err_plsq = eval_codec(plsq, xb) + + lsq = faiss.LocalSearchQuantizer(ds.d, M, nbits) + lsq.train(xt) + err_lsq = eval_codec(lsq, xb) + + print(err_plsq, err_lsq) + self.assertEqual(err_plsq, err_lsq) + + def test_lut(self): + """test compute_LUT function""" + ds = datasets.SyntheticDataset(16, 1000, 0, 100) + + xt = ds.get_train() + xq = ds.get_queries() + + nsplits = 2 + Msub = 2 + nbits = 4 + nq, d = xq.shape + dsub = d // nsplits + + plsq = faiss.ProductLocalSearchQuantizer(ds.d, nsplits, Msub, nbits) + plsq.train(xt) + + subcodebook_size = Msub * (1 << nbits) + codebook_size = nsplits * subcodebook_size + lut = np.zeros((nq, codebook_size), dtype=np.float32) + plsq.compute_LUT(nq, sp(xq), sp(lut)) + + codebooks = faiss.vector_to_array(plsq.codebooks) + codebooks = codebooks.reshape(nsplits, subcodebook_size, dsub) + xq = xq.reshape(nq, nsplits, dsub) + lut_ref = np.zeros((nq, nsplits, subcodebook_size), dtype=np.float32) + for i in range(nsplits): + lut_ref[:, i] = xq[:, i] @ codebooks[i].T + lut_ref = lut_ref.reshape(nq, codebook_size) + + np.testing.assert_allclose(lut, lut_ref, rtol=5e-04) + + +class TestIndexProductLocalSearchQuantizer(unittest.TestCase): + + def test_accuracy1(self): + """check that the error is in the same ballpark as LSQ.""" + recall1 = self.eval_index_accuracy("PLSQ4x3x5_Nqint8") + recall2 = self.eval_index_accuracy("LSQ12x5_Nqint8") + self.assertGreaterEqual(recall1, recall2) # 622 vs 551 + + def test_accuracy2(self): + """when nsplits = 1, PLSQ should be almost the same as LSQ""" + recall1 = self.eval_index_accuracy("PLSQ1x3x5_Nqint8") + recall2 = self.eval_index_accuracy("LSQ3x5_Nqint8") + diff = abs(recall1 - recall2) # 273 vs 275 in OSX + 
self.assertGreaterEqual(5, diff) + + def eval_index_accuracy(self, index_key): + ds = datasets.SyntheticDataset(32, 1000, 1000, 100) + index = faiss.index_factory(ds.d, index_key) + + index.train(ds.get_train()) + index.add(ds.get_database()) + D, I = index.search(ds.get_queries(), 10) + inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) + + # do a little I/O test + index2 = faiss.deserialize_index(faiss.serialize_index(index)) + D2, I2 = index2.search(ds.get_queries(), 10) + np.testing.assert_array_equal(I2, I) + np.testing.assert_array_equal(D2, D) + + return inter + + def test_factory(self): + AQ = faiss.AdditiveQuantizer + ns, Msub, nbits = 2, 4, 8 + index = faiss.index_factory(64, f"PLSQ{ns}x{Msub}x{nbits}_Nqint8") + assert isinstance(index, faiss.IndexProductLocalSearchQuantizer) + self.assertEqual(index.plsq.nsplits, ns) + self.assertEqual(index.plsq.subquantizer(0).M, Msub) + self.assertEqual(index.plsq.subquantizer(0).nbits.at(0), nbits) + self.assertEqual(index.plsq.search_type, AQ.ST_norm_qint8) + + code_size = (ns * Msub * nbits + 7) // 8 + 1 + self.assertEqual(index.plsq.code_size, code_size) + + +class TestIndexIVFProductLocalSearchQuantizer(unittest.TestCase): + + def eval_index_accuracy(self, factory_key): + ds = datasets.SyntheticDataset(32, 1000, 1000, 100) + index = faiss.index_factory(ds.d, factory_key) + + index.train(ds.get_train()) + index.add(ds.get_database()) + + inters = [] + for nprobe in 1, 2, 4, 8, 16: + index.nprobe = nprobe + D, I = index.search(ds.get_queries(), 10) + inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) + inters.append(inter) + + inters = np.array(inters) + self.assertTrue(np.all(inters[1:] >= inters[:-1])) + + # do a little I/O test + index2 = faiss.deserialize_index(faiss.serialize_index(index)) + D2, I2 = index2.search(ds.get_queries(), 10) + np.testing.assert_array_equal(I2, I) + np.testing.assert_array_equal(D2, D) + + return inter + + def test_index_accuracy(self): + self.eval_index_accuracy("IVF32,PLSQ2x2x5_Nqint8") + + def test_index_accuracy2(self): + """check that the error is in the same ballpark as LSQ.""" + inter1 = self.eval_index_accuracy("IVF32,PLSQ2x2x5_Nqint8") + inter2 = self.eval_index_accuracy("IVF32,LSQ4x5_Nqint8") + # print(inter1, inter2) # 381 vs 374 + self.assertGreaterEqual(inter1 * 1.1, inter2) + + def test_factory(self): + AQ = faiss.AdditiveQuantizer + ns, Msub, nbits = 2, 4, 8 + index = faiss.index_factory(64, f"IVF32,PLSQ{ns}x{Msub}x{nbits}_Nqint8") + assert isinstance(index, faiss.IndexIVFProductLocalSearchQuantizer) + self.assertEqual(index.nlist, 32) + self.assertEqual(index.plsq.nsplits, ns) + self.assertEqual(index.plsq.subquantizer(0).M, Msub) + self.assertEqual(index.plsq.subquantizer(0).nbits.at(0), nbits) + self.assertEqual(index.plsq.search_type, AQ.ST_norm_qint8) + + code_size = (ns * Msub * nbits + 7) // 8 + 1 + self.assertEqual(index.plsq.code_size, code_size) diff --git a/thirdparty/faiss/tests/test_lowlevel_ivf.cpp b/thirdparty/faiss/tests/test_lowlevel_ivf.cpp index 8d5ccc9d3..e28e2a946 100644 --- a/thirdparty/faiss/tests/test_lowlevel_ivf.cpp +++ b/thirdparty/faiss/tests/test_lowlevel_ivf.cpp @@ -29,8 +29,6 @@ using namespace faiss; namespace { -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; @@ -199,7 +197,7 @@ void test_lowlevel_access(const char* index_key, MetricType metric) { float computed_D = scanner->distance_to_code( codes.data() + vno * il->code_size); - EXPECT_EQ(computed_D, D[jj]); + EXPECT_FLOAT_EQ(computed_D, D[jj]); } } } diff --git 
a/thirdparty/faiss/tests/test_mem_leak.cpp b/thirdparty/faiss/tests/test_mem_leak.cpp index c6ac27b0d..2a8e41b80 100644 --- a/thirdparty/faiss/tests/test_mem_leak.cpp +++ b/thirdparty/faiss/tests/test_mem_leak.cpp @@ -40,7 +40,7 @@ TEST(MEM_LEAK, ivfflat) { double t0 = getmillisecs(); for (int i = 0; i < N2; i++) { - std::vector I(10 * bs); + std::vector I(10 * bs); std::vector D(10 * bs); tfidf_faiss_index.search( @@ -61,7 +61,7 @@ TEST(MEM_LEAK, ivfflat) { } } printf("\n"); - + // TODO: caiyd // Memory usage increases after faiss updated, this behavior is as // same as faiss-1.7.0 updated. So disable memory usage check. diff --git a/thirdparty/faiss/tests/test_merge.cpp b/thirdparty/faiss/tests/test_merge.cpp index 493666f72..7e23f15f7 100644 --- a/thirdparty/faiss/tests/test_merge.cpp +++ b/thirdparty/faiss/tests/test_merge.cpp @@ -26,7 +26,7 @@ namespace { struct Tempfilename { static pthread_mutex_t mutex; - std::string filename = "faiss_tmp_XXXXXX"; + std::string filename = "/tmp/faiss_tmp_XXXXXX"; Tempfilename() { pthread_mutex_lock(&mutex); @@ -48,7 +48,7 @@ struct Tempfilename { pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER; -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; // parameters to use for the test int d = 64; @@ -148,7 +148,7 @@ int compare_merged( // test on IVFFlat with implicit numbering TEST(MERGE, merge_flat_no_ids) { faiss::IndexShards index_shards(d); - index_shards.own_fields = true; + index_shards.own_indices = true; for (int i = 0; i < nindex; i++) { index_shards.add_shard( new faiss::IndexIVFFlat(&cd.quantizer, d, nlist)); @@ -164,7 +164,7 @@ TEST(MERGE, merge_flat_no_ids) { // test on IVFFlat, explicit ids TEST(MERGE, merge_flat) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; for (int i = 0; i < nindex; i++) { index_shards.add_shard( @@ -180,7 +180,7 @@ TEST(MERGE, merge_flat) { // test on IVFFlat and a VectorTransform TEST(MERGE, merge_flat_vt) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; // here we have to retrain because of the vectorTransform faiss::RandomRotationMatrix rot(d, d); @@ -211,7 +211,7 @@ TEST(MERGE, merge_flat_vt) { // put the merged invfile on disk TEST(MERGE, merge_flat_ondisk) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; Tempfilename filename; for (int i = 0; i < nindex; i++) { @@ -234,7 +234,7 @@ TEST(MERGE, merge_flat_ondisk) { // now use ondisk specific merge TEST(MERGE, merge_flat_ondisk_2) { faiss::IndexShards index_shards(d, false, false); - index_shards.own_fields = true; + index_shards.own_indices = true; for (int i = 0; i < nindex; i++) { index_shards.add_shard( diff --git a/thirdparty/faiss/tests/test_merge_index.py b/thirdparty/faiss/tests/test_merge_index.py new file mode 100644 index 000000000..8c4c1f091 --- /dev/null +++ b/thirdparty/faiss/tests/test_merge_index.py @@ -0,0 +1,264 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +import faiss +import numpy as np +from faiss.contrib.datasets import SyntheticDataset + +from common_faiss_tests import Randu10k + +ru = Randu10k() +xb = ru.xb +xt = ru.xt +xq = ru.xq +nb, d = xb.shape +nq, d = xq.shape + + +class TestMerge1(unittest.TestCase): + def make_index_for_merge(self, quant, index_type, master_index): + ncent = 40 + if index_type == 1: + index = faiss.IndexIVFFlat(quant, d, ncent, faiss.METRIC_L2) + if master_index: + index.is_trained = True + elif index_type == 2: + index = faiss.IndexIVFPQ(quant, d, ncent, 4, 8) + if master_index: + index.pq = master_index.pq + index.is_trained = True + elif index_type == 3: + index = faiss.IndexIVFPQR(quant, d, ncent, 4, 8, 8, 8) + if master_index: + index.pq = master_index.pq + index.refine_pq = master_index.refine_pq + index.is_trained = True + elif index_type == 4: + # quant used as the actual index + index = faiss.IndexIDMap(quant) + return index + + def do_test_merge(self, index_type): + k = 16 + quant = faiss.IndexFlatL2(d) + ref_index = self.make_index_for_merge(quant, index_type, False) + + # trains the quantizer + ref_index.train(xt) + + print('ref search') + ref_index.add(xb) + _Dref, Iref = ref_index.search(xq, k) + print(Iref[:5, :6]) + + indexes = [] + ni = 3 + for i in range(ni): + i0 = int(i * nb / ni) + i1 = int((i + 1) * nb / ni) + index = self.make_index_for_merge(quant, index_type, ref_index) + index.is_trained = True + index.add(xb[i0:i1]) + indexes.append(index) + + index = indexes[0] + + for i in range(1, ni): + print('merge ntotal=%d other.ntotal=%d ' % ( + index.ntotal, indexes[i].ntotal)) + index.merge_from(indexes[i], index.ntotal) + + _D, I = index.search(xq, k) + print(I[:5, :6]) + + ndiff = (I != Iref).sum() + print('%d / %d differences' % (ndiff, nq * k)) + assert (ndiff < nq * k / 1000.) 
+ + def test_merge(self): + self.do_test_merge(1) + self.do_test_merge(2) + self.do_test_merge(3) + + ###################################### + # remove tests that piggyback on merge + + def do_test_remove(self, index_type): + k = 16 + quant = faiss.IndexFlatL2(d) + index = self.make_index_for_merge(quant, index_type, None) + + # trains the quantizer + index.train(xt) + + if index_type < 4: + index.add(xb) + else: + gen = np.random.RandomState(1234) + id_list = gen.permutation(nb * 7)[:nb].astype('int64') + index.add_with_ids(xb, id_list) + + print('ref search ntotal=%d' % index.ntotal) + Dref, Iref = index.search(xq, k) + + toremove = np.zeros(nq * k, dtype='int64') + nr = 0 + for i in range(nq): + for j in range(k): + # remove all even results (it's ok if there are duplicates + # in the list of ids) + if Iref[i, j] % 2 == 0: + nr = nr + 1 + toremove[nr] = Iref[i, j] + + print('nr=', nr) + + idsel = faiss.IDSelectorBatch( + nr, faiss.swig_ptr(toremove)) + + for i in range(nr): + assert (idsel.is_member(int(toremove[i]))) + + nremoved = index.remove_ids(idsel) + + print('nremoved=%d ntotal=%d' % (nremoved, index.ntotal)) + + D, I = index.search(xq, k) + + # make sure results are in the same order with even ones removed + ndiff = 0 + for i in range(nq): + j2 = 0 + for j in range(k): + if Iref[i, j] % 2 != 0: + if I[i, j2] != Iref[i, j]: + ndiff += 1 + assert abs(D[i, j2] - Dref[i, j]) < 1e-5 + j2 += 1 + # draws are ordered arbitrarily + assert ndiff < 5 + + def test_remove(self): + self.do_test_remove(1) + self.do_test_remove(2) + self.do_test_remove(4) + + +# Test merge_from method for all IndexFlatCodes Types +class TestMerge2(unittest.TestCase): + + def do_flat_codes_test(self, factory_key): + ds = SyntheticDataset(32, 300, 300, 100) + index1 = faiss.index_factory(ds.d, factory_key) + index1.train(ds.get_train()) + index1.add(ds.get_database()) + _, Iref = index1.search(ds.get_queries(), 5) + index1.reset() + index2 = faiss.clone_index(index1) + index1.add(ds.get_database()[:100]) + index2.add(ds.get_database()[100:]) + index1.merge_from(index2) + _, Inew = index1.search(ds.get_queries(), 5) + np.testing.assert_array_equal(Inew, Iref) + + def test_merge_IndexFlat(self): + self.do_flat_codes_test("Flat") + + def test_merge_IndexPQ(self): + self.do_flat_codes_test("PQ8np") + + def test_merge_IndexLSH(self): + self.do_flat_codes_test("LSHr") + + def test_merge_IndexScalarQuantizer(self): + self.do_flat_codes_test("SQ4") + + def test_merge_PreTransform(self): + self.do_flat_codes_test("PCA16,SQ4") + + def do_fast_scan_test(self, factory_key, size1, with_add_id=False): + ds = SyntheticDataset(110, 1000, 1000, 100) + index_trained = faiss.index_factory(ds.d, factory_key) + index_trained.train(ds.get_train()) + # test both clone and index_read/write + if True: + index1 = faiss.deserialize_index( + faiss.serialize_index(index_trained)) + else: + index1 = faiss.clone_index(index_trained) + # assert index1.aq.qnorm.ntotal == index_trained.aq.qnorm.ntotal + + index1.add(ds.get_database()) + _, Iref = index1.search(ds.get_queries(), 5) + index1.reset() + index2 = faiss.clone_index(index_trained) + index1.add(ds.get_database()[:size1]) + index2.add(ds.get_database()[size1:]) + if with_add_id: + index1.merge_from(index2, add_id=index1.ntotal) + else: + index1.merge_from(index2) + _, Inew = index1.search(ds.get_queries(), 5) + np.testing.assert_array_equal(Inew, Iref) + + def test_merge_IndexFastScan_complete_block(self): + self.do_fast_scan_test("PQ5x4fs", 320) + + def 
test_merge_IndexFastScan_not_complete_block(self): + self.do_fast_scan_test("PQ11x4fs", 310) + + def test_merge_IndexFastScan_even_M(self): + self.do_fast_scan_test("PQ10x4fs", 500) + + def test_merge_IndexAdditiveQuantizerFastScan(self): + self.do_fast_scan_test("RQ10x4fs_32_Nrq2x4", 330) + + def test_merge_IVFFastScan(self): + self.do_fast_scan_test("IVF20,PQ5x4fs", 123, with_add_id=True) + + def do_test_with_ids(self, factory_key): + ds = SyntheticDataset(32, 300, 300, 100) + rs = np.random.RandomState(123) + ids = rs.choice(10000, ds.nb, replace=False).astype('int64') + index1 = faiss.index_factory(ds.d, factory_key) + index1.train(ds.get_train()) + index1.add_with_ids(ds.get_database(), ids) + _, Iref = index1.search(ds.get_queries(), 5) + index1.reset() + index2 = faiss.clone_index(index1) + index1.add_with_ids(ds.get_database()[:100], ids[:100]) + index2.add_with_ids(ds.get_database()[100:], ids[100:]) + index1.merge_from(index2) + _, Inew = index1.search(ds.get_queries(), 5) + np.testing.assert_array_equal(Inew, Iref) + if "IDMap2" in factory_key: + index1.check_consistency() + + def test_merge_IDMap(self): + self.do_test_with_ids("Flat,IDMap") + + def test_merge_IDMap2(self): + self.do_test_with_ids("Flat,IDMap2") + + +class TestRemoveFastScan(unittest.TestCase): + + def do_fast_scan_test(self, factory_key, size1): + ds = SyntheticDataset(110, 1000, 1000, 100) + index1 = faiss.index_factory(ds.d, factory_key) + index1.train(ds.get_train()) + index1.reset() + tokeep = [i % 3 == 0 for i in range(ds.nb)] + index1.add(ds.get_database()[tokeep]) + _, Iref = index1.search(ds.get_queries(), 5) + index1.reset() + index1.add(ds.get_database()) + index1.remove_ids(np.where(np.logical_not(tokeep))[0]) + _, Inew = index1.search(ds.get_queries(), 5) + np.testing.assert_array_equal(Inew, Iref) + + def test_remove(self): + self.do_fast_scan_test("PQ5x4fs", 320) diff --git a/thirdparty/faiss/tests/test_meta_index.py b/thirdparty/faiss/tests/test_meta_index.py index 9f3ba1c1b..d53cad48f 100644 --- a/thirdparty/faiss/tests/test_meta_index.py +++ b/thirdparty/faiss/tests/test_meta_index.py @@ -4,13 +4,14 @@ # LICENSE file in the root directory of this source tree. import os -import sys import numpy as np import faiss import unittest from common_faiss_tests import Randu10k +from faiss.contrib.datasets import SyntheticDataset + ru = Randu10k() xb = ru.xb @@ -129,139 +130,54 @@ def test_shards(self): ndiff = (I != Iref).sum() print('%d / %d differences' % (ndiff, nq * k)) - assert(ndiff < nq * k / 1000.) 
- - -class Merge(unittest.TestCase): - - def make_index_for_merge(self, quant, index_type, master_index): - ncent = 40 - if index_type == 1: - index = faiss.IndexIVFFlat(quant, d, ncent, faiss.METRIC_L2) - if master_index: - index.is_trained = True - elif index_type == 2: - index = faiss.IndexIVFPQ(quant, d, ncent, 4, 8) - if master_index: - index.pq = master_index.pq - index.is_trained = True - elif index_type == 3: - index = faiss.IndexIVFPQR(quant, d, ncent, 4, 8, 8, 8) - if master_index: - index.pq = master_index.pq - index.refine_pq = master_index.refine_pq - index.is_trained = True - elif index_type == 4: - # quant used as the actual index - index = faiss.IndexIDMap(quant) - return index - - def do_test_merge(self, index_type): - k = 16 - quant = faiss.IndexFlatL2(d) - ref_index = self.make_index_for_merge(quant, index_type, False) - - # trains the quantizer - ref_index.train(xt) - - print('ref search') - ref_index.add(xb) - _Dref, Iref = ref_index.search(xq, k) - print(Iref[:5, :6]) - - indexes = [] - ni = 3 - for i in range(ni): - i0 = int(i * nb / ni) - i1 = int((i + 1) * nb / ni) - index = self.make_index_for_merge(quant, index_type, ref_index) - index.is_trained = True - index.add(xb[i0:i1]) - indexes.append(index) - - index = indexes[0] - - for i in range(1, ni): - print('merge ntotal=%d other.ntotal=%d ' % ( - index.ntotal, indexes[i].ntotal)) - index.merge_from(indexes[i], index.ntotal) - - _D, I = index.search(xq, k) - print(I[:5, :6]) - - ndiff = (I != Iref).sum() - print('%d / %d differences' % (ndiff, nq * k)) - assert(ndiff < nq * k / 1000.) - - def test_merge(self): - self.do_test_merge(1) - self.do_test_merge(2) - self.do_test_merge(3) - - def do_test_remove(self, index_type): - k = 16 - quant = faiss.IndexFlatL2(d) - index = self.make_index_for_merge(quant, index_type, None) - - # trains the quantizer - index.train(xt) - - if index_type < 4: - index.add(xb) - else: - gen = np.random.RandomState(1234) - id_list = gen.permutation(nb * 7)[:nb].astype('int64') - index.add_with_ids(xb, id_list) - - print('ref search ntotal=%d' % index.ntotal) - Dref, Iref = index.search(xq, k) - - toremove = np.zeros(nq * k, dtype='int64') - nr = 0 - for i in range(nq): - for j in range(k): - # remove all even results (it's ok if there are duplicates - # in the list of ids) - if Iref[i, j] % 2 == 0: - nr = nr + 1 - toremove[nr] = Iref[i, j] - - print('nr=', nr) - - idsel = faiss.IDSelectorBatch( - nr, faiss.swig_ptr(toremove)) - - for i in range(nr): - assert(idsel.is_member(int(toremove[i]))) - - nremoved = index.remove_ids(idsel) - - print('nremoved=%d ntotal=%d' % (nremoved, index.ntotal)) - - D, I = index.search(xq, k) - - # make sure results are in the same order with even ones removed - ndiff = 0 - for i in range(nq): - j2 = 0 - for j in range(k): - if Iref[i, j] % 2 != 0: - if I[i, j2] != Iref[i, j]: - ndiff += 1 - assert abs(D[i, j2] - Dref[i, j]) < 1e-5 - j2 += 1 - # draws are ordered arbitrarily - assert ndiff < 5 - - def test_remove(self): - self.do_test_remove(1) - self.do_test_remove(2) - self.do_test_remove(4) - - - - - - -if __name__ == '__main__': - unittest.main() + assert (ndiff < nq * k / 1000.) 
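The test_shards_ivf cases added below exercise faiss.IndexShardsIVF, where every shard is an IVF index built on the same coarse quantizer, so all shards share the coarse assignment of the queries. A minimal sketch of that construction, with arbitrary random data and sizes (illustration only, not part of the patch):

    import faiss
    import numpy as np

    d, nb = 32, 900
    xt = np.random.rand(2000, d).astype('float32')
    xb = np.random.rand(nb, d).astype('float32')

    ref = faiss.index_factory(d, "IVF32,Flat")
    ref.train(xt)

    # shards reuse ref.quantizer; successive_ids=True numbers vectors across shards
    shards = faiss.IndexShardsIVF(ref.quantizer, ref.nlist, False, True)
    for s in range(3):
        shard = faiss.clone_index(ref)               # trained, empty IVF index
        shard.add(xb[s * nb // 3:(s + 1) * nb // 3])
        shards.add_shard(shard)

    D, I = shards.search(xb[:5], 10)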
+ + def test_shards_ivf(self): + ds = SyntheticDataset(32, 1000, 100, 20) + ref_index = faiss.index_factory(ds.d, "IVF32,SQ8") + ref_index.train(ds.get_train()) + xb = ds.get_database() + ref_index.add(ds.get_database()) + + Dref, Iref = ref_index.search(ds.get_database(), 10) + ref_index.reset() + + sharded_index = faiss.IndexShardsIVF( + ref_index.quantizer, ref_index.nlist, False, True) + for shard in range(3): + index_i = faiss.clone_index(ref_index) + index_i.add(xb[shard * nb // 3: (shard + 1)* nb // 3]) + sharded_index.add_shard(index_i) + + Dnew, Inew = sharded_index.search(ds.get_database(), 10) + + np.testing.assert_equal(Inew, Iref) + np.testing.assert_allclose(Dnew, Dref) + + def test_shards_ivf_train_add(self): + ds = SyntheticDataset(32, 1000, 600, 20) + quantizer = faiss.IndexFlatL2(ds.d) + sharded_index = faiss.IndexShardsIVF(quantizer, 40, False, False) + + for _ in range(3): + sharded_index.add_shard(faiss.index_factory(ds.d, "IVF40,Flat")) + + sharded_index.train(ds.get_train()) + sharded_index.add(ds.get_database()) + Dnew, Inew = sharded_index.search(ds.get_queries(), 10) + + index_ref = faiss.IndexIVFFlat(quantizer, ds.d, sharded_index.nlist) + index_ref.train(ds.get_train()) + index_ref.add(ds.get_database()) + Dref, Iref = index_ref.search(ds.get_queries(), 10) + np.testing.assert_equal(Inew, Iref) + np.testing.assert_allclose(Dnew, Dref) + + # mess around with the quantizer's centroids + centroids = quantizer.reconstruct_n() + centroids = centroids[::-1].copy() + quantizer.reset() + quantizer.add(centroids) + + D2, I2 = sharded_index.search(ds.get_queries(), 10) + self.assertFalse(np.all(I2 == Inew)) diff --git a/thirdparty/faiss/tests/test_ondisk_ivf.cpp b/thirdparty/faiss/tests/test_ondisk_ivf.cpp index cb5c741fd..94c23381e 100644 --- a/thirdparty/faiss/tests/test_ondisk_ivf.cpp +++ b/thirdparty/faiss/tests/test_ondisk_ivf.cpp @@ -28,7 +28,7 @@ namespace { struct Tempfilename { static pthread_mutex_t mutex; - std::string filename = "faiss_tmp_XXXXXX"; + std::string filename = "/tmp/faiss_tmp_XXXXXX"; Tempfilename() { pthread_mutex_lock(&mutex); @@ -80,10 +80,10 @@ TEST(ONDISK, make_invlists) { int ntot = 0; for (int i = 0; i < nlist; i++) { int size = ivf.list_size(i); - const faiss::Index::idx_t* ids = ivf.get_ids(i); + const faiss::idx_t* ids = ivf.get_ids(i); const uint8_t* codes = ivf.get_codes(i); for (int j = 0; j < size; j++) { - faiss::Index::idx_t id = ids[j]; + faiss::idx_t id = ids[j]; const int* ar = (const int*)&codes[code_size * j]; EXPECT_EQ(ar[0], id); EXPECT_EQ(ar[1], i); @@ -113,7 +113,7 @@ TEST(ONDISK, test_add) { faiss::float_rand(xq.data(), d * nq, 34567); std::vector ref_D(nq * k); - std::vector ref_I(nq * k); + std::vector ref_I(nq * k); index.search(nq, xq.data(), k, ref_D.data(), ref_I.data()); @@ -131,7 +131,7 @@ TEST(ONDISK, test_add) { index2.add(nb, xb.data()); std::vector new_D(nq * k); - std::vector new_I(nq * k); + std::vector new_I(nq * k); index2.search(nq, xq.data(), k, new_D.data(), new_I.data()); @@ -146,7 +146,7 @@ TEST(ONDISK, test_add) { faiss::Index* index3 = faiss::read_index(filename2.c_str()); std::vector new_D(nq * k); - std::vector new_I(nq * k); + std::vector new_I(nq * k); index3->search(nq, xq.data(), k, new_D.data(), new_I.data()); @@ -192,10 +192,10 @@ TEST(ONDISK, make_invlists_threaded) { int ntot = 0; for (int i = 0; i < nlist; i++) { int size = ivf.list_size(i); - const faiss::Index::idx_t* ids = ivf.get_ids(i); + const faiss::idx_t* ids = ivf.get_ids(i); const uint8_t* codes = ivf.get_codes(i); for (int j = 
0; j < size; j++) { - faiss::Index::idx_t id = ids[j]; + faiss::idx_t id = ids[j]; const int* ar = (const int*)&codes[code_size * j]; EXPECT_EQ(ar[0], id); EXPECT_EQ(ar[1], i); diff --git a/thirdparty/faiss/tests/test_pairs_decoding.cpp b/thirdparty/faiss/tests/test_pairs_decoding.cpp index daaa0ff31..d21136cb7 100644 --- a/thirdparty/faiss/tests/test_pairs_decoding.cpp +++ b/thirdparty/faiss/tests/test_pairs_decoding.cpp @@ -21,7 +21,7 @@ namespace { -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; /************************************************************* * Test utils @@ -171,23 +171,23 @@ int test_search_and_return_centroids(const char* index_key) { * Test entry points *************************************************************/ -TEST(test_search_centroid, IVFFlat) { +TEST(testSearchCentroid, IVFFlat) { bool ok = test_search_centroid("IVF32,Flat"); EXPECT_TRUE(ok); } -TEST(test_search_centroid, PCAIVFFlat) { +TEST(testSearchCentroid, PCAIVFFlat) { bool ok = test_search_centroid("PCA16,IVF32,Flat"); EXPECT_TRUE(ok); } -TEST(test_search_and_return_centroids, IVFFlat) { +TEST(testSearchAndReturnCentroids, IVFFlat) { int err = test_search_and_return_centroids("IVF32,Flat"); EXPECT_NE(err, 1); EXPECT_NE(err, 2); } -TEST(test_search_and_return_centroids, PCAIVFFlat) { +TEST(testSearchAndReturnCentroids, PCAIVFFlat) { int err = test_search_and_return_centroids("PCA16,IVF32,Flat"); EXPECT_NE(err, 1); EXPECT_NE(err, 2); diff --git a/thirdparty/faiss/tests/test_params_override.cpp b/thirdparty/faiss/tests/test_params_override.cpp index 1001ad339..8891d0e55 100644 --- a/thirdparty/faiss/tests/test_params_override.cpp +++ b/thirdparty/faiss/tests/test_params_override.cpp @@ -18,14 +18,15 @@ #include #include #include +#include +#include +#include #include using namespace faiss; namespace { -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; @@ -50,6 +51,8 @@ std::unique_ptr make_index( const char* index_type, MetricType metric, const std::vector& x) { + assert(x.size() % d == 0); + idx_t nb = x.size() / d; std::unique_ptr index(index_factory(d, index_type, metric)); index->train(nb, x.data()); index->add(nb, x.data()); @@ -109,6 +112,50 @@ int test_params_override(const char* index_key, MetricType metric) { return 0; } +/************************************************************* + * Test subsets + *************************************************************/ + +int test_selector(const char* index_key) { + std::vector xb = make_data(nb); // database vectors + std::vector xq = make_data(nq); + ParameterSpace ps; + + std::vector sub_xb; + std::vector kept; + for (idx_t i = 0; i < nb; i++) { + if (i % 10 == 2) { + kept.push_back(i); + sub_xb.insert( + sub_xb.end(), xb.begin() + i * d, xb.begin() + (i + 1) * d); + } + } + + // full index + auto index = make_index(index_key, METRIC_L2, xb); + ps.set_index_parameter(index.get(), "nprobe", 3); + + // restricted index + std::unique_ptr sub_index(clone_index(index.get())); + sub_index->reset(); + sub_index->add_with_ids(kept.size(), sub_xb.data(), kept.data()); + + auto ref_result = search_index(sub_index.get(), xq.data()); + + IVFSearchParameters params; + params.max_codes = 0; + params.nprobe = 3; + IDSelectorBatch sel(kept.size(), kept.data()); + params.sel = &sel; + auto new_result = search_index_with_params(index.get(), xq.data(), ¶ms); + + if (ref_result != new_result) { + return 1; + } + + return 0; +} + } // namespace /************************************************************* @@ -143,6 +190,21 
@@ TEST(TPO, IVFFlatPP) { EXPECT_EQ(err2, 0); } +TEST(TSEL, IVFFlat) { + int err = test_selector("PCA16,IVF32,Flat"); + EXPECT_EQ(err, 0); +} + +TEST(TSEL, IVFFPQ) { + int err = test_selector("PCA16,IVF32,PQ4x8np"); + EXPECT_EQ(err, 0); +} + +TEST(TSEL, IVFFSQ) { + int err = test_selector("PCA16,IVF32,SQ8"); + EXPECT_EQ(err, 0); +} + /************************************************************* * Same for binary indexes *************************************************************/ diff --git a/thirdparty/faiss/tests/test_partition.py b/thirdparty/faiss/tests/test_partition.py index 08087fb7c..02de7e8c2 100644 --- a/thirdparty/faiss/tests/test_partition.py +++ b/thirdparty/faiss/tests/test_partition.py @@ -40,6 +40,8 @@ def test_partition_fuzzy_2(self): self.do_partition(160, (70, 80)) +def pointer_to_minus1(): + return np.array([-1], dtype='int64').view("uint64") class TestPartitioningFloat(unittest.TestCase, PartitionTests): @@ -67,7 +69,7 @@ def do_partition(self, n, q, maxval=None, seed=None): ) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() faiss.CMax_float_partition_fuzzy( sp(vals), sp(ids), n, q_min, q_max, sp(q) @@ -117,7 +119,7 @@ def do_partition(self, n, q, maxval=None, seed=None): ) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() faiss.CMin_float_partition_fuzzy( sp(vals), sp(ids), n, q_min, q_max, sp(q) @@ -164,7 +166,7 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a.get(), sp(ids), n, q, q, None) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() faiss.CMax_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q_min, q_max, sp(q) @@ -213,7 +215,7 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a.get(), sp(ids), n, q, q, None) else: q_min, q_max = q - q = np.array([-1], dtype='uint64') + q = pointer_to_minus1() thresh2 = faiss.CMin_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q_min, q_max, sp(q) diff --git a/thirdparty/faiss/tests/test_partitioning.cpp b/thirdparty/faiss/tests/test_partitioning.cpp new file mode 100644 index 000000000..b719fcfe0 --- /dev/null +++ b/thirdparty/faiss/tests/test_partitioning.cpp @@ -0,0 +1,33 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include + +using namespace faiss; + +typedef AlignedTable AlignedTableUint16; + +// TODO: This test fails when Faiss is compiled with +// GCC 13.2 from conda-forge with AVX2 enabled. This may be +// a GCC bug that needs to be investigated further. +// As of 16-AUG-2023 the Faiss conda packages are built +// with GCC 11.2, so the published binaries are not affected. 
+TEST(TestPartitioning, TestPartitioningBigRange) { + auto n = 1024; + AlignedTableUint16 tab(n); + for (auto i = 0; i < n; i++) { + tab[i] = i * 64; + } + int32_t hist[16]{}; + simd_histogram_16(tab.get(), n, 0, 12, hist); + for (auto i = 0; i < 16; i++) { + ASSERT_EQ(hist[i], 64); + } +} diff --git a/thirdparty/faiss/tests/test_pq_encoding.cpp b/thirdparty/faiss/tests/test_pq_encoding.cpp index f67af5181..be09ba234 100644 --- a/thirdparty/faiss/tests/test_pq_encoding.cpp +++ b/thirdparty/faiss/tests/test_pq_encoding.cpp @@ -11,7 +11,9 @@ #include +#include #include +#include namespace { @@ -24,6 +26,15 @@ const std::vector random_vector(size_t s) { return v; } +const std::vector random_vector_float(size_t s) { + std::vector v(s, 0); + for (size_t i = 0; i < s; ++i) { + v[i] = rand(); + } + + return v; +} + } // namespace TEST(PQEncoderGeneric, encode) { @@ -91,3 +102,44 @@ TEST(PQEncoder16, encode) { EXPECT_EQ(values[i] & mask, v); } } + +TEST(PQFastScan, set_packed_element) { + int d = 20, ntotal = 1000, M = 5, nbits = 4; + const std::vector ds = random_vector_float(ntotal * d); + faiss::IndexPQFastScan index(d, M, nbits); + index.train(ntotal, ds.data()); + index.add(ntotal, ds.data()); + + for (int j = 0; j < 10; j++) { + int vector_id = rand() % ntotal; + std::vector old(ntotal * M); + std::vector code(M); + for (int i = 0; i < ntotal; i++) { + for (int sq = 0; sq < M; sq++) { + old[i * M + sq] = faiss::pq4_get_packed_element( + index.codes.data(), index.bbs, M, i, sq); + } + } + for (int sq = 0; sq < M; sq++) { + faiss::pq4_set_packed_element( + index.codes.data(), + ((old[vector_id * M + sq] + 3) % 16), + index.bbs, + M, + vector_id, + sq); + } + for (int i = 0; i < ntotal; i++) { + for (int sq = 0; sq < M; sq++) { + uint8_t newcode = faiss::pq4_get_packed_element( + index.codes.data(), index.bbs, M, i, sq); + uint8_t oldcode = old[i * M + sq]; + if (i == vector_id) { + EXPECT_EQ(newcode, (oldcode + 3) % 16); + } else { + EXPECT_EQ(newcode, oldcode); + } + } + } + } +} diff --git a/thirdparty/faiss/tests/test_product_quantizer.py b/thirdparty/faiss/tests/test_product_quantizer.py index 0b97ebdbf..1cdee7f14 100644 --- a/thirdparty/faiss/tests/test_product_quantizer.py +++ b/thirdparty/faiss/tests/test_product_quantizer.py @@ -75,6 +75,44 @@ def test_codec(self): self.do_test_codec(i + 1) +class TestPQTransposedCentroids(unittest.TestCase): + + def do_test(self, d, dsub): + M = d // dsub + pq = faiss.ProductQuantizer(d, M, 8) + xt = faiss.randn((max(1000, pq.ksub * 50), d), 123) + pq.cp.niter = 4 # to avoid timeouts in tests + pq.train(xt) + + codes = pq.compute_codes(xt) + + # enable transposed centroids table to speedup compute_codes() + pq.sync_transposed_centroids() + codes_transposed = pq.compute_codes(xt) + + # disable transposed centroids table + pq.clear_transposed_centroids() + codes_cleared = pq.compute_codes(xt) + + assert np.all(codes == codes_transposed) + assert np.all(codes == codes_cleared) + + def test_dsub2(self): + self.do_test(16, 2) + + def test_dsub5(self): + self.do_test(20, 5) + + def test_dsub2_odd(self): + self.do_test(18, 2) + + def test_dsub4(self): + self.do_test(32, 4) + + def test_dsub4_odd(self): + self.do_test(36, 4) + + class TestPQTables(unittest.TestCase): def do_test(self, d, dsub, nbit=8, metric=None): diff --git a/thirdparty/faiss/tests/test_refine.py b/thirdparty/faiss/tests/test_refine.py index 8b0d9222f..aff285f40 100644 --- a/thirdparty/faiss/tests/test_refine.py +++ b/thirdparty/faiss/tests/test_refine.py @@ -21,15 +21,22 @@ def 
do_test(self, factory_string, metric_type=faiss.METRIC_L2): index.add(ds.get_database()) xq = ds.get_queries() Dref, Iref = index.search(xq, 10) - dc = index.get_distance_computer() - self.assertTrue(dc.this.own()) - for q in range(ds.nq): - dc.set_query(faiss.swig_ptr(xq[q])) - for j in range(10): - ref_dis = Dref[q, j] - new_dis = dc(int(Iref[q, j])) - np.testing.assert_almost_equal( - new_dis, ref_dis, decimal=5) + + for is_FlatCodesDistanceComputer in False, True: + if not is_FlatCodesDistanceComputer: + dc = index.get_distance_computer() + else: + if not isinstance(index, faiss.IndexFlatCodes): + continue + dc = index.get_FlatCodesDistanceComputer() + self.assertTrue(dc.this.own()) + for q in range(ds.nq): + dc.set_query(faiss.swig_ptr(xq[q])) + for j in range(10): + ref_dis = Dref[q, j] + new_dis = dc(int(Iref[q, j])) + np.testing.assert_almost_equal( + new_dis, ref_dis, decimal=5) def test_distance_computer_PQ(self): self.do_test("PQ8np") @@ -49,5 +56,11 @@ def test_distance_computer_PQbit6_ip(self): def test_distance_computer_VT(self): self.do_test("PCA20,SQ8") + def test_distance_computer_AQ_decompress(self): + self.do_test("RQ3x4") # test decompress path + def test_distance_computer_AQ_LUT(self): + self.do_test("RQ3x4_Nqint8") # test LUT path + def test_distance_computer_AQ_LUT_IP(self): + self.do_test("RQ3x4_Nqint8", faiss.METRIC_INNER_PRODUCT) diff --git a/thirdparty/faiss/tests/test_residual_quantizer.py b/thirdparty/faiss/tests/test_residual_quantizer.py index 8886a4b55..2ca3e4c32 100644 --- a/thirdparty/faiss/tests/test_residual_quantizer.py +++ b/thirdparty/faiss/tests/test_residual_quantizer.py @@ -15,6 +15,8 @@ # Reference implementation of encoding with beam search ########################################################### +faiss.omp_set_num_threads(4) + def pairwise_distances(a, b): anorms = (a ** 2).sum(1) @@ -209,7 +211,7 @@ def test_training(self): # in practice RQ is often better than PQ but it does not the case here, so just check # that we are within some factor. 
- print(err_pq, err_rq) + # print(err_pq, err_rq) self.assertLess(err_rq, err_pq * 1.2) def test_beam_size(self): @@ -258,11 +260,99 @@ def test_training_with_limited_mem(self): for c0, c1 in zip(cb0, cb1): self.assertTrue(np.all(c0 == c1)) + def test_clipping(self): + """ verify that a clipped residual quantizer gives the same + code prefix + suffix as the full RQ """ + ds = datasets.SyntheticDataset(32, 1000, 100, 0) + + rq = faiss.ResidualQuantizer(ds.d, 5, 4) + rq.train_type = faiss.ResidualQuantizer.Train_default + rq.max_beam_size = 5 + rq.train(ds.get_train()) + + rq.max_beam_size = 1 # is not he same for a large beam size + codes = rq.compute_codes(ds.get_database()) + + rq2 = faiss.ResidualQuantizer(ds.d, 2, 4) + rq2.initialize_from(rq) + self.assertEqual(rq2.M, 2) + # verify that the beginning of the codes are the same + codes2 = rq2.compute_codes(ds.get_database()) + + rq3 = faiss.ResidualQuantizer(ds.d, 3, 4) + rq3.initialize_from(rq, 2) + self.assertEqual(rq3.M, 3) + codes3 = rq3.compute_codes(ds.get_database() - rq2.decode(codes2)) + + # verify that prefixes are the same + for i in range(ds.nb): + br = faiss.BitstringReader(faiss.swig_ptr(codes[i]), rq.code_size) + br2 = faiss.BitstringReader(faiss.swig_ptr(codes2[i]), rq2.code_size) + self.assertEqual(br.read(rq2.tot_bits), br2.read(rq2.tot_bits)) + br3 = faiss.BitstringReader(faiss.swig_ptr(codes3[i]), rq3.code_size) + self.assertEqual(br.read(rq3.tot_bits), br3.read(rq3.tot_bits)) + ########################################################### # Test index, index factory sa_encode / sa_decode ########################################################### +def unpack_codes(rq, packed_codes): + nbits = faiss.vector_to_array(rq.nbits) + if np.all(nbits == 8): + return packed_codes.astype("uint32") + nbits = [int(x) for x in nbits] + nb = len(nbits) + n, code_size = packed_codes.shape + codes = np.zeros((n, nb), dtype="uint32") + for i in range(n): + br = faiss.BitstringReader(faiss.swig_ptr(packed_codes[i]), code_size) + for j, nbi in enumerate(nbits): + codes[i, j] = br.read(nbi) + return codes + + +def retrain_AQ_codebook(index, xt): + """ reference implementation of codebook retraining """ + rq = index.rq + + codes_packed = index.sa_encode(xt) + n, code_size = codes_packed.shape + + x_decoded = index.sa_decode(codes_packed) + MSE = ((xt - x_decoded) ** 2).sum() / n + # print(f"Initial MSE on training set: {MSE:g}") + + codes = unpack_codes(index.rq, codes_packed) + # print("ref codes", codes[0]) + codebook_offsets = faiss.vector_to_array(rq.codebook_offsets) + + # build sparse code matrix (represented as a dense matrix) + C = np.zeros((n, rq.total_codebook_size)) + + for i in range(n): + C[i][codes[i] + codebook_offsets[:-1]] = 1 + + # import pdb; pdb.set_trace() + # import scipy + # B, residuals, rank, singvals = np.linalg.lstsq(C, xt, rcond=None) + if True: + B, residuals, rank, singvals = np.linalg.lstsq(C, xt, rcond=None) + else: + import scipy.linalg + B, residuals, rank, singvals = scipy.linalg.lstsq(C, xt, ) + + MSE = ((C @ B - xt) ** 2).sum() / n + # print(f"MSE after retrainining: {MSE:g}") + + # replace codebook + # faiss.copy_array_to_vector(B.astype('float32').ravel(), index.rq.codebooks) + # update codebook tables + # index.rq.compute_codebook_tables() + + return C, B + + class TestIndexResidualQuantizer(unittest.TestCase): def test_io(self): @@ -350,7 +440,6 @@ def test_factory_norm(self): index.rq.search_type, faiss.AdditiveQuantizer.ST_norm_qint8) - def test_search_decompress(self): ds = 
datasets.SyntheticDataset(32, 1000, 1000, 100) @@ -372,6 +461,55 @@ def test_search_decompress(self): # recalls are {1: 0.05, 10: 0.37, 100: 0.37} self.assertGreater(recalls[10], 0.35) + def test_reestimate_codebook(self): + ds = datasets.SyntheticDataset(32, 1000, 1000, 100) + + xt = ds.get_train() + xb = ds.get_database() + + ir = faiss.IndexResidualQuantizer(ds.d, 3, 4) + ir.train(xt) + + # ir.rq.verbose = True + xb_decoded = ir.sa_decode(ir.sa_encode(xb)) + err_before = ((xb - xb_decoded) ** 2).sum() + + # test manual call of retrain_AQ_codebook + + ref_C, ref_codebook = retrain_AQ_codebook(ir, xb) + ir.rq.retrain_AQ_codebook(len(xb), faiss.swig_ptr(xb)) + + xb_decoded = ir.sa_decode(ir.sa_encode(xb)) + err_after = ((xb - xb_decoded) ** 2).sum() + + # ref run: 8347.857 vs. 7710.014 + self.assertGreater(err_before, err_after * 1.05) + + def test_reestimate_codebook_2(self): + ds = datasets.SyntheticDataset(32, 1000, 0, 0) + xt = ds.get_train() + + ir = faiss.IndexResidualQuantizer(ds.d, 3, 4) + ir.rq.train_type = 0 + ir.train(xt) + + xt_decoded = ir.sa_decode(ir.sa_encode(xt)) + err_before = ((xt - xt_decoded) ** 2).sum() + + ir = faiss.IndexResidualQuantizer(ds.d, 3, 4) + ir.rq.train_type = faiss.ResidualQuantizer.Train_refine_codebook + ir.train(xt) + + xt_decoded = ir.sa_decode(ir.sa_encode(xt)) + err_after_refined = ((xt - xt_decoded) ** 2).sum() + + # print(err_before, err_after_refined) + # ref run 7474.98 / 7006.1777 + self.assertGreater(err_before, err_after_refined * 1.06) + + + + ########################################################### # As a coarse quantizer @@ -505,6 +643,17 @@ def test_rcq_LUT(self): np.testing.assert_array_almost_equal(CDref, CDnew, decimal=5) np.testing.assert_array_equal(CIref, CInew) + # check that you can load the index without computing the tables + quantizer.set_beam_factor(2.0) + self.assertNotEqual(quantizer.rq.codebook_cross_products.size(), 0) + quantizer3 = faiss.deserialize_index( + faiss.serialize_index(quantizer), + faiss.IO_FLAG_SKIP_PRECOMPUTE_TABLE + ) + self.assertEqual(quantizer3.rq.codebook_cross_products.size(), 0) + CD3, CI3 = quantizer3.search(ds.get_queries(), 10) + + ########################################################### # Test search with LUTs ########################################################### @@ -631,7 +780,7 @@ def test_search_L2(self): self.assertLess((Iref != I2).sum(), Iref.size * 0.05) else: inter_2 = faiss.eval_intersection(I2, gt) - self.assertGreater(inter_ref, inter_2) + self.assertGreaterEqual(inter_ref, inter_2) # print(st, inter_ref, inter_2) @@ -788,9 +937,9 @@ def test_residual_IP(self): def precomp_codebooks(codebooks): - + M = len(codebooks) codebook_cross_prods = [ - [c1 @ c2.T for c1 in codebooks] for c2 in codebooks + [codebooks[m1] @ codebooks[m].T for m1 in range(m)] for m in range(M) ] cent_norms = [ (c ** 2).sum(1) @@ -800,7 +949,7 @@ def precomp_codebooks(codebooks): ############################################################ -# Reference imelementation of table-based beam search +# Reference imelementation of table-based beam search (use_beam_LUT=1) ############################################################ def beam_search_encode_step_tab(codes, L, distances, codebook_cross_prods_i, @@ -951,28 +1100,29 @@ def test_precomp(self): precomp = precomp_codebooks(codebooks) codebook_cross_prods_ref, cent_norms_ref = precomp - # check C++ precomp tables - codebook_cross_prods_ref = np.hstack([ - np.vstack(c) for c in codebook_cross_prods_ref]) + # validate that the python tab-based encoding 
works + xb = ds.get_database() + ref_codes, _, _ = beam_search_encoding_ref(codebooks, xb, 7) + new_codes, _ = beam_search_encoding_tab(codebooks, xb, 7, precomp) + np.testing.assert_array_equal(ref_codes, new_codes) + # check C++ precomp tables rq.compute_codebook_tables() codebook_cross_prods = faiss.vector_to_array( rq.codebook_cross_products) - codebook_cross_prods = codebook_cross_prods.reshape( - rq.total_codebook_size, rq.total_codebook_size) - cent_norms = faiss.vector_to_array(rq.cent_norms) + ofs = 0 + for m in range(1, rq.M): + py_table = np.vstack(codebook_cross_prods_ref[m]) + kk = rq.codebook_offsets.at(m) + K = 1 << rq.nbits.at(m) + cpp_table = codebook_cross_prods[ofs:ofs + K * kk].reshape(kk, K) + ofs += kk * K + np.testing.assert_allclose(py_table, cpp_table, rtol=2e-4) - np.testing.assert_array_almost_equal( - codebook_cross_prods, codebook_cross_prods_ref, decimal=5) + cent_norms = faiss.vector_to_array(rq.cent_norms) np.testing.assert_array_almost_equal( np.hstack(cent_norms_ref), cent_norms, decimal=5) - # validate that the python tab-based encoding works - xb = ds.get_database() - ref_codes, _, _ = beam_search_encoding_ref(codebooks, xb, 7) - new_codes, _ = beam_search_encoding_tab(codebooks, xb, 7, precomp) - np.testing.assert_array_equal(ref_codes, new_codes) - # validate the C++ beam_search_encode_step_tab function beam_search_encoding_tab(codebooks, xb, 7, precomp, implem="ref cpp") @@ -993,3 +1143,147 @@ def test_precomp(self): rq.use_beam_LUT = 1 codes_new = rq.compute_codes(xb) np.testing.assert_array_equal(codes_ref_residuals, codes_new) + + +class TestProductResidualQuantizer(unittest.TestCase): + + def test_codec(self): + """check that the error is in the same ballpark as PQ.""" + ds = datasets.SyntheticDataset(64, 3000, 3000, 0) + + xt = ds.get_train() + xb = ds.get_database() + + nsplits = 2 + Msub = 2 + nbits = 4 + + prq = faiss.ProductResidualQuantizer(ds.d, nsplits, Msub, nbits) + prq.train(xt) + err_prq = eval_codec(prq, xb) + + pq = faiss.ProductQuantizer(ds.d, nsplits * Msub, nbits) + pq.train(xt) + err_pq = eval_codec(pq, xb) + + # print(err_prq, err_pq) + self.assertLess(err_prq, err_pq) + + def test_with_rq(self): + """compare with RQ when nsplits = 1""" + ds = datasets.SyntheticDataset(32, 3000, 3000, 0) + + xt = ds.get_train() + xb = ds.get_database() + + M = 4 + nbits = 4 + + prq = faiss.ProductResidualQuantizer(ds.d, 1, M, nbits) + prq.train(xt) + err_prq = eval_codec(prq, xb) + + rq = faiss.ResidualQuantizer(ds.d, M, nbits) + rq.train(xt) + err_rq = eval_codec(rq, xb) + + # print(err_prq, err_rq) + self.assertEqual(err_prq, err_rq) + + +class TestIndexProductResidualQuantizer(unittest.TestCase): + + def test_accuracy1(self): + """check that the error is in the same ballpark as RQ.""" + recall1 = self.eval_index_accuracy("PRQ4x3x5_Nqint8") + recall2 = self.eval_index_accuracy("RQ12x5_Nqint8") + self.assertGreaterEqual(recall1 * 1.1, recall2) # 657 vs 665 + + def test_accuracy2(self): + """when nsplits = 1, PRQ should be the same as RQ""" + recall1 = self.eval_index_accuracy("PRQ1x3x5_Nqint8") + recall2 = self.eval_index_accuracy("RQ3x5_Nqint8") + self.assertEqual(recall1, recall2) + + def eval_index_accuracy(self, index_key): + ds = datasets.SyntheticDataset(32, 1000, 1000, 100) + index = faiss.index_factory(ds.d, index_key) + + index.train(ds.get_train()) + index.add(ds.get_database()) + D, I = index.search(ds.get_queries(), 10) + inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) + + # do a little I/O test + index2 = 
faiss.deserialize_index(faiss.serialize_index(index)) + D2, I2 = index2.search(ds.get_queries(), 10) + np.testing.assert_array_equal(I2, I) + np.testing.assert_array_equal(D2, D) + + return inter + + def test_factory(self): + AQ = faiss.AdditiveQuantizer + ns, Msub, nbits = 2, 4, 8 + index = faiss.index_factory(64, f"PRQ{ns}x{Msub}x{nbits}_Nqint8") + assert isinstance(index, faiss.IndexProductResidualQuantizer) + self.assertEqual(index.prq.nsplits, ns) + self.assertEqual(index.prq.subquantizer(0).M, Msub) + self.assertEqual(index.prq.subquantizer(0).nbits.at(0), nbits) + self.assertEqual(index.prq.search_type, AQ.ST_norm_qint8) + + code_size = (ns * Msub * nbits + 7) // 8 + 1 + self.assertEqual(index.prq.code_size, code_size) + + +class TestIndexIVFProductResidualQuantizer(unittest.TestCase): + + def eval_index_accuracy(self, factory_key): + ds = datasets.SyntheticDataset(32, 1000, 1000, 100) + index = faiss.index_factory(ds.d, factory_key) + + index.train(ds.get_train()) + index.add(ds.get_database()) + + inters = [] + for nprobe in 1, 2, 5, 10, 20, 50: + index.nprobe = nprobe + D, I = index.search(ds.get_queries(), 10) + inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) + inters.append(inter) + + inters = np.array(inters) + # 1.05: test relaxed for OSX on ARM + self.assertTrue(np.all(inters[1:] * 1.05 >= inters[:-1])) + + # do a little I/O test + index2 = faiss.deserialize_index(faiss.serialize_index(index)) + D2, I2 = index2.search(ds.get_queries(), 10) + np.testing.assert_array_equal(I2, I) + np.testing.assert_array_equal(D2, D) + + return inter + + def test_index_accuracy(self): + self.eval_index_accuracy("IVF100,PRQ2x2x5_Nqint8") + + def test_index_accuracy2(self): + """check that the error is in the same ballpark as RQ.""" + inter1 = self.eval_index_accuracy("IVF100,PRQ2x2x5_Nqint8") + inter2 = self.eval_index_accuracy("IVF100,RQ4x5_Nqint8") + # print(inter1, inter2) # 392 vs 374 + self.assertGreaterEqual(inter1 * 1.1, inter2) + + def test_factory(self): + AQ = faiss.AdditiveQuantizer + ns, Msub, nbits = 2, 4, 8 + index = faiss.index_factory(64, f"IVF100,PRQ{ns}x{Msub}x{nbits}_Nqint8") + assert isinstance(index, faiss.IndexIVFProductResidualQuantizer) + self.assertEqual(index.nlist, 100) + self.assertEqual(index.prq.nsplits, ns) + self.assertEqual(index.prq.subquantizer(0).M, Msub) + self.assertEqual(index.prq.subquantizer(0).nbits.at(0), nbits) + self.assertEqual(index.prq.search_type, AQ.ST_norm_qint8) + + code_size = (ns * Msub * nbits + 7) // 8 + 1 + self.assertEqual(index.prq.code_size, code_size) diff --git a/thirdparty/faiss/tests/test_rowwise_minmax.py b/thirdparty/faiss/tests/test_rowwise_minmax.py new file mode 100644 index 000000000..dbd14de38 --- /dev/null +++ b/thirdparty/faiss/tests/test_rowwise_minmax.py @@ -0,0 +1,56 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np + +import faiss +import unittest + +from common_faiss_tests import get_dataset_2 + + +class TestIndexRowwiseMinmax(unittest.TestCase): + def compare_train_vs_train_inplace(self, factory_key): + d = 96 + nb = 1000 + nq = 0 + nt = 2000 + + xt, x, _ = get_dataset_2(d, nt, nb, nq) + + assert x.size > 0 + + codec = faiss.index_factory(d, factory_key) + + # use the regular .train() + codec.train(xt) + codes_train = codec.sa_encode(x) + + decoded = codec.sa_decode(codes_train) + + # use .train_inplace() + xt_cloned = np.copy(xt) + codec.train_inplace(xt_cloned) + codes_train_inplace = codec.sa_encode(x) + + # compare .train and .train_inplace codes + n_diff = (codes_train != codes_train_inplace).sum() + self.assertEqual(n_diff, 0) + + # make sure that the array used for .train_inplace got affected + n_diff_xt = (xt_cloned != xt).sum() + self.assertNotEqual(n_diff_xt, 0) + + # make sure that the reconstruction error is not crazy + reconstruction_err = ((x - decoded) ** 2).sum() + print(reconstruction_err) + + self.assertLess(reconstruction_err, 0.6) + + def test_fp32(self) -> None: + self.compare_train_vs_train_inplace("MinMax,SQ8") + + def test_fp16(self) -> None: + self.compare_train_vs_train_inplace("MinMaxFP16,SQ8") diff --git a/thirdparty/faiss/tests/test_search_params.py b/thirdparty/faiss/tests/test_search_params.py new file mode 100644 index 000000000..d832a07cf --- /dev/null +++ b/thirdparty/faiss/tests/test_search_params.py @@ -0,0 +1,468 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np + +import faiss +import unittest +import sys +import gc + +from faiss.contrib import datasets +from faiss.contrib.evaluation import sort_range_res_2, check_ref_range_results + +faiss.omp_set_num_threads(4) + + +class TestSelector(unittest.TestCase): + """ + Test the IDSelector filtering for as many (index class, id selector class) + combinations as possible. + """ + + def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2): + """ Verify that the id selector returns the subset of results that are + members according to the IDSelector. 
+ Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" + """ + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + index = faiss.index_factory(ds.d, index_key, mt) + index.train(ds.get_train()) + k = 10 + + # reference result + if "range" in id_selector_type: + subset = np.arange(30, 80).astype('int64') + elif id_selector_type == "or": + lhs_rs = np.random.RandomState(123) + lhs_subset = lhs_rs.choice(ds.nb, 50, replace=False).astype("int64") + rhs_rs = np.random.RandomState(456) + rhs_subset = rhs_rs.choice(ds.nb, 20, replace=False).astype("int64") + subset = np.union1d(lhs_subset, rhs_subset) + elif id_selector_type == "and": + lhs_rs = np.random.RandomState(123) + lhs_subset = lhs_rs.choice(ds.nb, 50, replace=False).astype("int64") + rhs_rs = np.random.RandomState(456) + rhs_subset = rhs_rs.choice(ds.nb, 10, replace=False).astype("int64") + subset = np.intersect1d(lhs_subset, rhs_subset) + elif id_selector_type == "xor": + lhs_rs = np.random.RandomState(123) + lhs_subset = lhs_rs.choice(ds.nb, 50, replace=False).astype("int64") + rhs_rs = np.random.RandomState(456) + rhs_subset = rhs_rs.choice(ds.nb, 40, replace=False).astype("int64") + subset = np.setxor1d(lhs_subset, rhs_subset) + else: + rs = np.random.RandomState(123) + subset = rs.choice(ds.nb, 50, replace=False).astype("int64") + # add_with_ids not supported for all index types + # index.add_with_ids(ds.get_database()[subset], subset) + index.add(ds.get_database()[subset]) + if "IVF" in index_key and id_selector_type == "range_sorted": + self.assertTrue(index.check_ids_sorted()) + Dref, Iref0 = index.search(ds.get_queries(), k) + Iref = subset[Iref0] + Iref[Iref0 < 0] = -1 + + radius = float(Dref[Iref > 0].max()) * 1.01 + try: + Rlims_ref, RDref, RIref = index.range_search( + ds.get_queries(), radius) + except RuntimeError as e: + if "not implemented" in str(e): + have_range_search = False + else: + raise + else: + RIref = subset[RIref] + # normalize the range search results + RDref, RIref = sort_range_res_2(Rlims_ref, RDref, RIref) + have_range_search = True + + # result with selector: fill full database and search with selector + index.reset() + index.add(ds.get_database()) + if id_selector_type == "range": + sel = faiss.IDSelectorRange(30, 80) + elif id_selector_type == "range_sorted": + sel = faiss.IDSelectorRange(30, 80, True) + elif id_selector_type == "array": + sel = faiss.IDSelectorArray(subset) + elif id_selector_type == "bitmap": + bitmap = np.zeros(ds.nb, dtype=bool) + bitmap[subset] = True + bitmap = np.packbits(bitmap, bitorder='little') + sel = faiss.IDSelectorBitmap(bitmap) + elif id_selector_type == "not": + ssubset = set(subset) + inverse_subset = np.array([ + i for i in range(ds.nb) + if i not in ssubset + ]).astype('int64') + sel = faiss.IDSelectorNot(faiss.IDSelectorBatch(inverse_subset)) + elif id_selector_type == "or": + sel = faiss.IDSelectorOr( + faiss.IDSelectorBatch(lhs_subset), + faiss.IDSelectorBatch(rhs_subset) + ) + elif id_selector_type == "and": + sel = faiss.IDSelectorAnd( + faiss.IDSelectorBatch(lhs_subset), + faiss.IDSelectorBatch(rhs_subset) + ) + elif id_selector_type == "xor": + sel = faiss.IDSelectorXOr( + faiss.IDSelectorBatch(lhs_subset), + faiss.IDSelectorBatch(rhs_subset) + ) + else: + sel = faiss.IDSelectorBatch(subset) + + params = ( + faiss.SearchParametersIVF(sel=sel) if "IVF" in index_key else + faiss.SearchParametersPQ(sel=sel) if "PQ" in index_key else + faiss.SearchParameters(sel=sel) + ) + Dnew, Inew = index.search(ds.get_queries(), k, 
params=params) + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_almost_equal(Dref, Dnew, decimal=5) + + if have_range_search: + Rlims_new, RDnew, RInew = index.range_search( + ds.get_queries(), radius, params=params) + np.testing.assert_array_equal(Rlims_ref, Rlims_new) + RDref, RIref = sort_range_res_2(Rlims_ref, RDref, RIref) + np.testing.assert_array_equal(RIref, RInew) + np.testing.assert_almost_equal(RDref, RDnew, decimal=5) + + def test_IVFFlat(self): + self.do_test_id_selector("IVF32,Flat") + + def test_IVFFlat_range_sorted(self): + self.do_test_id_selector("IVF32,Flat", id_selector_type="range_sorted") + + def test_IVFPQ(self): + self.do_test_id_selector("IVF32,PQ4x4np") + + def test_IVFSQ(self): + self.do_test_id_selector("IVF32,SQ8") + + def test_pretrans(self): + self.do_test_id_selector("PCA16,IVF32,Flat") + + def test_SQ(self): + self.do_test_id_selector("SQ8") + + def test_Flat(self): + self.do_test_id_selector("Flat") + + def test_Flat_IP(self): + self.do_test_id_selector("Flat", mt=faiss.METRIC_INNER_PRODUCT) + + def test_Flat_id_range(self): + self.do_test_id_selector("Flat", id_selector_type="range") + + def test_Flat_IP_id_range(self): + self.do_test_id_selector( + "Flat", id_selector_type="range", + mt=faiss.METRIC_INNER_PRODUCT + ) + + def test_Flat_id_array(self): + self.do_test_id_selector("Flat", id_selector_type="array") + + def test_Flat_IP_id_array(self): + self.do_test_id_selector( + "Flat", id_selector_type="array", + mt=faiss.METRIC_INNER_PRODUCT + ) + + def test_Flat_id_bitmap(self): + self.do_test_id_selector("Flat", id_selector_type="bitmap") + + def test_Flat_id_not(self): + self.do_test_id_selector("Flat", id_selector_type="not") + + def test_Flat_id_or(self): + self.do_test_id_selector("Flat", id_selector_type="or") + + # not implemented + + # def test_PQ(self): + # self.do_test_id_selector("PQ4x4np") + + # def test_AQ(self): + # self.do_test_id_selector("RQ3x4") + + def do_test_id_selector_weak(self, index_key): + """ verify that the selected subset is the subset in the list""" + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + index = faiss.index_factory(ds.d, index_key) + index.train(ds.get_train()) + index.add(ds.get_database()) + k = 10 + Dref, Iref = index.search(ds.get_queries(), k) + + # reference result + rs = np.random.RandomState(123) + subset = rs.choice(ds.nb, 50, replace=False).astype("int64") + sel = faiss.IDSelectorBatch(subset) + params = faiss.SearchParametersHNSW() + params.sel = sel + Dnew, Inew = index.search(ds.get_queries(), k, params=params) + mask = np.zeros(ds.nb, dtype=bool) + mask[subset] = True + for q in range(len(Iref)): + mask_q, = np.where(mask[Iref[q]]) + l = len(mask_q) + np.testing.assert_array_equal(Iref[q, mask_q], Inew[q, :l]) + np.testing.assert_array_equal(Dref[q, mask_q], Dnew[q, :l]) + + def test_HSNW(self): + self.do_test_id_selector_weak("HNSW") + + def test_idmap(self): + ds = datasets.SyntheticDataset(32, 100, 100, 20) + rs = np.random.RandomState(123) + ids = rs.choice(10000, size=100, replace=False) + mask = ids % 2 == 0 + index = faiss.index_factory(ds.d, "IDMap,SQ8") + index.train(ds.get_train()) + + # ref result + index.add_with_ids(ds.get_database()[mask], ids[mask]) + Dref, Iref = index.search(ds.get_queries(), 10) + + # with selector + index.reset() + index.add_with_ids(ds.get_database(), ids) + + valid_ids = ids[mask] + sel = faiss.IDSelectorTranslated( + index, faiss.IDSelectorBatch(valid_ids)) + + Dnew, Inew = index.search( + ds.get_queries(), 10, + 
params=faiss.SearchParameters(sel=sel)
+        )
+        np.testing.assert_array_equal(Iref, Inew)
+        np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
+
+        # let the IDMap::search add the translation...
+        Dnew, Inew = index.search(
+            ds.get_queries(), 10,
+            params=faiss.SearchParameters(sel=faiss.IDSelectorBatch(valid_ids))
+        )
+        np.testing.assert_array_equal(Iref, Inew)
+        np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
+
+
+class TestSearchParams(unittest.TestCase):
+
+    def do_test_with_param(
+            self, index_key, ps_params, params):
+        """
+        Test equivalence between setting
+        1. param_name_2 = value with ParameterSpace
+        2. pass in a SearchParameters with param_name = value
+        """
+        ds = datasets.SyntheticDataset(32, 1000, 100, 20)
+        index = faiss.index_factory(ds.d, index_key)
+        if index_key.startswith("PQ"):
+            index.polysemous_training.n_iter = 50000
+            index.polysemous_training.n_redo = 1
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+
+        I0, D0 = index.search(ds.get_queries(), 10)
+
+        Dnew, Inew = index.search(ds.get_queries(), 10, params=params)
+
+        # make sure the parameter does indeed change the result...
+        self.assertFalse(np.all(Inew == I0))
+
+        for param_name, value in ps_params.items():
+            faiss.ParameterSpace().set_index_parameter(
+                index, param_name, value)
+        Dref, Iref = index.search(ds.get_queries(), 10)
+
+        np.testing.assert_array_equal(Iref, Inew)
+        np.testing.assert_array_equal(Dref, Dnew)
+
+    def test_nprobe(self):
+        self.do_test_with_param(
+            "IVF32,Flat", {"nprobe": 3},
+            faiss.SearchParametersIVF(nprobe=3))
+
+    def test_efSearch(self):
+        self.do_test_with_param(
+            "HNSW", {"efSearch": 4},
+            faiss.SearchParametersHNSW(efSearch=4))
+
+    def test_quantizer_hnsw(self):
+        self.do_test_with_param(
+            "IVF200_HNSW,Flat",
+            {"quantizer_efSearch": 5, "nprobe": 10},
+            faiss.SearchParametersIVF(
+                nprobe=10,
+                quantizer_params=faiss.SearchParametersHNSW(
+                    efSearch=5)
+            )
+        )
+
+    def test_PQ_polysemous_ht(self):
+        self.do_test_with_param(
+            "PQ4x8",
+            {"ht": 10},
+            faiss.SearchParametersPQ(
+                polysemous_ht=10,
+                search_type=faiss.IndexPQ.ST_polysemous
+            )
+        )
+
+    def test_max_codes(self):
+        " tests whether the max nb codes is taken into account "
+        ds = datasets.SyntheticDataset(32, 1000, 100, 20)
+        index = faiss.index_factory(ds.d, "IVF32,Flat")
+        index.train(ds.get_train())
+        index.add(ds.get_database())
+
+        stats = faiss.cvar.indexIVF_stats
+        stats.reset()
+        D0, I0 = index.search(
+            ds.get_queries(), 10,
+            params=faiss.SearchParametersIVF(nprobe=8)
+        )
+        ndis0 = stats.ndis
+        target_ndis = ndis0 // ds.nq  # a few queries will be below, a few above
+        for q in range(ds.nq):
+            stats.reset()
+            Dq, Iq = index.search(
+                ds.get_queries()[q:q + 1], 10,
+                params=faiss.SearchParametersIVF(
+                    nprobe=8, max_codes=target_ndis
+                )
+            )
+            self.assertLessEqual(stats.ndis, target_ndis)
+            if stats.ndis < target_ndis:
+                np.testing.assert_equal(I0[q], Iq[0])
+
+    def test_ownership(self):
+        # see https://github.com/facebookresearch/faiss/issues/2996
+        subset = np.arange(0, 50)
+        sel = faiss.IDSelectorBatch(subset)
+        self.assertTrue(sel.this.own())
+        params = faiss.SearchParameters(sel=sel)
+        self.assertTrue(sel.this.own())  # otherwise mem leak!
+        # this is a somewhat fragile test because it assumes the
+        # gc decreases refcounts immediately.
+ prev_count = sys.getrefcount(sel) + del params + new_count = sys.getrefcount(sel) + self.assertEqual(new_count, prev_count - 1) + + # check for other objects as well + sel1 = faiss.IDSelectorBatch([1, 2, 3]) + sel2 = faiss.IDSelectorBatch([4, 5, 6]) + sel = faiss.IDSelectorAnd(sel1, sel2) + # make storage is still managed by python + self.assertTrue(sel1.this.own()) + self.assertTrue(sel2.this.own()) + + +class TestSelectorCallback(unittest.TestCase): + + def test(self): + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + index = faiss.index_factory(ds.d, "IVF32,Flat") + index.train(ds.get_train()) + k = 10 + rs = np.random.RandomState(123) + subset = rs.choice(ds.nb, 50, replace=False) + + params = faiss.SearchParametersIVF( + sel=faiss.IDSelectorBatch(subset), + nprobe=4 + ) + + Dref, Iref = index.search(ds.get_queries(), k, params=params) + + def is_member(idx): + return idx in subset + + params = faiss.SearchParametersIVF( + sel=faiss.PyCallbackIDSelector(is_member), + nprobe=4 + ) + + Dnew, Inew = index.search(ds.get_queries(), k, params=params) + + np.testing.assert_array_equal(Iref, Inew) + np.testing.assert_almost_equal(Dref, Dnew, decimal=5) + + +class TestSortedIDSelectorRange(unittest.TestCase): + """ to test the sorted id bounds, there are a few cases to consider """ + + def do_test_sorted(self, imin, imax, n=100): + selr = faiss.IDSelectorRange(imin, imax, True) + sp = faiss.swig_ptr + for seed in range(10): + rs = np.random.RandomState(seed) + ids = rs.choice(30, n).astype('int64') + ids.sort() + j01 = np.zeros(2, dtype='uint64') + selr.find_sorted_ids_bounds( + len(ids), sp(ids), sp(j01[:1]), sp(j01[1:])) + j0, j1 = j01.astype(int) + ref_idx, = np.where((ids >= imin) & (ids < imax)) + np.testing.assert_array_equal(ref_idx, np.arange(j0, j1)) + + def test_sorted_in_range(self): + self.do_test_sorted(10, 20) + + def test_sorted_out_0(self): + self.do_test_sorted(-10, 20) + + def test_sorted_out_1(self): + self.do_test_sorted(10, 40) + + def test_sorted_in_range_smalln(self): + self.do_test_sorted(10, 20, n=5) + + def test_12_92(self): + selr = faiss.IDSelectorRange(30, 80, True) + ids = np.array([12, 92], dtype='int64') + j01 = np.zeros(2, dtype='uint64') + sp = faiss.swig_ptr + selr.find_sorted_ids_bounds( + len(ids), sp(ids), sp(j01[:1]), sp(j01[1:])) + print(j01) + assert j01[0] >= j01[1] + + +class TestPrecomputed(unittest.TestCase): + + def test_knn_and_range(self): + ds = datasets.SyntheticDataset(32, 1000, 100, 20) + index = faiss.index_factory(ds.d, "IVF32,Flat") + index.train(ds.get_train()) + index.add(ds.get_database()) + index.nprobe = 5 + Dref, Iref = index.search(ds.get_queries(), 10) + + Dq, Iq = index.quantizer.search(ds.get_queries(), index.nprobe) + Dnew, Inew = index.search_preassigned(ds.get_queries(), 10, Iq, Dq) + np.testing.assert_equal(Iref, Inew) + np.testing.assert_equal(Dref, Dnew) + + r2 = float(np.median(Dref[:, 5])) + Lref, Dref, Iref = index.range_search(ds.get_queries(), r2) + assert Lref.size > 10 # make sure there is something to test... + + Lnew, Dnew, Inew = index.range_search_preassigned(ds.get_queries(), r2, Iq, Dq) + check_ref_range_results( + Lref, Dref, Iref, + Lnew, Dnew, Inew + ) diff --git a/thirdparty/faiss/tests/test_simdlib.cpp b/thirdparty/faiss/tests/test_simdlib.cpp new file mode 100644 index 000000000..58ebc8585 --- /dev/null +++ b/thirdparty/faiss/tests/test_simdlib.cpp @@ -0,0 +1,264 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. 
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <faiss/utils/simdlib.h>
+
+using namespace faiss;
+
+TEST(TestSIMDLib, TestCmpltAndBlendInplace) {
+    simd8float32 lowestValues(0, 1, 2, 3, 4, 5, 6, 7);
+    simd8uint32 lowestIndices(0, 1, 2, 3, 4, 5, 6, 7);
+
+    simd8float32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5);
+    simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17);
+    cmplt_and_blend_inplace(
+            candidateValues0, candidateIndices0, lowestValues, lowestIndices);
+
+    simd8float32 candidateValues1(6, 6, 6, 6, 6, 6, 6, 6);
+    simd8uint32 candidateIndices1(20, 21, 22, 23, 24, 25, 26, 27);
+    cmplt_and_blend_inplace(
+            candidateValues1, candidateIndices1, lowestValues, lowestIndices);
+
+    simd8float32 candidateValues2(0, 1, 2, 3, 4, 5, 5, 5);
+    simd8uint32 candidateIndices2(30, 31, 32, 33, 34, 35, 36, 37);
+    cmplt_and_blend_inplace(
+            candidateValues2, candidateIndices2, lowestValues, lowestIndices);
+
+    simd8float32 expectedValues(0, 1, 2, 3, 4, 5, 5, 5);
+    simd8uint32 expectedIndices(0, 1, 2, 3, 4, 5, 16, 17);
+    ASSERT_TRUE(lowestValues.is_same_as(expectedValues));
+    ASSERT_TRUE(lowestIndices.is_same_as(expectedIndices));
+}
+
+TEST(TestSIMDLib, TestCmpltMinMaxFloat) {
+    simd8float32 minValues(0, 0, 0, 0, 0, 0, 0, 0);
+    simd8uint32 minIndices(0, 0, 0, 0, 0, 0, 0, 0);
+    simd8float32 maxValues(0, 0, 0, 0, 0, 0, 0, 0);
+    simd8uint32 maxIndices(0, 0, 0, 0, 0, 0, 0, 0);
+
+    simd8float32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5);
+    simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17);
+    simd8float32 currentValues0(0, 1, 2, 3, 4, 5, 6, 7);
+    simd8uint32 currentIndices0(0, 1, 2, 3, 4, 5, 6, 7);
+
+    cmplt_min_max_fast(
+            candidateValues0,
+            candidateIndices0,
+            currentValues0,
+            currentIndices0,
+            minValues,
+            minIndices,
+            maxValues,
+            maxIndices);
+
+    simd8float32 expectedMinValues(0, 1, 2, 3, 4, 5, 5, 5);
+    simd8uint32 expectedMinIndices(0, 1, 2, 3, 4, 5, 16, 17);
+    ASSERT_TRUE(minValues.is_same_as(expectedMinValues));
+    ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices));
+
+    simd8float32 expectedMaxValues(5, 5, 5, 5, 5, 5, 6, 7);
+    // the result is not 10,11,12,13,14,5,6,7 because it is _fast version
+    simd8uint32 expectedMaxIndices(10, 11, 12, 13, 14, 15, 6, 7);
+    ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues));
+    ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices));
+}
+
+TEST(TestSIMDLib, TestCmpltMinMaxInt) {
+    simd8uint32 minValues(0, 0, 0, 0, 0, 0, 0, 0);
+    simd8uint32 minIndices(0, 0, 0, 0, 0, 0, 0, 0);
+    simd8uint32 maxValues(0, 0, 0, 0, 0, 0, 0, 0);
+    simd8uint32 maxIndices(0, 0, 0, 0, 0, 0, 0, 0);
+
+    simd8uint32 candidateValues0(5, 5, 5, 5, 5, 5, 5, 5);
+    simd8uint32 candidateIndices0(10, 11, 12, 13, 14, 15, 16, 17);
+    simd8uint32 currentValues0(0, 1, 2, 3, 4, 5, 6, 7);
+    simd8uint32 currentIndices0(0, 1, 2, 3, 4, 5, 6, 7);
+
+    cmplt_min_max_fast(
+            candidateValues0,
+            candidateIndices0,
+            currentValues0,
+            currentIndices0,
+            minValues,
+            minIndices,
+            maxValues,
+            maxIndices);
+
+    simd8uint32 expectedMinValues(0, 1, 2, 3, 4, 5, 5, 5);
+    simd8uint32 expectedMinIndices(0, 1, 2, 3, 4, 5, 16, 17);
+    ASSERT_TRUE(minValues.is_same_as(expectedMinValues));
+    ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices));
+
+    simd8uint32 expectedMaxValues(5, 5, 5, 5, 5, 5, 6, 7);
+    // the result is not 10,11,12,13,14,5,6,7 because it is _fast version
+    simd8uint32 expectedMaxIndices(10, 11, 12, 13, 14, 15, 6, 7);
+    ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues));
+
ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices)); +} + +TEST(TestSIMDLib, TestCmpltMinMaxInt16) { + simd16uint16 minValues(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd16uint16 minIndices(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd16uint16 maxValues(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + simd16uint16 maxIndices(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + simd16uint16 candidateValues0( + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 5, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005); + simd16uint16 candidateIndices0( + 10, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 1010, + 1011, + 1012, + 1013, + 1014, + 1015, + 1016, + 1017); + simd16uint16 currentValues0( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1006, + 1007); + simd16uint16 currentIndices0( + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1006, + 1007); + + cmplt_min_max_fast( + candidateValues0, + candidateIndices0, + currentValues0, + currentIndices0, + minValues, + minIndices, + maxValues, + maxIndices); + + simd16uint16 expectedMinValues( + 0, + 1, + 2, + 3, + 4, + 5, + 5, + 5, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1005, + 1005); + simd16uint16 expectedMinIndices( + 0, + 1, + 2, + 3, + 4, + 5, + 16, + 17, + 1000, + 1001, + 1002, + 1003, + 1004, + 1005, + 1016, + 1017); + ASSERT_TRUE(minValues.is_same_as(expectedMinValues)); + ASSERT_TRUE(minIndices.is_same_as(expectedMinIndices)); + + simd16uint16 expectedMaxValues( + 5, + 5, + 5, + 5, + 5, + 5, + 6, + 7, + 1005, + 1005, + 1005, + 1005, + 1005, + 1005, + 1006, + 1007); + // the result is not 10,11,12,13,14,5,6,7 because it is _fast version + simd16uint16 expectedMaxIndices( + 10, + 11, + 12, + 13, + 14, + 15, + 6, + 7, + 1010, + 1011, + 1012, + 1013, + 1014, + 1015, + 1006, + 1007); + ASSERT_TRUE(maxValues.is_same_as(expectedMaxValues)); + ASSERT_TRUE(maxIndices.is_same_as(expectedMaxIndices)); +} diff --git a/thirdparty/faiss/tests/test_sliding_ivf.cpp b/thirdparty/faiss/tests/test_sliding_ivf.cpp index 851713c52..ea9e53d6b 100644 --- a/thirdparty/faiss/tests/test_sliding_ivf.cpp +++ b/thirdparty/faiss/tests/test_sliding_ivf.cpp @@ -22,8 +22,6 @@ using namespace faiss; -typedef Index::idx_t idx_t; - // dimension of the vectors to index int d = 32; @@ -81,7 +79,7 @@ void make_index_slices( Index* index = sub_indexes.back().get(); auto xb = make_data(nb * d); - std::vector ids(nb); + std::vector ids(nb); std::mt19937 rng; std::uniform_int_distribution<> distrib; for (int j = 0; j < nb; j++) { diff --git a/thirdparty/faiss/tests/test_standalone_codec.py b/thirdparty/faiss/tests/test_standalone_codec.py index b23e5e067..1e1993bb4 100644 --- a/thirdparty/faiss/tests/test_standalone_codec.py +++ b/thirdparty/faiss/tests/test_standalone_codec.py @@ -35,22 +35,23 @@ def do_encode_twice(self, factory_key): codes2 = codec.sa_encode(x2) - if 'IVF' not in factory_key: - self.assertTrue(np.all(codes == codes2)) - else: + if 'IVF' in factory_key or 'RQ' in factory_key: # some rows are not reconstructed exactly because they # flip into another quantization cell nrowdiff = (codes != codes2).any(axis=1).sum() self.assertTrue(nrowdiff < 10) + else: + self.assertTrue(np.all(codes == codes2)) x3 = codec.sa_decode(codes2) - if 'IVF' not in factory_key: - self.assertTrue(np.allclose(x2, x3)) - else: + + if 'IVF' in factory_key or 'RQ' in factory_key: diffs = np.abs(x2 - x3).sum(axis=1) avg = np.abs(x2).sum(axis=1).mean() diffs.sort() assert 
diffs[-10] < avg * 1e-5 + else: + self.assertTrue(np.allclose(x2, x3)) def test_SQ8(self): self.do_encode_twice('SQ8') @@ -73,6 +74,9 @@ def test_IVFPQ6x8np(self): def test_LSH(self): self.do_encode_twice('LSHrt') + def test_RQ6x8(self): + self.do_encode_twice('RQ6x8') + class TestIndexEquiv(unittest.TestCase): @@ -296,15 +300,15 @@ def test_rw(self): for i in range(nbyte): self.assertTrue(((bignum >> (i * 8)) & 255) == bs[i]) - for i in range(nbyte): - print(bin(bs[i] + 256)[3:], end=' ') - print() + #for i in range(nbyte): + # print(bin(bs[i] + 256)[3:], end=' ') + # print() br = faiss.BitstringReader(swig_ptr(bs), nbyte) for nbit, xref in ctrl: xnew = br.read(nbit) - print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) + # print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) self.assertTrue(xnew == xref) diff --git a/thirdparty/faiss/tests/test_threaded_index.cpp b/thirdparty/faiss/tests/test_threaded_index.cpp index 359c1d8b1..b10d3806e 100644 --- a/thirdparty/faiss/tests/test_threaded_index.cpp +++ b/thirdparty/faiss/tests/test_threaded_index.cpp @@ -19,6 +19,8 @@ namespace { struct TestException : public std::exception {}; +using idx_t = faiss::idx_t; + struct MockIndex : public faiss::Index { explicit MockIndex(idx_t d) : faiss::Index(d) { resetMock(); @@ -44,7 +46,8 @@ struct MockIndex : public faiss::Index { idx_t k, float* distances, idx_t* labels, - const faiss::BitsetView bitset = nullptr) const override { + const faiss::SearchParameters* params) const override { + FAISS_THROW_IF_NOT(!params); nCalled = n; xCalled = x; kCalled = k; @@ -65,14 +68,19 @@ struct MockIndex : public faiss::Index { template struct MockThreadedIndex : public faiss::ThreadedIndex { - using idx_t = faiss::Index::idx_t; + using idx_t = faiss::idx_t; explicit MockThreadedIndex(bool threaded) : faiss::ThreadedIndex(threaded) {} void add(idx_t, const float*) override {} - void search(idx_t, const float*, idx_t, float*, idx_t*, - const faiss::BitsetView) const override {} + void search( + idx_t, + const float*, + idx_t, + float*, + idx_t*, + const faiss::SearchParameters*) const override {} void reset() override {} }; @@ -172,7 +180,7 @@ TEST(ThreadedIndex, TestReplica) { std::vector x(n * d); std::vector distances(n * k); - std::vector labels(n * k); + std::vector labels(n * k); replica.add(n, x.data()); @@ -221,7 +229,7 @@ TEST(ThreadedIndex, TestShards) { std::vector x(n * d); std::vector distances(n * k); - std::vector labels(n * k); + std::vector labels(n * k); shards.add(n, x.data()); diff --git a/thirdparty/faiss/tests/test_transfer_invlists.cpp b/thirdparty/faiss/tests/test_transfer_invlists.cpp index c0a9f67ec..309a331dd 100644 --- a/thirdparty/faiss/tests/test_transfer_invlists.cpp +++ b/thirdparty/faiss/tests/test_transfer_invlists.cpp @@ -33,7 +33,7 @@ int nlist = 40; using namespace faiss; -typedef faiss::Index::idx_t idx_t; +typedef faiss::idx_t idx_t; std::vector get_data(size_t nb, int seed) { std::vector x(nb * d); diff --git a/thirdparty/faiss/tests/torch_test_contrib.py b/thirdparty/faiss/tests/torch_test_contrib.py index 3036b19a0..5cb4038f5 100644 --- a/thirdparty/faiss/tests/torch_test_contrib.py +++ b/thirdparty/faiss/tests/torch_test_contrib.py @@ -340,5 +340,6 @@ def test_non_contiguous(self): with self.assertRaises(AssertionError): index.add(xb) - with self.assertRaises(ValueError): - index.add(xb.numpy()) + # disabled since we now accept non-contiguous arrays + # with self.assertRaises(ValueError): + # index.add(xb.numpy()) diff --git a/thirdparty/faiss/tutorial/cpp/1-Flat.cpp 
b/thirdparty/faiss/tutorial/cpp/1-Flat.cpp index edd1426be..819e41957 100644 --- a/thirdparty/faiss/tutorial/cpp/1-Flat.cpp +++ b/thirdparty/faiss/tutorial/cpp/1-Flat.cpp @@ -12,7 +12,7 @@ #include // 64-bit int -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; int main() { int d = 64; // dimension diff --git a/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp b/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp index e107c8fc6..febd5be04 100644 --- a/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp +++ b/thirdparty/faiss/tutorial/cpp/2-IVFFlat.cpp @@ -13,7 +13,7 @@ #include #include -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; int main() { int d = 64; // dimension diff --git a/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp b/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp index ba1bc5b4c..c84e52e86 100644 --- a/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp +++ b/thirdparty/faiss/tutorial/cpp/3-IVFPQ.cpp @@ -12,7 +12,7 @@ #include #include -using idx_t = faiss::Index::idx_t; +using idx_t = faiss::idx_t; int main() { int d = 64; // dimension diff --git a/thirdparty/faiss/tutorial/cpp/5-GPU.cpp b/thirdparty/faiss/tutorial/cpp/5-GPU.cpp deleted file mode 100644 index 89f1e49ac..000000000 --- a/thirdparty/faiss/tutorial/cpp/5-GPU.cpp +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace faiss; - -#define PRINT_RESULT 0 - -void print_result(const char* unit, long number, long k, long nq, long *I) { - printf("%s: I (2 first results)=\n", unit); - for(int i = 0; i < number; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } - - printf("%s: I (2 last results)=\n", unit); - for(int i = nq - number; i < nq; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } -} - - -int main() { - const char* filename = "index500k.index"; - -#if PRINT_RESULT - int number = 8; -#endif - - int d = 512; // dimension - int nq = 10; // nb of queries - int nprobe = 1; - float *xq = new float[d * nq]; - for(int i = 0; i < nq; i++) { - for(int j = 0; j < d; j++) { - xq[d * i + j] = drand48(); - } - } - faiss::distance_compute_blas_threshold = 800; - - faiss::gpu::StandardGpuResources res; - - int k = 8; - std::shared_ptr gpu_index_ivf_ptr; - - const char* index_description = "IVF16384,SQ8"; -// const char* index_description = "IVF3276,SQ8"; - - faiss::Index *cpu_index = nullptr; - faiss::IndexIVF* cpu_ivf_index = nullptr; - if((access(filename,F_OK))==-1) { - // create database - long nb = 500000; // database size -// printf("-----------------------\n"); - long size = d * nb; - float *xb = new float[size]; - memset(xb, 0, size * sizeof(float)); - printf("size: %ld\n", (size * sizeof(float)) ); - for(long i = 0; i < nb; i++) { - for(long j = 0; j < d; j++) { - float rand = drand48(); - xb[d * i + j] = rand; - } - } - - faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); - auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); - - gpu_index_ivf_ptr = 
std::shared_ptr(device_index); - - assert(!device_index->is_trained); - device_index->train(nb, xb); - assert(device_index->is_trained); - device_index->add(nb, xb); // add vectors to the index - - printf("is_trained = %s\n", device_index->is_trained ? "true" : "false"); - printf("ntotal = %ld\n", device_index->ntotal); - - cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); - faiss::write_index(cpu_index, filename); - printf("index.index is stored successfully.\n"); - delete [] xb; - } else { - cpu_index = faiss::read_index(filename); - } - - cpu_ivf_index = dynamic_cast(cpu_index); - if(cpu_ivf_index != nullptr) { - cpu_ivf_index->to_readonly(); - } - - auto init_gpu =[&](int device_id, faiss::gpu::GpuClonerOptions* option) { - option->allInGpu = true; - faiss::Index* tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, cpu_index, option); - delete tmp_index; - }; - - auto gpu_executor = [&](int device_id, faiss::gpu::GpuClonerOptions* option) { - auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, cpu_index, option); - delete tmp_index; - double t0 = getmillisecs (); - { - // cpu to gpu - option->allInGpu = true; - - tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, cpu_index, option); - gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - } - double t1 = getmillisecs (); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); - - { - long *I = new long[k * nq]; - float *D = new float[k * nq]; - if(option->allInGpu) { - faiss::gpu::GpuIndexIVF* gpu_index_ivf = - dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf->setNumProbes(nprobe); - for(long i = 0; i < 1; ++ i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - } else { - faiss::IndexIVFScalarQuantizer* index_ivf = - dynamic_cast(gpu_index_ivf_ptr.get()); - index_ivf->nprobe = nprobe; - for(long i = 0; i < 1; ++ i) { - double t2 = getmillisecs(); - index_ivf->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("- GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); - - }; - printf("----------------------------------\n"); - auto cpu_executor = [&]() { // search xq - printf("CPU: \n"); - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - double t4 = getmillisecs(); - faiss::IndexIVF* ivf_index = - dynamic_cast(cpu_index); - ivf_index->nprobe = nprobe; - cpu_index->search(nq, xq, k, D, I); - double t5 = getmillisecs(); - printf("CPU execution time: %0.2f\n", t5 - t4); -#if PRINT_RESULT - print_result("CPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - }; - - for(long i = 0; i < 1; ++ i) { - cpu_executor(); - } - - faiss::gpu::GpuClonerOptions option0; - faiss::gpu::GpuClonerOptions option1; - -// init_gpu(0, &option0); -// init_gpu(1, &option1); - -// double tx = getmillisecs(); - std::thread t1(gpu_executor, 0, &option0); - std::thread t2(gpu_executor, 1, &option1); - t1.join(); - t2.join(); -// double ty = getmillisecs(); -// printf("Total GPU execution time: %0.2f\n", ty - tx); - - delete [] xq; - return 0; -} diff --git a/thirdparty/faiss/tutorial/cpp/6-GPU.cpp b/thirdparty/faiss/tutorial/cpp/6-GPU.cpp deleted file mode 100644 index 8afcc0fd1..000000000 --- a/thirdparty/faiss/tutorial/cpp/6-GPU.cpp +++ /dev/null 
@@ -1,255 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace faiss; - -#define PRINT_RESULT 0 - -void print_result(const char* unit, long number, long k, long nq, long *I) { - printf("%s: I (2 first results)=\n", unit); - for(int i = 0; i < number; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } - - printf("%s: I (2 last results)=\n", unit); - for(int i = nq - number; i < nq; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } -} - - -int main() { - const char* filename = "index500k-h.index"; - -#if PRINT_RESULT - int number = 8; -#endif - - int d = 512; // dimension - int nq = 10; // nb of queries - int nprobe = 1; - float *xq = new float[d * nq]; - for(int i = 0; i < nq; i++) { - for(int j = 0; j < d; j++) { - xq[d * i + j] = drand48(); - } - } - faiss::distance_compute_blas_threshold = 800; - - faiss::gpu::StandardGpuResources res; - - int k = 8; - std::shared_ptr gpu_index_ivf_ptr; - - const char* index_description = "IVF16384,SQ8Hybrid"; -// const char* index_description = "IVF3276,SQ8"; - - faiss::Index *cpu_index = nullptr; - faiss::IndexIVF* cpu_ivf_index = nullptr; - if((access(filename,F_OK))==-1) { - // create database - long nb = 500000; // database size -// printf("-----------------------\n"); - long size = d * nb; - float *xb = new float[size]; - memset(xb, 0, size * sizeof(float)); - printf("size: %ld\n", (size * sizeof(float)) ); - for(long i = 0; i < nb; i++) { - for(long j = 0; j < d; j++) { - float rand = drand48(); - xb[d * i + j] = rand; - } - } - - faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); - auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); - - gpu_index_ivf_ptr = std::shared_ptr(device_index); - - assert(!device_index->is_trained); - device_index->train(nb, xb); - assert(device_index->is_trained); - device_index->add(nb, xb); // add vectors to the index - - printf("is_trained = %s\n", device_index->is_trained ? 
"true" : "false"); - printf("ntotal = %ld\n", device_index->ntotal); - - cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); - faiss::write_index(cpu_index, filename); - printf("index.index is stored successfully.\n"); - delete [] xb; - } else { - cpu_index = faiss::read_index(filename); - } - - cpu_ivf_index = dynamic_cast(cpu_index); - if(cpu_ivf_index != nullptr) { - cpu_ivf_index->to_readonly(); - } - - auto gpu_executor = [&](int device_id, faiss::gpu::GpuClonerOptions* option, faiss::IndexComposition* index_composition) { - auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - delete tmp_index; - double t0 = getmillisecs (); - { - // cpu to gpu - tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - } - double t1 = getmillisecs (); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); - - { - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = - dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf_hybrid->setNumProbes(nprobe); - for(long i = 0; i < 1; ++ i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); - - }; - printf("----------------------------------\n"); - auto cpu_executor = [&](faiss::IndexComposition* index_composition) { // search xq - printf("CPU: \n"); - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - double t4 = getmillisecs(); - faiss::IndexIVF* ivf_index = - dynamic_cast(cpu_index); - ivf_index->nprobe = nprobe; - - faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); - if(is_gpu_flat_index == nullptr) { - delete ivf_index->quantizer; - ivf_index->quantizer = index_composition->quantizer; - } - - cpu_index->search(nq, xq, k, D, I); - double t5 = getmillisecs(); - printf("CPU execution time: %0.2f\n", t5 - t4); -#if PRINT_RESULT - print_result("CPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - }; - - - faiss::gpu::GpuClonerOptions option0; - faiss::gpu::GpuClonerOptions option1; - - faiss::IndexComposition index_composition0; - index_composition0.index = cpu_index; - index_composition0.quantizer = nullptr; - index_composition0.mode = 0; // only quantizer - - // Copy quantizer to GPU 0 - auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - delete index1; - - faiss::IndexComposition index_composition1; - index_composition1.index = cpu_index; - index_composition1.quantizer = nullptr; - index_composition1.mode = 0; // only quantizer - - // Copy quantizer to GPU 1 - index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - delete index1; - - std::thread t_cpu1(cpu_executor, &index_composition0); - t_cpu1.join(); - std::thread t_cpu2(cpu_executor, &index_composition1); - t_cpu2.join(); - - index_composition0.mode = 2; // only data - index_composition1.mode = 2; // only data - - index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - delete index1; - index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - delete index1; - -// double tx = getmillisecs(); - std::thread 
t1(gpu_executor, 0, &option0, &index_composition0); - std::thread t2(gpu_executor, 1, &option1, &index_composition1); - t1.join(); - t2.join(); - -// std::thread t3(gpu_executor, 0, &option0, &index_composition0); -// std::thread t4(gpu_executor, 1, &option1, &index_composition1); -// t3.join(); -// t4.join(); -// double ty = getmillisecs(); -// printf("Total GPU execution time: %0.2f\n", ty - tx); - cpu_executor(&index_composition0); - cpu_executor(&index_composition1); - - delete [] xq; - return 0; -} diff --git a/thirdparty/faiss/tutorial/cpp/6-RUN.cpp b/thirdparty/faiss/tutorial/cpp/6-RUN.cpp deleted file mode 100644 index 440a5707b..000000000 --- a/thirdparty/faiss/tutorial/cpp/6-RUN.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace faiss; - -#define PRINT_RESULT 0 -std::shared_ptr gpu_index_ivf_ptr; -const int d = 512; // dimension -const int nq = 1000; // nb of queries -const int nprobe = 1; -int k = 8; - -void -print_result(const char* unit, long number, long k, long nq, long* I) { - printf("%s: I (2 first results)=\n", unit); - for (int i = 0; i < number; i++) { - for (int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } - - printf("%s: I (2 last results)=\n", unit); - for (int i = nq - number; i < nq; i++) { - for (int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } -} - -void -cpu_executor(faiss::Index* cpu_index, float*& xq) { // search xq - printf("CPU: \n"); - long* I = new long[k * nq]; - float* D = new float[k * nq]; - - double t4 = getmillisecs(); - faiss::IndexIVF* ivf_index = - dynamic_cast(cpu_index); - ivf_index->nprobe = nprobe; - cpu_index->search(nq, xq, k, D, I); - double t5 = getmillisecs(); - printf("CPU execution time: %0.2f\n", t5 - t4); -#if PRINT_RESULT - print_result("CPU", number, k, nq, I); -#endif - delete[] I; - delete[] D; -}; - -void -hybrid_executor(faiss::Index* cpu_index, - faiss::IndexComposition* index_composition, - float*& xq) { // search xq - printf("HYBRID: \n"); - long* I = new long[k * nq]; - float* D = new float[k * nq]; - - double t4 = getmillisecs(); - faiss::IndexIVF* ivf_index = dynamic_cast(cpu_index); - ivf_index->nprobe = nprobe; - - faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); - if (is_gpu_flat_index == nullptr) { - delete ivf_index->quantizer; - ivf_index->quantizer = index_composition->quantizer; - } - - cpu_index->search(nq, xq, k, D, I); - double t5 = getmillisecs(); - printf("HYBRID execution time: %0.2f\n", t5 - t4); -#if PRINT_RESULT - print_result("HYBRID", number, k, nq, I); -#endif - delete[] I; - delete[] D; -}; - -void -gpu_executor(faiss::gpu::StandardGpuResources& res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - float*& xq) { - auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - delete tmp_index; - double t0 = getmillisecs(); - { - // cpu to gpu - tmp_index = faiss::gpu::index_cpu_to_gpu(&res, 
device_id, index_composition, option); - gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - } - double t1 = getmillisecs(); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); - - { - long* I = new long[k * nq]; - float* D = new float[k * nq]; - - faiss::gpu::GpuIndexIVFSQHybrid - * gpu_index_ivf_hybrid = dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf_hybrid->setNumProbes(nprobe); - for (long i = 0; i < 1; ++i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete[] I; - delete[] D; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); - -}; - -int -main() { - const char* filename = "index500k-h.index"; - faiss::gpu::StandardGpuResources res; - -#if PRINT_RESULT - int number = 8; -#endif - - float* xq = new float[d * nq]; - for (int i = 0; i < nq; i++) { - for (int j = 0; j < d; j++) { - xq[d * i + j] = drand48(); - } - } - faiss::distance_compute_blas_threshold = 800; - - faiss::Index* cpu_index = nullptr; - faiss::IndexIVF* cpu_ivf_index = nullptr; - if ((access(filename, F_OK)) == -1) { - printf("index file not found."); - exit(-1); - } else { - cpu_index = faiss::read_index(filename); - } - - cpu_ivf_index = dynamic_cast(cpu_index); - if (cpu_ivf_index != nullptr) { - cpu_ivf_index->to_readonly(); - } - - printf("============================\n"); - cpu_executor(cpu_index, xq); - cpu_executor(cpu_index, xq); - printf("============================\n"); - - faiss::gpu::GpuClonerOptions option0; - faiss::gpu::GpuClonerOptions option1; - - faiss::IndexComposition index_composition0; - index_composition0.index = cpu_index; - index_composition0.quantizer = nullptr; - index_composition0.mode = 0; // only quantizer - - // Copy quantizer to GPU 0 - auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - delete index1; - - faiss::IndexComposition index_composition1; - index_composition1.index = cpu_index; - index_composition1.quantizer = nullptr; - index_composition1.mode = 0; // only quantizer - - // Copy quantizer to GPU 1 - index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - delete index1; - - hybrid_executor(cpu_index, &index_composition0, xq); - hybrid_executor(cpu_index, &index_composition1, xq); - - printf("============================\n"); - - index_composition0.mode = 2; // only data - index_composition1.mode = 2; // only data - - index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - delete index1; - index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - delete index1; - - gpu_executor(res, 0, &option0, &index_composition0, xq); - gpu_executor(res, 1, &option1, &index_composition1, xq); - - printf("============================\n"); - - hybrid_executor(cpu_index, &index_composition0, xq); - hybrid_executor(cpu_index, &index_composition1, xq); - - delete[] xq; - gpu_index_ivf_ptr = nullptr; - return 0; -} diff --git a/thirdparty/faiss/tutorial/cpp/7-GPU.cpp b/thirdparty/faiss/tutorial/cpp/7-GPU.cpp deleted file mode 100644 index 13f8da1ba..000000000 --- a/thirdparty/faiss/tutorial/cpp/7-GPU.cpp +++ /dev/null @@ -1,347 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace faiss; - -#define PRINT_RESULT 0 - -void print_result(const char* unit, long number, long k, long nq, long *I) { - printf("%s: I (2 first results)=\n", unit); - for(int i = 0; i < number; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } - - printf("%s: I (2 last results)=\n", unit); - for(int i = nq - number; i < nq; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } -} - -void -GpuLoad(faiss::gpu::StandardGpuResources* res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - std::shared_ptr& gpu_index_ivf_ptr - ) { - - double t0 = getmillisecs (); - - auto tmp_index = faiss::gpu::index_cpu_to_gpu(res, device_id, index_composition, option); - gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - - double t1 = getmillisecs (); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); -} - -void -GpuExecutor( - std::shared_ptr& gpu_index_ivf_ptr, - faiss::gpu::StandardGpuResources& res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - int nq, - int nprobe, - int k, - float* xq) { - double t0 = getmillisecs (); - { - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = - dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf_hybrid->setNumProbes(nprobe); - for(long i = 0; i < 4; ++ i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - gpu_index_ivf_ptr = nullptr; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); -} - - -void -GpuExecutor( - faiss::gpu::StandardGpuResources& res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - int nq, - int nprobe, - int k, - float* xq) { - auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - delete tmp_index; - double t0 = getmillisecs (); - // cpu to gpu - tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - auto gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - - double t1 = getmillisecs (); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); - - { - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = - dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf_hybrid->setNumProbes(nprobe); - for(long i = 0; i < 4; ++ i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - gpu_index_ivf_ptr = 
nullptr; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); -} - -void -CpuExecutor( - faiss::IndexComposition* index_composition, - int nq, - int nprobe, - int k, - float* xq, - faiss::Index *cpu_index) { - printf("CPU: \n"); - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - double t4 = getmillisecs(); - faiss::IndexIVF* ivf_index = - dynamic_cast(cpu_index); - ivf_index->nprobe = nprobe; - - faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); - if(is_gpu_flat_index == nullptr) { - delete ivf_index->quantizer; - ivf_index->quantizer = index_composition->quantizer; - } - - cpu_index->search(nq, xq, k, D, I); - double t5 = getmillisecs(); - printf("CPU execution time: %0.2f\n", t5 - t4); -#if PRINT_RESULT - print_result("CPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; -} - -int main() { - const char* filename = "index500k-h.index"; - -#if PRINT_RESULT - int number = 8; -#endif - - int d = 512; // dimension - int nq = 1000; // nb of queries - int nprobe = 8; - float *xq = new float[d * nq]; - for(int i = 0; i < nq; i++) { - for(int j = 0; j < d; j++) { - xq[d * i + j] = drand48(); - } - } - faiss::distance_compute_blas_threshold = 800; - - faiss::gpu::StandardGpuResources res; - - int k = 1000; - std::shared_ptr gpu_index_ivf_ptr; - - const char* index_description = "IVF16384,SQ8Hybrid"; -// const char* index_description = "IVF3276,SQ8"; - - faiss::Index *cpu_index = nullptr; - faiss::IndexIVF* cpu_ivf_index = nullptr; - if((access(filename,F_OK))==-1) { - // create database - long nb = 500000; // database size -// printf("-----------------------\n"); - long size = d * nb; - float *xb = new float[size]; - memset(xb, 0, size * sizeof(float)); - printf("size: %ld\n", (size * sizeof(float)) ); - for(long i = 0; i < nb; i++) { - for(long j = 0; j < d; j++) { - float rand = drand48(); - xb[d * i + j] = rand; - } - } - - faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2); - auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); - - gpu_index_ivf_ptr = std::shared_ptr(device_index); - - assert(!device_index->is_trained); - device_index->train(nb, xb); - assert(device_index->is_trained); - device_index->add(nb, xb); // add vectors to the index - - printf("is_trained = %s\n", device_index->is_trained ? 
"true" : "false"); - printf("ntotal = %ld\n", device_index->ntotal); - - cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); - faiss::write_index(cpu_index, filename); - printf("index.index is stored successfully.\n"); - delete [] xb; - } else { - cpu_index = faiss::read_index(filename); - } - - cpu_ivf_index = dynamic_cast(cpu_index); - if(cpu_ivf_index != nullptr) { - cpu_ivf_index->to_readonly(); - } - - faiss::gpu::GpuClonerOptions option0; - faiss::gpu::GpuClonerOptions option1; - - option0.allInGpu = true; - option1.allInGpu = true; - - faiss::IndexComposition index_composition0; - index_composition0.index = cpu_index; - index_composition0.quantizer = nullptr; - index_composition0.mode = 1; // only quantizer - - // Copy quantizer to GPU 0 - auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - delete index1; - - faiss::IndexComposition index_composition1; - index_composition1.index = cpu_index; - index_composition1.quantizer = nullptr; - index_composition1.mode = 1; // only quantizer - - // Copy quantizer to GPU 1 - index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - delete index1; - -// std::thread t_cpu1(cpu_executor, &index_composition0); -// t_cpu1.join(); -// std::thread t_cpu2(cpu_executor, &index_composition1); -// t_cpu2.join(); - -// index_composition0.mode = 2; // only data -// index_composition1.mode = 2; // only data -// -// index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); -// delete index1; -// index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); -// delete index1; - -// double tx = getmillisecs(); -// std::thread t1(gpu_executor, 0, &option0, &index_composition0); -// std::thread t2(gpu_executor, 1, &option1, &index_composition1); -// t1.join(); -// t2.join(); -// for(long i = 0; i < 10; ++ i) { -// std::shared_ptr gpu_index_ptr00; -// std::shared_ptr gpu_index_ptr01; -// -// std::thread t00(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr00)); -//// std::thread t2(GpuLoad, &res, 1, &option1, &index_composition1, std::ref(gpu_index_ptr1)); -// std::thread t01(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr01)); -// -// t00.join(); -// -// GpuExecutor(gpu_index_ptr00, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); -// -// t01.join(); -//// t2.join(); -// GpuExecutor(gpu_index_ptr01, res, 0, &option0, &index_composition0, nq, nprobe, k, xq); -//// GpuExecutor(gpu_index_ptr1, res, 1, &option1, &index_composition1, nq, nprobe, k, xq); -// } - -// std::thread t3(gpu_executor, 0, &option0, &index_composition0); -// std::thread t4(gpu_executor, 1, &option1, &index_composition1); -// t3.join(); -// t4.join(); -// double ty = getmillisecs(); -// printf("Total GPU execution time: %0.2f\n", ty - tx); - - CpuExecutor(&index_composition0, nq, nprobe, k, xq, cpu_index); - CpuExecutor(&index_composition1, nq, nprobe, k, xq, cpu_index); - - ///// - delete [] xq; - return 0; -} - diff --git a/thirdparty/faiss/tutorial/cpp/8-GPU.cpp b/thirdparty/faiss/tutorial/cpp/8-GPU.cpp deleted file mode 100644 index 367271c8b..000000000 --- a/thirdparty/faiss/tutorial/cpp/8-GPU.cpp +++ /dev/null @@ -1,479 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace faiss; - -#define PRINT_RESULT 0 - -void print_result(const char* unit, long number, long k, long nq, long *I) { - printf("%s: I (2 first results)=\n", unit); - for(int i = 0; i < number; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } - - printf("%s: I (2 last results)=\n", unit); - for(int i = nq - number; i < nq; i++) { - for(int j = 0; j < k; j++) - printf("%5ld ", I[i * k + j]); - printf("\n"); - } -} - -void -GpuLoad(faiss::gpu::StandardGpuResources* res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - std::shared_ptr& gpu_index_ivf_ptr - ) { - - double t0 = getmillisecs (); - - auto tmp_index = faiss::gpu::index_cpu_to_gpu(res, device_id, index_composition, option); - gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - - double t1 = getmillisecs (); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); -} - -void -GpuExecutor( - std::shared_ptr& gpu_index_ivf_ptr, - faiss::gpu::StandardGpuResources& res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - int nq, - int nprobe, - int k, - float* xq) { - double t0 = getmillisecs (); - { - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = - dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf_hybrid->setNumProbes(nprobe); - for(long i = 0; i < 4; ++ i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - gpu_index_ivf_ptr = nullptr; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); -} - - -void -GpuExecutor( - faiss::gpu::StandardGpuResources& res, - int device_id, - faiss::gpu::GpuClonerOptions* option, - faiss::IndexComposition* index_composition, - int nq, - int nprobe, - int k, - float* xq) { - auto tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - delete tmp_index; - double t0 = getmillisecs (); - // cpu to gpu - tmp_index = faiss::gpu::index_cpu_to_gpu(&res, device_id, index_composition, option); - auto gpu_index_ivf_ptr = std::shared_ptr(tmp_index); - - double t1 = getmillisecs (); - printf("CPU to GPU loading time: %0.2f\n", t1 - t0); - - { - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - faiss::gpu::GpuIndexIVFSQHybrid* gpu_index_ivf_hybrid = - dynamic_cast(gpu_index_ivf_ptr.get()); - gpu_index_ivf_hybrid->setNumProbes(nprobe); - for(long i = 0; i < 4; ++ i) { - double t2 = getmillisecs(); - gpu_index_ivf_ptr->search(nq, xq, k, D, I); - double t3 = getmillisecs(); - printf("* GPU: %d, execution time: %0.2f\n", device_id, t3 - t2); - } - - // print results -#if PRINT_RESULT - print_result("GPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; - gpu_index_ivf_ptr = nullptr; - } - double t4 = getmillisecs(); - - printf("GPU:%d total time: %0.2f\n", device_id, t4 - t0); -} - -void -CpuExecutor( - 
faiss::IndexComposition* index_composition, - int nq, - int nprobe, - int k, - float* xq, - faiss::Index *cpu_index) { - printf("CPU: \n"); - long *I = new long[k * nq]; - float *D = new float[k * nq]; - - double t4 = getmillisecs(); - faiss::IndexIVF* ivf_index = - dynamic_cast(cpu_index); - ivf_index->nprobe = nprobe; - - faiss::gpu::GpuIndexFlat* is_gpu_flat_index = dynamic_cast(ivf_index->quantizer); - if(is_gpu_flat_index == nullptr) { - delete ivf_index->quantizer; - ivf_index->quantizer = index_composition->quantizer; - } - - cpu_index->search(nq, xq, k, D, I); - double t5 = getmillisecs(); - printf("CPU execution time: %0.2f\n", t5 - t4); -#if PRINT_RESULT - print_result("CPU", number, k, nq, I); -#endif - delete [] I; - delete [] D; -} - -void create_index(const char* filename, const char* index_description, long db_size, long d) { - faiss::gpu::StandardGpuResources res; - if((access(filename,F_OK))==-1) { - // create database - long size = d * db_size; - float *xb = new float[size]; - memset(xb, 0, size * sizeof(float)); - printf("size: %ld\n", (size * sizeof(float)) ); - for(long i = 0; i < db_size; i++) { - for(long j = 0; j < d; j++) { - float rand = drand48(); - xb[d * i + j] = rand; - } - } - - faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_INNER_PRODUCT); - auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index); - - std::shared_ptr gpu_index_ivf_ptr = std::shared_ptr(device_index); - - assert(!device_index->is_trained); - device_index->train(db_size, xb); - assert(device_index->is_trained); - device_index->add(db_size, xb); // add vectors to the index - - printf("is_trained = %s\n", device_index->is_trained ? "true" : "false"); - printf("ntotal = %ld\n", device_index->ntotal); - - faiss::Index *cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index)); - faiss::write_index(cpu_index, filename); - printf("index.index is stored successfully.\n"); - delete [] xb; - } -} - -void execute_index(const char* filename, int d, int nq, int nprobe, int k, float* xq) { - faiss::gpu::StandardGpuResources res; - faiss::Index* cpu_index = faiss::read_index(filename); - faiss::IndexIVF* cpu_ivf_index = dynamic_cast(cpu_index); - - if(cpu_ivf_index != nullptr) { - cpu_ivf_index->to_readonly(); - } - - faiss::gpu::GpuClonerOptions option0; - faiss::gpu::GpuClonerOptions option1; - - option0.allInGpu = true; - option1.allInGpu = true; - - faiss::IndexComposition index_composition0; - index_composition0.index = cpu_index; - index_composition0.quantizer = nullptr; - index_composition0.mode = 1; // only quantizer - - // Copy quantizer to GPU 0 - auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - delete index1; - - faiss::IndexComposition index_composition1; - index_composition1.index = cpu_index; - index_composition1.quantizer = nullptr; - index_composition1.mode = 1; // only quantizer - - // Copy quantizer to GPU 1 - index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - delete index1; - - // std::thread t_cpu1(cpu_executor, &index_composition0); - // t_cpu1.join(); - // std::thread t_cpu2(cpu_executor, &index_composition1); - // t_cpu2.join(); - - index_composition0.mode = 2; // only data - index_composition1.mode = 2; // only data - - // index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0); - // delete index1; - // index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1); - // delete index1; - - // double tx = getmillisecs(); - // 
-    // std::thread t1(gpu_executor, 0, &option0, &index_composition0);
-    // std::thread t2(gpu_executor, 1, &option1, &index_composition1);
-    // t1.join();
-    // t2.join();
-    for(long i = 0; i < 1; ++ i) {
-        std::shared_ptr gpu_index_ptr00;
-        std::shared_ptr gpu_index_ptr01;
-
-        std::thread t00(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr00));
-        // std::thread t2(GpuLoad, &res, 1, &option1, &index_composition1, std::ref(gpu_index_ptr1));
-        std::thread t01(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr01));
-
-        t00.join();
-
-        GpuExecutor(gpu_index_ptr00, res, 0, &option0, &index_composition0, nq, nprobe, k, xq);
-
-        t01.join();
-        // t2.join();
-        GpuExecutor(gpu_index_ptr01, res, 0, &option0, &index_composition0, nq, nprobe, k, xq);
-        // GpuExecutor(gpu_index_ptr1, res, 1, &option1, &index_composition1, nq, nprobe, k, xq);
-    }
-
-    delete index_composition0.quantizer;
-    delete index_composition1.quantizer;
-    delete cpu_index;
-}
-
-int main() {
-    const char* filename = "index500k-h.index";
-    int d = 512;      // dimension
-    int nq = 1000;    // nb of queries
-    int nprobe = 16;
-    int k = 1000;
-    float *xq = new float[d * nq];
-    for(int i = 0; i < nq; i++) {
-        for(int j = 0; j < d; j++) {
-            xq[d * i + j] = drand48();
-        }
-    }
-
-    long db_size = 500000;
-    const char* index_description = "IVF16384,SQ8Hybrid";
-    create_index(filename, index_description, db_size, d);
-    for(long i = 0; i < 1000; ++ i) {
-        execute_index(filename, d, nq, nprobe, k, xq);
-    }
-    delete[] xq;
-    xq = nullptr;
-    return 0;
-}
-
-/*
-int main() {
-    const char* filename = "index500k-h.index";
-
-#if PRINT_RESULT
-    int number = 8;
-#endif
-
-    int d = 512;      // dimension
-    int nq = 1000;    // nb of queries
-    int nprobe = 16;
-    float *xq = new float[d * nq];
-    for(int i = 0; i < nq; i++) {
-        for(int j = 0; j < d; j++) {
-            xq[d * i + j] = drand48();
-        }
-    }
-    faiss::distance_compute_blas_threshold = 800;
-
-    faiss::gpu::StandardGpuResources res;
-
-    int k = 1000;
-    std::shared_ptr gpu_index_ivf_ptr;
-
-    const char* index_description = "IVF16384,SQ8Hybrid";
-//    const char* index_description = "IVF3276,SQ8";
-
-    faiss::Index *cpu_index = nullptr;
-    faiss::IndexIVF* cpu_ivf_index = nullptr;
-    if((access(filename,F_OK))==-1) {
-        // create database
-        long nb = 500000;  // database size
-//        printf("-----------------------\n");
-        long size = d * nb;
-        float *xb = new float[size];
-        memset(xb, 0, size * sizeof(float));
-        printf("size: %ld\n", (size * sizeof(float)) );
-        for(long i = 0; i < nb; i++) {
-            for(long j = 0; j < d; j++) {
-                float rand = drand48();
-                xb[d * i + j] = rand;
-            }
-        }
-
-        faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_INNER_PRODUCT);
-        auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index);
-
-        gpu_index_ivf_ptr = std::shared_ptr(device_index);
-
-        assert(!device_index->is_trained);
-        device_index->train(nb, xb);
-        assert(device_index->is_trained);
-        device_index->add(nb, xb);  // add vectors to the index
-
-        printf("is_trained = %s\n", device_index->is_trained ? "true" : "false");
-        printf("ntotal = %ld\n", device_index->ntotal);
-
-        cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index));
-        faiss::write_index(cpu_index, filename);
-        printf("index.index is stored successfully.\n");
-        delete [] xb;
-    } else {
-        cpu_index = faiss::read_index(filename);
-    }
-
-    cpu_ivf_index = dynamic_cast(cpu_index);
-    if(cpu_ivf_index != nullptr) {
-        cpu_ivf_index->to_readonly();
-    }
-
-    faiss::gpu::GpuClonerOptions option0;
-    faiss::gpu::GpuClonerOptions option1;
-
-    option0.allInGpu = true;
-    option1.allInGpu = true;
-
-    faiss::IndexComposition index_composition0;
-    index_composition0.index = cpu_index;
-    index_composition0.quantizer = nullptr;
-    index_composition0.mode = 1; // only quantizer
-
-    // Copy quantizer to GPU 0
-    auto index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0);
-    delete index1;
-
-    faiss::IndexComposition index_composition1;
-    index_composition1.index = cpu_index;
-    index_composition1.quantizer = nullptr;
-    index_composition1.mode = 1; // only quantizer
-
-    // Copy quantizer to GPU 1
-    index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1);
-    delete index1;
-
-//    std::thread t_cpu1(cpu_executor, &index_composition0);
-//    t_cpu1.join();
-//    std::thread t_cpu2(cpu_executor, &index_composition1);
-//    t_cpu2.join();
-
-    index_composition0.mode = 2; // only data
-    index_composition1.mode = 2; // only data
-
-    index1 = faiss::gpu::index_cpu_to_gpu(&res, 0, &index_composition0, &option0);
-    delete index1;
-    index1 = faiss::gpu::index_cpu_to_gpu(&res, 1, &index_composition1, &option1);
-    delete index1;
-
-//    double tx = getmillisecs();
-//    std::thread t1(gpu_executor, 0, &option0, &index_composition0);
-//    std::thread t2(gpu_executor, 1, &option1, &index_composition1);
-//    t1.join();
-//    t2.join();
-    for(long i = 0; i < 10; ++ i) {
-        std::shared_ptr gpu_index_ptr00;
-        std::shared_ptr gpu_index_ptr01;
-
-        std::thread t00(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr00));
-//        std::thread t2(GpuLoad, &res, 1, &option1, &index_composition1, std::ref(gpu_index_ptr1));
-        std::thread t01(GpuLoad, &res, 0, &option0, &index_composition0, std::ref(gpu_index_ptr01));
-
-        t00.join();
-
-        GpuExecutor(gpu_index_ptr00, res, 0, &option0, &index_composition0, nq, nprobe, k, xq);
-
-        t01.join();
-//        t2.join();
-        GpuExecutor(gpu_index_ptr01, res, 0, &option0, &index_composition0, nq, nprobe, k, xq);
-//        GpuExecutor(gpu_index_ptr1, res, 1, &option1, &index_composition1, nq, nprobe, k, xq);
-    }
-
-//    std::thread t3(gpu_executor, 0, &option0, &index_composition0);
-//    std::thread t4(gpu_executor, 1, &option1, &index_composition1);
-//    t3.join();
-//    t4.join();
-//    double ty = getmillisecs();
-//    printf("Total GPU execution time: %0.2f\n", ty - tx);
-//    CpuExecutor(&index_composition0, nq, nprobe, k, xq, cpu_index);
-//    CpuExecutor(&index_composition1, nq, nprobe, k, xq, cpu_index);
-
-    /////
-    delete [] xq;
-    return 0;
-}
-*/
diff --git a/thirdparty/faiss/tutorial/cpp/9-BinaryFlat.cpp b/thirdparty/faiss/tutorial/cpp/9-BinaryFlat.cpp
deleted file mode 100644
index 547cc6d88..000000000
--- a/thirdparty/faiss/tutorial/cpp/9-BinaryFlat.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include
-#include
-
-#include
-#include
-#include
-
-// #define TEST_HAMMING
-
-long int getTime(timeval end, timeval start) {
-    return 1000*(end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec)/1000;
-}
-
-int main() {
-    // freopen("0.txt", "w", stdout);
-
-    size_t d = 128;          // dimension
-    size_t nb = 40000000;    // database size
-    size_t nq = 10;          // nb of queries
-
-    uint8_t *xb = new uint8_t[d * nb / sizeof(uint8_t)];
-    uint8_t *xq = new uint8_t[d * nq / sizeof(uint8_t)];
-
-    // skip 0
-    lrand48();
-
-    size_t size_to_long = d * nb / sizeof(int32_t);
-    for(size_t i = 0; i < size_to_long; i++) {
-        ((int32_t*)xb)[i] = lrand48();
-    }
-
-    size_to_long = d * nq / sizeof(long int);
-    for(size_t i = 0; i < size_to_long; i++) {
-        ((int32_t*)xq)[i] = lrand48();
-    }
-#ifdef TEST_HAMMING
-    printf("test hamming\n");
-    faiss::IndexBinaryFlat index(d, faiss::MetricType::METRIC_Hamming);
-#else
-    faiss::IndexBinaryFlat index(d, faiss::MetricType::METRIC_Jaccard);
-#endif
-    index.add(nb, xb);
-    printf("ntotal = %ld d = %d\n", index.ntotal, index.d);
-
-    int k = 10;
-
-#if 0
-    { // sanity check: search 5 first vectors of xb
-        int64_t *I = new int64_t[k * 5];
-        int32_t *D = new int32_t[k * 5];
-        float *d_float = reinterpret_cast(D);
-
-        index.search(5, xb, k, D, I);
-
-        // print results
-        for(int i = 0; i < 5; i++) {
-            for(int j = 0; j < k; j++)
-#ifdef TEST_HAMMING
-                printf("%8ld %d\n", I[i * k + j], D[i * k + j]);
-#else
-                printf("%8ld %.08f\n", I[i * k + j], d_float[i * k + j]);
-#endif
-            printf("\n");
-        }
-
-        delete [] I;
-        delete [] D;
-    }
-#endif
-
-    { // search xq
-        int64_t *I = new int64_t[k * nq];
-        int32_t *D = new int32_t[k * nq];
-        float *d_float = reinterpret_cast(D);
-
-        for (int loop = 1; loop <= nq; loop ++) {
-            timeval t0;
-            gettimeofday(&t0, 0);
-
-            index.search(loop, xq, k, D, I);
-
-            timeval t1;
-            gettimeofday(&t1, 0);
-            printf("search nq %d time %ldms\n", loop, getTime(t1,t0));
-#if 0
-            for (int i = 0; i < loop; i++) {
-                for(int j = 0; j < k; j++)
-#ifdef TEST_HAMMING
-                    printf("%8ld %d\n", I[i * k + j], D[i * k + j]);
-#else
-                    printf("%8ld %.08f\n", I[j + i * k], d_float[j + i * k]);
-#endif
-                printf("\n");
-            }
-#endif
-        }
-
-        delete [] I;
-        delete [] D;
-    }
-
-    delete [] xb;
-    delete [] xq;
-
-    return 0;
-}
-
-
diff --git a/thirdparty/faiss/tutorial/cpp/CMakeLists.txt b/thirdparty/faiss/tutorial/cpp/CMakeLists.txt
index 9a9ec6ce0..7361b33a0 100644
--- a/thirdparty/faiss/tutorial/cpp/CMakeLists.txt
+++ b/thirdparty/faiss/tutorial/cpp/CMakeLists.txt
@@ -18,24 +18,3 @@ target_link_libraries(4-GPU PRIVATE faiss)
 add_executable(5-Multiple-GPUs EXCLUDE_FROM_ALL 5-Multiple-GPUs.cpp)
 target_link_libraries(5-Multiple-GPUs PRIVATE faiss)
-
-add_executable(5-GPU EXCLUDE_FROM_ALL 5-GPU.cpp)
-target_link_libraries(5-GPU PRIVATE faiss)
-
-add_executable(6-GPU EXCLUDE_FROM_ALL 6-GPU.cpp)
-target_link_libraries(6-GPU PRIVATE faiss)
-
-add_executable(6-RUN EXCLUDE_FROM_ALL 6-RUN.cpp)
-target_link_libraries(6-RUN PRIVATE faiss)
-
-add_executable(7-GPU EXCLUDE_FROM_ALL 7-GPU.cpp)
-target_link_libraries(7-GPU PRIVATE faiss)
-
-add_executable(8-GPU EXCLUDE_FROM_ALL 8-GPU.cpp)
-target_link_libraries(8-GPU PRIVATE faiss)
-
-add_executable(9-BinaryFlat EXCLUDE_FROM_ALL 9-BinaryFlat.cpp)
-target_link_libraries(9-BinaryFlat PRIVATE faiss)
-
-add_executable(tutorial_faiss_test EXCLUDE_FROM_ALL tutorial_faiss_test.cpp)
-target_link_libraries(tutorial_faiss_test PRIVATE faiss)
\ No newline at end of file
diff --git a/thirdparty/faiss/tutorial/cpp/tutorial_faiss_test.cpp b/thirdparty/faiss/tutorial/cpp/tutorial_faiss_test.cpp
deleted file mode 100644
index aea4409d8..000000000
--- a/thirdparty/faiss/tutorial/cpp/tutorial_faiss_test.cpp
+++ /dev/null
@@ -1,378 +0,0 @@
-#include
-#include
-#include
-#include
-
-#include
-
-#include
-#include
-#include
-#include
-#include
-
-
-#include
-#include
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-using namespace faiss;
-
-void
-generate_file(const char *filename,
-              long nb,
-              long dimension,
-              std::string index_desc,
-              faiss::gpu::StandardGpuResources &res) {
-    long size = dimension * nb;
-    float *xb = new float[size];
-    printf("size: %lf(GB)\n", (size * sizeof(float)) / (3 * 1024.0 * 1024 * 1024));
-    for (long i = 0; i < nb; i++) {
-        for (long j = 0; j < dimension; j++) {
-            float rand = drand48();
-            xb[dimension * i + j] = rand;
-        }
-    }
-
-    faiss::Index *ori_index = faiss::index_factory(dimension, index_desc.c_str(), faiss::METRIC_L2);
-    auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index);
-
-    assert(!device_index->is_trained);
-    device_index->train(nb, xb);
-    assert(device_index->is_trained);
-    device_index->add(nb, xb);
-
-    faiss::Index *cpu_index = faiss::gpu::index_gpu_to_cpu((device_index));
-    faiss::write_index(cpu_index, filename);
-    printf("index: %s is stored successfully.\n", filename);
-    delete[] xb;
-
-    return;
-}
-
-faiss::Index *
-get_index(const char *filename) {
-    return faiss::read_index(filename);
-}
-
-void
-execute_on_gpu(faiss::Index *index, float *xq, long nq, long k, long nprobe,
-               faiss::gpu::StandardGpuResources &res, long* I, float* D) {
-
-    double t0 = getmillisecs();
-
-    faiss::gpu::CpuToGpuClonerOptions option;
-    option.readonly = true;
-    faiss::Index *tmp_index = faiss::gpu::cpu_to_gpu(&res, 0, index, &option);
-    std::shared_ptr gpu_index_ivf_ptr = std::shared_ptr(tmp_index);
-
-    double t1 = getmillisecs();
-    printf("CPU to GPU loading time: %0.2f\n", t1 - t0);
-
-
-    double t2 = getmillisecs();
-    faiss::gpu::GpuIndexIVF *gpu_index_ivf =
-        dynamic_cast(gpu_index_ivf_ptr.get());
-    gpu_index_ivf->setNumProbes(nprobe);
-
-    gpu_index_ivf_ptr->search(nq, xq, k, D, I);
-    double t3 = getmillisecs();
-    printf("GPU execution time: %0.2f\n", t3 - t2);
-}
-
-void execute_on_cpu(faiss::Index *index, float* xq, long nq, long k, long nprobe, long* I, float* D) {
-    faiss::IndexIVF* ivf_index =
-        dynamic_cast(index);
-    ivf_index->nprobe = nprobe;
-    index->search(nq, xq, k, D, I);
-}
-
-float *construct_queries(long nq, long dimension) {
-    float *xq = new float[dimension * nq];
-    for (int i = 0; i < nq; i++) {
-        for (int j = 0; j < dimension; j++) {
-            xq[dimension * i + j] = drand48();
-        }
-    }
-    return xq;
-}
-
-void print_result(long number, long nq, long k, long *I, float *D) {
-    printf("I (%ld first results)=\n", number);
-    for (int i = 0; i < number; i++) {
-        for (int j = 0; j < k; j++)
-            printf("%5ld ", I[i * k + j]);
-        printf("\n");
-    }
-
-    printf("I (%ld last results)=\n", number);
-    for (int i = nq - number; i < nq; i++) {
-        for (int j = 0; j < k; j++)
-            printf("%5ld ", I[i * k + j]);
-        printf("\n");
-    }
-}
-
-void faiss_setting() {
-    faiss::distance_compute_blas_threshold = 800;
-}
-
-int main() {
-    const char *filename = "index5.index";
-
-#if 0
-    long dimension = 512;
-    long nb = 6000000;
-    long nq = 1000;
-    long topk = 16;
-    long print_number = 8;
-    long nprobe = 32;
-
-    std::string index_desc = "IVF16384,SQ8";
-    faiss::gpu::StandardGpuResources res;
-    if ((access(filename, F_OK)) == -1) {
-        printf("file doesn't exist, create one\n");
-        generate_file(filename, nb, dimension, index_desc, res);
-    }
-
-    // Construct queries
-    float *xq = construct_queries(nq, dimension);
-
-    // Read index
-    faiss::Index *index = get_index(filename);
-
-    // Execute on GPU
-    long *I = new long[topk * nq];
-    float *D = new float[topk * nq];
-    execute_on_gpu(index, xq, nq, topk, nprobe, res, I, D);
-
-    // Print results
-    print_result(print_number, nq, topk, I, D);
-    delete[] I; I = nullptr;
-    delete[] D; D = nullptr;
-
-    // Execute on CPU
-    I = new long[topk * nq];
-    D = new float[topk * nq];
-    execute_on_cpu(index, xq, nq, topk, nprobe, I, D);
-
-    // Print results
-    print_result(print_number, nq, topk, I, D);
-    delete[] I;
-    delete[] D;
-
-    return 0;
-#else
-    int number = 8;
-    int d = 512;      // dimension
-    int nq = 1000;    // nb of queries
-    int nprobe = 16;
-    float *xq = new float[d * nq];
-    for(int i = 0; i < nq; i++) {
-        for(int j = 0; j < d; j++) {
-            xq[d * i + j] = drand48();
-//            printf("%lf ", xq[d * i + j]);
-        }
-//        xq[d * i] += i / 1000.;
-//        printf("\n");
-    }
-    faiss::distance_compute_blas_threshold = 800;
-
-    faiss::gpu::StandardGpuResources res;
-
-    int k = 16;
-    std::shared_ptr gpu_index_ivf_ptr;
-
-    const char* index_description = "IVF16384,SQ8";
-    // const char* index_description = "IVF3276,Flat";
-//    Index *index_factory (int d, const char *description,
-//                          MetricType metric = METRIC_L2);
-
-    faiss::Index *cpu_index = nullptr;
-    if((access(filename,F_OK))==-1) {
-        long nb = 6000000;
-        long dimension = d;
-        printf("file doesn't exist, create one\n");
-        generate_file(filename, nb, dimension, index_description, res);
-        /*
-        // create database
-        // database size
-//        printf("-----------------------\n");
-        long size = d * nb;
-        float *xb = new float[size];
-        memset(xb, 0, size * sizeof(float));
-        printf("size: %ld\n", (size * sizeof(float)) );
-        for(long i = 0; i < nb; i++) {
-            for(long j = 0; j < d; j++) {
-                float rand = drand48();
-                xb[d * i + j] = rand;
-//                printf("%lf ", xb[d * i + j]);
-            }
-//            xb[d * i] += i / 1000.;
-//            printf("\n");
-        }
-
-        // Using an IVF index
-        // here we specify METRIC_L2, by default it performs inner-product search
-
-        faiss::Index *ori_index = faiss::index_factory(d, index_description, faiss::METRIC_L2);
-        auto device_index = faiss::gpu::index_cpu_to_gpu(&res, 0, ori_index);
-
-        gpu_index_ivf_ptr = std::shared_ptr(device_index);
-
-        assert(!device_index->is_trained);
-        device_index->train(nb, xb);
-        assert(device_index->is_trained);
-        device_index->add(nb, xb);  // add vectors to the index
-
-        printf("is_trained = %s\n", device_index->is_trained ? "true" : "false");
-        printf("ntotal = %ld\n", device_index->ntotal);
-
-        cpu_index = faiss::gpu::index_gpu_to_cpu ((device_index));
-        faiss::write_index(cpu_index, filename);
-        printf("index.index is stored successfully.\n");
-        delete [] xb;
-        */
-    } else {
-        cpu_index = get_index(filename);
-    }
-
-    {
-        // cpu to gpu
-        double t0 = getmillisecs ();
-        faiss::gpu::CpuToGpuClonerOptions option;
-        option.readonly = true;
-        faiss::Index* tmp_index = faiss::gpu::cpu_to_gpu(&res, 0, cpu_index, &option);
-
-        gpu_index_ivf_ptr = std::shared_ptr(tmp_index);
-
-        // Gpu index dump
-
-        auto gpu_index_ivf_sq_ptr = dynamic_cast(tmp_index);
-//        gpu_index_ivf_sq_ptr->dump();
-        double t1 = getmillisecs ();
-        printf("CPU to GPU loading time: %0.2f\n", t1 - t0);
-        // // Cpu index dump
-        // auto cpu_index_ivf_sq_ptr = dynamic_cast(cpu_index);
-        // cpu_index_ivf_sq_ptr->dump();
-    }
-
-
-    { // search xq
-        long *I = new long[k * nq];
-        float *D = new float[k * nq];
-        double t2 = getmillisecs();
-        faiss::gpu::GpuIndexIVF* gpu_index_ivf =
-            dynamic_cast(gpu_index_ivf_ptr.get());
-        gpu_index_ivf->setNumProbes(nprobe);
-
-        gpu_index_ivf_ptr->search(nq, xq, k, D, I);
-        double t3 = getmillisecs();
-        printf("GPU execution time: %0.2f\n", t3 - t2);
-
-        // print results
-        printf("GPU: \n");
-#if 0
-        printf("GPU: I (2 first results)=\n");
-        for(int i = 0; i < number; i++) {
-            for(int j = 0; j < k; j++)
-                printf("GPU: %5ld(%f) ", I[i * k + j], D[i * k + j]);
-            printf("\n");
-        }
-
-        printf("GPU: I (2 last results)=\n");
-        for(int i = nq - number; i < nq; i++) {
-            for(int j = 0; j < k; j++)
-                printf("GPU: %5ld(%f) ", I[i * k + j], D[i * k + j]);
-            printf("\n");
-        }
-#else
-        printf("I (2 first results)=\n");
-        for(int i = 0; i < number; i++) {
-            for(int j = 0; j < k; j++)
-                printf("%5ld ", I[i * k + j]);
-            printf("\n");
-        }
-
-        printf("I (2 last results)=\n");
-        for(int i = nq - number; i < nq; i++) {
-            for(int j = 0; j < k; j++)
-                printf("%5ld ", I[i * k + j]);
-            printf("\n");
-        }
-#endif
-        delete [] I;
-        delete [] D;
-    }
-    printf("----------------------------------\n");
-    { // search xq
-        printf("CPU: \n");
-        long *I = new long[k * nq];
-        float *D = new float[k * nq];
-
-        double t4 = getmillisecs();
-        faiss::IndexIVF* ivf_index =
-            dynamic_cast(cpu_index);
-        ivf_index->nprobe = nprobe;
-        cpu_index->search(nq, xq, k, D, I);
-        double t5 = getmillisecs();
-        printf("CPU execution time: %0.2f\n", t5 - t4);
-#if 0
-        // print results
-        printf("CPU: I (2 first results)=\n");
-        for(int i = 0; i < number; i++) {
-            for(int j = 0; j < k; j++)
-                printf("CPU: %5ld(%f) ", I[i * k + j], D[i * k + j]);
-            printf("\n");
-        }
-
-        printf("CPU: I (2 last results)=\n");
-        for(int i = nq - number; i < nq; i++) {
-            for(int j = 0; j < k; j++)
-                printf("CPU: %5ld(%f) ", I[i * k + j], D[i * k + j]);
-            printf("\n");
-        }
-#else
-        // print results
-        printf("I (2 first results)=\n");
-        for(int i = 0; i < number; i++) {
-            for(int j = 0; j < k; j++)
-                printf("%5ld ", I[i * k + j]);
-            printf("\n");
-        }
-
-        printf("I (2 last results)=\n");
-        for(int i = nq - number; i < nq; i++) {
-            for(int j = 0; j < k; j++)
-                printf("%5ld ", I[i * k + j]);
-            printf("\n");
-        }
-#endif
-        delete [] I;
-        delete [] D;
-    }
-
-
-    delete [] xq;
-    return 0;
-#endif
-}
\ No newline at end of file