From 7b1c27a53541cd337f5b959b7f2f178b81d876a8 Mon Sep 17 00:00:00 2001 From: Jim Apple Date: Fri, 23 Dec 2016 18:45:39 -0800 Subject: [PATCH] Add a variant of cuckoo filters in which buckets may overlap. This demonstrates lookup times that are as good as cuckoo filter lookup times, or better, while having space usage close to that of the semi-sorted cuckoo filter. This is based on Lehman, Eric, and Rina Panigrahy. "3.5-way cuckoo hashing for the price of 2-and-a-bit." European Symposium on Algorithms. Springer, Berlin, Heidelberg, 2009. --- benchmarks/bulk-insert-and-query.cc | 89 +++++------- benchmarks/conext-figure5.cc | 48 ++++--- benchmarks/filter-api.h | 63 +++++++++ src/shingle.h | 208 ++++++++++++++++++++++++++++ 4 files changed, 335 insertions(+), 73 deletions(-) create mode 100644 benchmarks/filter-api.h create mode 100644 src/shingle.h diff --git a/benchmarks/bulk-insert-and-query.cc b/benchmarks/bulk-insert-and-query.cc index c507a23..eecc930 100644 --- a/benchmarks/bulk-insert-and-query.cc +++ b/benchmarks/bulk-insert-and-query.cc @@ -13,38 +13,41 @@ // 55: // Million Find Find Find Find Find optimal wasted // adds/sec 0% 25% 50% 75% 100% ε bits/item bits/item space -// Cuckoo12 23.78 37.24 35.04 37.17 37.35 36.35 0.131% 18.30 9.58 91.1% -// SemiSort13 11.63 17.55 17.08 17.14 17.54 22.32 0.064% 18.30 10.62 72.4% -// Cuckoo8 35.31 49.32 50.24 49.98 48.32 50.49 2.044% 12.20 5.61 117.4% -// SemiSort9 13.99 22.23 22.78 22.13 23.16 24.06 1.207% 12.20 6.37 91.5% -// Cuckoo16 27.06 36.94 37.12 35.31 36.81 35.10 0.009% 24.40 13.46 81.4% -// SemiSort17 10.37 15.70 15.84 15.78 15.55 15.93 0.004% 24.40 14.72 65.8% -// SimdBlock8 74.22 72.34 74.23 74.34 74.69 74.32 0.508% 12.20 7.62 60.1% -// time: 14.34 seconds +// Cuckoo12 27.15 30.20 40.99 41.18 40.83 41.61 0.128% 18.30 9.61 90.5% +// SemiSort13 11.21 18.29 18.15 18.26 18.46 17.55 0.065% 18.30 10.58 72.9% +// Shingle12 21.34 40.58 40.80 40.82 40.66 40.91 0.062% 18.30 10.66 71.8% +// Cuckoo8 42.06 45.61 54.74 
53.58 55.83 54.35 2.071% 12.20 5.59 118.1% +// SemiSort9 15.18 24.40 25.77 14.41 25.57 26.05 1.214% 12.20 6.36 91.7% +// Cuckoo16 31.81 39.52 40.61 40.41 40.09 40.08 0.010% 24.40 13.30 83.5% +// SemiSort17 11.24 16.73 16.55 16.71 16.77 16.34 0.005% 24.40 14.44 69.0% +// SimdBlock8 81.48 84.58 86.63 86.63 83.58 87.26 0.485% 12.20 7.69 58.7% +// time: 14.06 seconds // // 75: // Million Find Find Find Find Find optimal wasted // adds/sec 0% 25% 50% 75% 100% ε bits/item bits/item space -// Cuckoo12 15.61 37.24 37.23 37.34 37.15 37.36 0.173% 13.42 9.18 46.2% -// SemiSort13 8.77 17.11 15.70 17.34 17.73 18.86 0.087% 13.42 10.17 31.9% -// Cuckoo8 23.46 48.81 48.14 39.48 49.28 49.65 2.806% 8.95 5.16 73.6% -// SemiSort9 11.14 23.98 20.80 23.37 24.35 21.41 1.428% 8.95 6.13 46.0% -// Cuckoo16 15.08 36.64 36.75 36.83 36.59 36.74 0.011% 17.90 13.11 36.5% -// SemiSort17 8.02 15.63 15.66 15.87 15.67 15.88 0.006% 17.90 14.02 27.6% -// SimdBlock8 73.26 74.41 74.28 70.86 72.02 70.69 2.071% 8.95 5.59 60.0% -// time: 18.06 seconds +// Cuckoo12 18.27 41.87 41.80 40.89 39.83 41.95 0.170% 13.42 9.20 45.9% +// SemiSort13 8.65 18.48 14.31 18.63 18.78 14.83 0.087% 13.42 10.17 31.9% +// Shingle12 11.00 40.80 41.14 41.34 41.30 41.41 0.088% 13.42 10.16 32.1% +// Cuckoo8 28.13 53.47 55.73 56.40 56.30 56.50 2.797% 8.95 5.16 73.4% +// SemiSort9 12.43 25.76 26.30 25.91 16.99 26.46 1.438% 8.95 6.12 46.2% +// Cuckoo16 17.71 40.93 41.09 41.19 41.31 40.84 0.012% 17.90 13.00 37.7% +// SemiSort17 8.46 16.99 17.06 15.84 13.75 17.06 0.006% 17.90 14.10 26.9% +// SimdBlock8 88.56 88.43 84.02 87.45 88.91 88.38 2.054% 8.95 5.61 59.6% +// time: 16.27 seconds // // 85: // Million Find Find Find Find Find optimal wasted // adds/sec 0% 25% 50% 75% 100% ε bits/item bits/item space -// Cuckoo12 22.74 32.49 32.69 32.58 32.85 32.71 0.102% 23.69 9.94 138.3% -// SemiSort13 9.97 13.16 13.15 13.54 16.01 19.58 0.056% 23.69 10.80 119.4% -// Cuckoo8 30.67 36.86 36.79 37.09 36.97 36.87 1.581% 15.79 5.98 163.9% -// SemiSort9 
10.96 15.49 15.37 15.40 15.18 15.63 1.047% 15.79 6.58 140.1% -// Cuckoo16 27.84 33.74 33.72 33.69 33.75 33.62 0.007% 31.58 13.80 128.8% -// SemiSort17 9.51 12.83 12.80 12.64 12.86 12.50 0.004% 31.58 14.65 115.6% -// SimdBlock8 54.84 58.37 59.73 59.13 60.11 60.12 0.144% 15.79 9.44 67.3% -// time: 19.43 seconds +// Cuckoo12 25.80 37.66 37.97 38.01 37.94 37.87 0.098% 23.69 9.99 137.1% +// SemiSort13 9.60 14.38 14.51 14.34 12.69 14.56 0.048% 23.69 11.02 114.8% +// Shingle12 21.77 37.25 36.65 37.44 37.55 35.79 0.052% 23.69 10.91 117.1% +// Cuckoo8 36.73 40.92 40.99 41.51 40.96 41.39 1.574% 15.79 5.99 163.6% +// SemiSort9 11.39 16.76 16.57 16.68 16.25 16.82 1.049% 15.79 6.57 140.2% +// Cuckoo16 33.98 37.85 38.70 38.92 38.76 38.95 0.006% 31.58 13.98 125.9% +// SemiSort17 10.30 13.39 14.30 14.21 14.34 14.40 0.004% 31.58 14.61 116.2% +// SimdBlock8 66.62 72.34 72.50 71.38 72.43 72.09 0.141% 15.79 9.48 66.6% +// time: 16.50 seconds // #include @@ -54,7 +57,9 @@ #include #include "cuckoofilter.h" +#include "filter-api.h" #include "random.h" +#include "shingle.h" #include "simd-block.h" #include "timing.h" @@ -119,38 +124,6 @@ basic_ostream& operator<<( return os; } -template -struct FilterAPI {}; - -template class TableType> -struct FilterAPI> { - using Table = CuckooFilter; - static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); } - static void Add(uint64_t key, Table * table) { - if (0 != table->Add(key)) { - throw logic_error("The filter is too small to hold all of the elements"); - } - } - static bool Contain(uint64_t key, const Table * table) { - return (0 == table->Contain(key)); - } -}; - -template <> -struct FilterAPI> { - using Table = SimdBlockFilter<>; - static Table ConstructFromAddCount(size_t add_count) { - Table ans(ceil(log2(add_count * 8.0 / CHAR_BIT))); - return ans; - } - static void Add(uint64_t key, Table* table) { - table->Add(key); - } - static bool Contain(uint64_t key, const Table * table) { - return table->Find(key); - } -}; 
- template Statistics FilterBenchmark( size_t add_count, const vector& to_add, const vector& to_lookup) { @@ -224,6 +197,10 @@ int main(int argc, char * argv[]) { cout << setw(NAME_WIDTH) << "SemiSort13" << cf << endl; + cf = FilterBenchmark>(add_count, to_add, to_lookup); + + cout << setw(NAME_WIDTH) << "Shingle12" << cf << endl; + cf = FilterBenchmark< CuckooFilter>( add_count, to_add, to_lookup); diff --git a/benchmarks/conext-figure5.cc b/benchmarks/conext-figure5.cc index 2f759de..fc1eb3d 100644 --- a/benchmarks/conext-figure5.cc +++ b/benchmarks/conext-figure5.cc @@ -1,22 +1,24 @@ // This benchmark reproduces the CoNEXT 2014 results found in "Figure 5: Lookup -// performance when a filter achieves its capacity." It takes about two minutes to run on -// an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz. +// performance when a filter achieves its capacity." It takes about three minutes to run +// on an Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz. // // Results: // fraction of queries on existing items/lookup throughput (million OPS) -// CF ss-CF -// 0.00% 24.79 9.37 -// 25.00% 24.65 9.57 -// 50.00% 24.84 9.57 -// 75.00% 24.86 9.62 -// 100.00% 24.89 9.96 +// CF ss-CF Shingle +// 0.00% 26.10 10.07 27.37 +// 25.00% 25.92 10.65 27.40 +// 50.00% 26.00 10.65 27.18 +// 75.00% 25.95 10.79 27.21 +// 100.00% 25.89 10.64 27.28 #include #include #include #include "cuckoofilter.h" +#include "filter-api.h" #include "random.h" +#include "shingle.h" #include "timing.h" using namespace std; @@ -29,14 +31,21 @@ const size_t SAMPLE_SIZE = 1000 * 1000; // The time (in seconds) to lookup SAMPLE_SIZE keys in which 0%, 25%, 50%, 75%, and 100% // of the keys looked up are found. 
template -array CuckooBenchmark( +array Benchmark( size_t add_count, const vector& to_add, const vector& to_lookup) { - Table cuckoo(add_count); + Table filter = FilterAPI::ConstructFromAddCount(add_count); array result; // Add values until failure or until we run out of values to add: size_t added = 0; - while (added < to_add.size() && 0 == cuckoo.Add(to_add[added])) ++added; + while (added < to_add.size()) { + try { + FilterAPI
::Add(to_add[added], &filter); + } catch (...) { + break; + } + ++added; + } // A value to track to prevent the compiler from optimizing out all lookups: size_t found_count = 0; @@ -44,7 +53,9 @@ array CuckooBenchmark( const auto to_lookup_mixed = MixIn(&to_lookup[0], &to_lookup[SAMPLE_SIZE], &to_add[0], &to_add[added], found_percent); auto start_time = NowNanos(); - for (const auto v : to_lookup_mixed) found_count += (0 == cuckoo.Contain(v)); + for (const auto v : to_lookup_mixed) { + found_count += FilterAPI
::Contain(v, &filter); + } auto lookup_time = NowNanos() - start_time; result[found_percent * 4] = lookup_time / (1000.0 * 1000.0 * 1000.0); } @@ -64,21 +75,24 @@ int main() { const vector to_lookup = GenerateRandom64(SAMPLE_SIZE); // Calculate metrics: - const auto cf = CuckooBenchmark< - CuckooFilter>( + const auto cf = Benchmark>( add_count, to_add, to_lookup); - const auto sscf = CuckooBenchmark< - CuckooFilter>( + const auto sscf = Benchmark>( add_count, to_add, to_lookup); + const auto qcf = Benchmark>(add_count, to_add, to_lookup); cout << "fraction of queries on existing items/lookup throughput (million OPS) " << endl; cout << setw(10) << "" - << " " << setw(10) << right << "CF" << setw(10) << right << "ss-CF" << endl; + << " " << setw(10) << right << "CF" << setw(10) << right << "ss-CF" + << setw(10) << right << "Shingle" << endl; for (const double found_percent : {0.0, 0.25, 0.50, 0.75, 1.00}) { cout << fixed << setprecision(2) << setw(10) << right << 100 * found_percent << "%"; cout << setw(10) << right << (SAMPLE_SIZE / cf[found_percent * 4]) / (1000 * 1000); cout << setw(10) << right << (SAMPLE_SIZE / sscf[found_percent * 4]) / (1000 * 1000); + cout << setw(10) << right << (SAMPLE_SIZE / qcf[found_percent * 4]) / (1000 * 1000); cout << endl; } } diff --git a/benchmarks/filter-api.h b/benchmarks/filter-api.h new file mode 100644 index 0000000..b99a6e5 --- /dev/null +++ b/benchmarks/filter-api.h @@ -0,0 +1,63 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "cuckoofilter.h" +#include "shingle.h" +#include "simd-block.h" + +template +struct FilterAPI {}; + +template class TableType> +struct FilterAPI< + cuckoofilter::CuckooFilter> { + using Table = cuckoofilter::CuckooFilter; + static Table ConstructFromAddCount(std::size_t add_count) { + return Table(add_count); + } + static void Add(std::uint64_t key, Table *table) { + if (0 != table->Add(key)) { + throw std::logic_error( + "The cuckoo filter is too small to hold all of 
the elements"); + } + } + static bool Contain(std::uint64_t key, const Table *table) { + return (0 == table->Contain(key)); + } +}; + +template <> +struct FilterAPI> { + using Table = SimdBlockFilter<>; + static Table ConstructFromAddCount(std::size_t add_count) { + Table ans(std::ceil(std::log2(add_count * 8.0 / CHAR_BIT))); + return ans; + } + static void Add(std::uint64_t key, Table *table) { table->Add(key); } + static bool Contain(std::uint64_t key, const Table *table) { + return table->Find(key); + } +}; + +template +struct FilterAPI> { + using Table = Shingle; + static Table ConstructFromAddCount(size_t add_count) { + return Table(ceil(log2(add_count * 12.75 / 12.0))); + } + static void Add(std::uint64_t key, Table *table) { + if (!table->Add(key)) { + throw std::logic_error( + "The quotient filter is too small to hold all of the elements"); + } + } + static bool Contain(std::uint64_t key, const Table *table) { + return table->Contain(key); + } +}; diff --git a/src/shingle.h b/src/shingle.h new file mode 100644 index 0000000..29d0900 --- /dev/null +++ b/src/shingle.h @@ -0,0 +1,208 @@ +#pragma once + +// Cuckoo filters in which the buckets can overlap. See Lehman, Eric, and Rina +// Panigrahy. "3.5-way cuckoo hashing for the price of 2-and-a-bit." European +// Symposium on Algorithms. Springer, Berlin, Heidelberg, 2009. + +#include +#include +#include +#include + +#include "bitsutil.h" +#include "hashutil.h" + +template +class Shingle { + using uint16_t = ::std::uint16_t; + using uint64_t = ::std::uint64_t; + + // The low-order `bits` bits of the result are 1; all others are 0. + static constexpr uint64_t Mask(int bits) { + return (static_cast(1) << bits) - 1; + } + + // The two halves of the table are stored interleaved, A[0] then B[0] then + // A[1] then B[1], and so on. Each slot has 12 bits, and we store A[i] and + // B[i] together in a `Cell` of three bytes (24 bits). 
We use the eleven + // high-order bits to store a fingerprint and the bottom bit to indicate if + // the fingerprint is offset from the original bucket it hashed to. + // + // In this class and below, methods that can operate on Cells have a + // template parameter that is true if the value from the array A is + // to be manipulated and false if the value from the array B is to be + // manipulated. + // + // The fingerprint 0x0 is reserved to indicate an empty slot. Keys hashing to + // 0x0 are considered to have a hash of 0x1. + + using Cell = ::std::array; + + static_assert(sizeof(Cell[3]) == 9, "Cells are not packed tightly"); + + HashFamily hasher_; + // A and B have the same length, which is a power of 2. imask_ is one less + // than that length + const uint64_t imask_; + // fp_hash_ uses delta-universal hashing (of the multiply-shift type) to + // derive an index in B from the index in A plus a hash of the fingerprint. + const uint64_t fp_hash_; + Cell *const data_; + size_t filled_; // Number of non-empty slots. + + // Get the fingerprint and offset from index i. The table is A if ISA is true. + template + [[gnu::always_inline]] uint64_t Get(uint64_t i) const { + const uint16_t result = + *reinterpret_cast(&data_[i][1 - ISA]); + if (ISA) { + return result & 0x0fff; + } else { + return result >> 4; + } + } + + // Set the fingerprint and offset at index i to the low-order 12 bits of + // x. The table is A if ISA is true. + template + [[gnu::always_inline]] void Set(uint64_t i, uint64_t x) { + uint16_t &result = *reinterpret_cast(&data_[i][1 - ISA]); + if (ISA) { + result = x | (result & 0xf000); + } else { + result = (x << 4) | (result & 0x000f); + } + } + + uint64_t ReIndex(uint64_t idx, uint64_t fp) const { + return (idx ^ ((fp_hash_ * fp) >> 11)) & imask_; + } + + // Set (ISA ? A : B)[idx + offset] = fp and return the index and fingerprint + // that were previously there.
+ template + [[gnu::always_inline]] void Swap(uint64_t idx, uint64_t offset, uint64_t fp, + uint64_t *result_idx, + uint64_t *result_fp) { + idx += offset; + fp = offset | (fp << 1); + *result_idx = idx; + *result_fp = Get(idx); + if (*result_fp & 1) --*result_idx; + *result_fp >>= 1; + Set(idx, fp); + } + + // Helper function for Add(), below. Places fp in one of its two slots (idx or + // idx+1) in (ISA ? A : B), and recurses if necessary. + template + void AddHelp(uint64_t idx, uint64_t fp) { + for (uint64_t offset : {0, 1}) { + const uint64_t q = idx + offset; + const uint64_t fp_now = Get(q); + if (0 == fp_now) { + uint64_t fp_later = offset | (fp << 1); + Set(q, fp_later); + ++filled_; + return; + } + } + + // Do a short local search to see if some items in the next bucket can be + // pushed to later slots, ala robin-hood linear probing. + if (0 == (Get(idx + 1) & 0x1)) { + if (0 == Get(idx + 2)) { + Set(idx + 2, 0x1 | Get(idx + 1)); + Set(idx + 1, 0x1 | (fp << 1)); + ++filled_; + return; + } else if (0 == (Get(idx + 2) & 0x1)) { + if (0 == Get(idx + 3)) { + Set(idx + 3, 0x1 | Get(idx + 2)); + Set(idx + 2, 0x1 | Get(idx + 1)); + Set(idx + 1, 0x1 | (fp << 1)); + ++filled_; + return; + } + } + } + + // Kick out a random key from the two slots: + uint64_t offset = std::rand() % 2; + // TODO: replace random search with BFS or iterative deepening + Swap(idx, offset, fp, &idx, &fp); + // TODO: replace recursion with iteration + return AddHelp(ReIndex(idx, fp), fp); + } + + // Helper for Delete(), below. Returns true if the key was found. 
+ template + [[gnu::always_inline]] bool DeleteHelp(uint64_t idx, uint64_t fp) { + for (uint64_t offset : {0, 1}) { + uint64_t i = idx + offset, f = offset | (fp << 1); + if (Get(i) == f) { + Set(i, 0); + return true; + } + } + if (ISA) return DeleteHelp(ReIndex(idx, fp), fp); + return false; + } + + public: + explicit Shingle(int log2_slots) + : hasher_(), + // Each array has half of the slots + imask_(Mask(log2_slots - 1)), + fp_hash_([]() { + ::std::random_device random; + uint64_t result = random(); + return (result << 32) | random(); + }()), + // Add two extra Cells at the end so 64-bit operations don't read + // past the end and SEGFAULT. + data_(new Cell[imask_ + 3]()), + filled_(0) {} + + ~Shingle() { delete[] data_; } + + uint64_t SizeInBytes() const { return sizeof(Cell) * (imask_ + 3); } + + bool Add(uint64_t key) { + if ((static_cast(filled_) / (2 * (imask_ + 1))) > (12.0 / 12.75)) { + return false; + } + key = hasher_(key); + uint64_t idx = (key >> 11) & imask_, fp = key & Mask(11); + fp += (0 == fp); // Since 0 is the empty slot, re-target zero remainders. + AddHelp(idx, fp); + return true; + } + + [[gnu::always_inline]] bool Contain(uint64_t key) const { + key = hasher_(key); + uint64_t idx = (key >> 11) & imask_, fp = key & Mask(11); + fp += (fp == 0); + auto idx2 = ReIndex(idx, fp); + constexpr uint64_t A_SLOTS_MASK = Mask(12) + (Mask(12) << 24), + B_SLOTS_MASK = A_SLOTS_MASK << 12; + uint64_t slots = + (~A_SLOTS_MASK) | *reinterpret_cast(&data_[idx]); + auto slots2 = + (~B_SLOTS_MASK) | *reinterpret_cast(&data_[idx2]); + auto slots_all = slots & slots2; + + uint64_t fp_all = fp * 0x002002002002ull; + fp_all |= 0x001001000000ull; + + return haszero12(fp_all ^ slots_all); + } + + bool Delete(uint64_t key) { + key = hasher_(key); + const uint64_t idx = (key >> 11) & imask_; + uint64_t fp = key & Mask(11); + fp += (0 == fp); + return DeleteHelp<>(idx, fp); + } +};