From 2cbc715aed210c6d7c0121fe89eb4a13b71743e1 Mon Sep 17 00:00:00 2001 From: Jim Apple Date: Sat, 14 Dec 2019 16:07:06 -0800 Subject: [PATCH] Use "huge pages" when on unix-like operating systems This has a positive effect on performance for a small (and tunable) effect on space usage. --- Makefile | 1 + benchmarks/Makefile | 2 +- src/memory.cc | 71 +++++++++++++++++++++++++++++++++++++++++++++ src/memory.h | 41 ++++++++++++++++++++++++++ src/packedtable.h | 9 +++--- src/simd-block.h | 17 +++++++---- src/singletable.h | 13 +++++---- 7 files changed, 138 insertions(+), 16 deletions(-) create mode 100644 src/memory.cc create mode 100644 src/memory.h diff --git a/Makefile b/Makefile index 9da574b..bf89fe8 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ LDFLAGS+= -Wall -lpthread -lssl -lcrypto LIBOBJECTS = \ ./src/hashutil.o \ + ./src/memory.o HEADERS = $(wildcard src/*.h) ALIB = libcuckoofilter.a diff --git a/benchmarks/Makefile b/benchmarks/Makefile index 32bfc90..f342f02 100644 --- a/benchmarks/Makefile +++ b/benchmarks/Makefile @@ -8,7 +8,7 @@ LDFLAGS+= -Wall -lpthread -lssl -lcrypto HEADERS = $(wildcard ../src/*.h) *.h -SRC = ../src/hashutil.cc +SRC = ../src/hashutil.cc ../src/memory.cc .PHONY: all diff --git a/src/memory.cc b/src/memory.cc new file mode 100644 index 0000000..912bffb --- /dev/null +++ b/src/memory.cc @@ -0,0 +1,71 @@ +#include "memory.h" + +#include +#include + +#if defined(MMAP) + +#include // mmap, munmap +#include // errno +#include // std::bad_alloc +#include // std::runtime_error + +static constexpr uint64_t HUGE_PAGE_SIZE = ((uint64_t)1) << 21; + +// OVERAGE_LIMIT is how much wiggle room there is on allocating more +// memory than specifically requested. 
+static constexpr double OVERAGE_LIMIT = 0.05;
+
+#if defined(__linux__) && __linux__
+#define MMAP_ZERO_FILLED true
+#else
+#define MMAP_ZERO_FILLED false
+#endif  // __linux__
+
+// Returns `bytes` rounded up to whole huge pages when that padding is under
+// OVERAGE_LIMIT of the request (exact multiples need none), and 0 otherwise.
+static std::size_t HugePageBytes(std::size_t bytes) {
+  const std::size_t rounded =
+      ((bytes + HUGE_PAGE_SIZE - 1) / HUGE_PAGE_SIZE) * HUGE_PAGE_SIZE;
+  const double limit = OVERAGE_LIMIT * static_cast<double>(bytes);
+  return static_cast<double>(rounded - bytes) < limit ? rounded : 0;
+}
+#endif  // MMAP
+
+namespace cuckoofilter {
+// Returns >= `bytes` zero-filled bytes (aligned to 64 or better), storing the
+// reserved size in *actual_bytes; throws std::runtime_error on failure.
+void *Allocate(std::size_t bytes, std::size_t *actual_bytes) noexcept(!kMmap) {
+#if defined(MMAP)
+  if (const std::size_t size = HugePageBytes(bytes)) {
+    *actual_bytes = size;
+    const int prot = PROT_READ | PROT_WRITE;
+    const int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+    void *result = mmap(nullptr, size, prot, flags | MAP_HUGETLB, -1, 0);
+    // No huge page free: ordinary pages of equal length keep munmap valid.
+    if (MAP_FAILED == result) result = mmap(nullptr, size, prot, flags, -1, 0);
+    if (MAP_FAILED == result) throw std::runtime_error(std::strerror(errno));
+    if (!MMAP_ZERO_FILLED) std::memset(result, 0, size);
+    return result;
+  }
+#endif  // MMAP
+  *actual_bytes = bytes;
+  void *result = nullptr;
+  const int malloc_failed = posix_memalign(&result, 64, bytes);
+  if (malloc_failed) throw std::runtime_error(std::strerror(malloc_failed));
+  std::memset(result, 0, bytes);
+  return result;
+}
+
+// Releases memory from Allocate; `bytes` must be the size passed to Allocate.
+void Deallocate(void *p, std::size_t bytes) noexcept(!kMmap) {
+#if defined(MMAP)
+  if (const std::size_t size = HugePageBytes(bytes)) {
+    if (munmap(p, size) != 0) throw std::runtime_error(std::strerror(errno));
+    return;
+  }
+#endif  // MMAP
+  std::free(p);
+}
+
+}  // namespace cuckoofilter
diff --git a/src/memory.h b/src/memory.h
new file mode 100644
index 0000000..ff4e5e2
--- /dev/null
+++ b/src/memory.h
@@ -0,0 +1,41 @@
+#ifndef CUCKOO_FILTER_MEMORY_H_
+#define CUCKOO_FILTER_MEMORY_H_
+
+// This file provides two functions dealing with memory
+// allocation.
They abstract out complexities like allocating aligned +// memory and using 2MB "huge pages" rather than the usual 4KB pages, +// hopefully thereby reducing TLB misses. +// +// In benchmarking on 126M elements, this induced a <1.5% space +// increase, a 9% decrease in the wall-clock time to run +// bulk-insert-and-query.exe, as well as a 56% reduction in page +// faults and a 99% reduction in dTLB misses. + +#include + +#if defined(__GLIBC__) && defined(__GLIBC_MINOR__) && \ + (((defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && __GLIBC__ == 2 && \ + __GLIBC_MINOR__ <= 19) || \ + (defined(_DEFAULT_SOURCE) && \ + ((__GLIBC__ > 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19)))) + +#define MMAP + +#endif // MMAP + +static constexpr bool kMmap = +#if defined(MMAP) + true; +#else + false; +#endif // MMAP + +namespace cuckoofilter { + +void *Allocate(std::size_t bytes, std::size_t *actual_bytes) noexcept(!kMmap); + +void Deallocate(void *p, std::size_t bytes) noexcept(!kMmap); + +} // namespace cuckoofilter + +#endif // CUCKOO_FILTER_MEMORY_H_ diff --git a/src/packedtable.h b/src/packedtable.h index 5b84473..2ad9db5 100644 --- a/src/packedtable.h +++ b/src/packedtable.h @@ -5,6 +5,7 @@ #include #include "debug.h" +#include "memory.h" #include "permencoding.h" #include "printutil.h" @@ -20,6 +21,7 @@ class PackedTable { // using a pointer adds one more indirection size_t len_; + size_t bytes_; size_t num_buckets_; char *buckets_; PermEncoding perm_; @@ -29,12 +31,11 @@ class PackedTable { // NOTE(binfan): use 7 extra bytes to avoid overrun as we // always read a uint64 len_ = kBytesPerBucket * num_buckets_ + 7; - buckets_ = new char[len_]; - memset(buckets_, 0, len_); + buckets_ = reinterpret_cast(Allocate(len_, &bytes_)); } ~PackedTable() { - delete[] buckets_; + Deallocate(buckets_, len_); } size_t NumBuckets() const { @@ -46,7 +47,7 @@ class PackedTable { } size_t SizeInBytes() const { - return len_; + return bytes_; } std::string Info() const { diff --git 
a/src/simd-block.h b/src/simd-block.h index 652ad56..086426c 100644 --- a/src/simd-block.h +++ b/src/simd-block.h @@ -21,6 +21,7 @@ #include #include "hashutil.h" +#include "memory.h" using uint32_t = ::std::uint32_t; using uint64_t = ::std::uint64_t; @@ -41,6 +42,10 @@ class SimdBlockFilter { // log_num_buckets_ is the log (base 2) of the number of buckets in the directory: const int log_num_buckets_; + // The number of bytes returned by the allocation function; might be + // slightly more than sizeof(Bucket) * (1 << log_num_buckets_) + uint64_t actual_bytes_; + // directory_mask_ is (1 << log_num_buckets_) - 1. It is precomputed in the contructor // for efficiency reasons: const uint32_t directory_mask_; @@ -54,13 +59,14 @@ class SimdBlockFilter { explicit SimdBlockFilter(const int log_heap_space); SimdBlockFilter(SimdBlockFilter&& that) : log_num_buckets_(that.log_num_buckets_), + actual_bytes_(that.actual_bytes_), directory_mask_(that.directory_mask_), directory_(that.directory_), hasher_(that.hasher_) {} ~SimdBlockFilter() noexcept; void Add(const uint64_t key) noexcept; bool Find(const uint64_t key) const noexcept; - uint64_t SizeInBytes() const { return sizeof(Bucket) * (1ull << log_num_buckets_); } + uint64_t SizeInBytes() const { return actual_bytes_; } private: // A helper function for Insert()/Find(). 
Turns a 32-bit hash into a 256-bit Bucket @@ -85,15 +91,14 @@ SimdBlockFilter::SimdBlockFilter(const int log_heap_space) throw ::std::runtime_error("SimdBlockFilter does not work without AVX2 instructions"); } const size_t alloc_size = 1ull << (log_num_buckets_ + LOG_BUCKET_BYTE_SIZE); - const int malloc_failed = - posix_memalign(reinterpret_cast(&directory_), 64, alloc_size); - if (malloc_failed) throw ::std::bad_alloc(); - memset(directory_, 0, alloc_size); + directory_ = reinterpret_cast( + cuckoofilter::Allocate(alloc_size, &actual_bytes_)); } template SimdBlockFilter::~SimdBlockFilter() noexcept { - free(directory_); + const size_t alloc_size = 1ull << (log_num_buckets_ + LOG_BUCKET_BYTE_SIZE); + cuckoofilter::Deallocate(directory_, alloc_size); directory_ = nullptr; } diff --git a/src/singletable.h b/src/singletable.h index 8fd40b3..a580e3d 100644 --- a/src/singletable.h +++ b/src/singletable.h @@ -7,6 +7,7 @@ #include "bitsutil.h" #include "debug.h" +#include "memory.h" #include "printutil.h" namespace cuckoofilter { @@ -30,15 +31,17 @@ class SingleTable { // using a pointer adds one more indirection Bucket *buckets_; size_t num_buckets_; + size_t bytes_; public: explicit SingleTable(const size_t num) : num_buckets_(num) { - buckets_ = new Bucket[num_buckets_ + kPaddingBuckets]; - memset(buckets_, 0, kBytesPerBucket * (num_buckets_ + kPaddingBuckets)); + const size_t bytes = sizeof(Bucket) * (num_buckets_ + kPaddingBuckets); + buckets_ = reinterpret_cast(Allocate(bytes, &bytes_)); } - ~SingleTable() { - delete[] buckets_; + ~SingleTable() noexcept(!kMmap) { + const size_t bytes = sizeof(Bucket) * (num_buckets_ + kPaddingBuckets); + Deallocate(buckets_, bytes); } size_t NumBuckets() const { @@ -46,7 +49,7 @@ class SingleTable { } size_t SizeInBytes() const { - return kBytesPerBucket * num_buckets_; + return bytes_; } size_t SizeInTags() const {